4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    inexact flag
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
541 uint64_t frac_hm
; /* high-middle */
542 uint64_t frac_lm
; /* low-middle */
546 /* These apply to the most significant word of each FloatPartsN. */
547 #define DECOMPOSED_BINARY_POINT 63
548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
550 /* Structure holding all of the relevant parameters for a format.
551 * exp_size: the size of the exponent field
552 * exp_bias: the offset applied to the exponent field
553 * exp_max: the maximum normalised exponent
554 * frac_size: the size of the fraction field
555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
556 * The following are computed based the size of fraction
557 * frac_lsb: least significant bit of fraction
558 * frac_lsbm1: the bit below the least significant bit (for rounding)
559 * round_mask/roundeven_mask: masks used for rounding
560 * The following optional modifiers are available:
561 * arm_althp: handle ARM Alternative Half Precision
572 uint64_t roundeven_mask
;
576 /* Expand fields based on the size of exponent and fraction */
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                      \
    .exp_size       = E,                                        \
    .exp_bias       = ((1 << E) - 1) >> 1,                      \
    .exp_max        = (1 << E) - 1,                             \
    .frac_size      = F,                                        \
    .frac_shift     = (-F - 1) & 63,                            \
    .frac_lsb       = 1ull << ((-F - 1) & 63),                  \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                  \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,            \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
588 static const FloatFmt float16_params
= {
592 static const FloatFmt float16_params_ahp
= {
597 static const FloatFmt bfloat16_params
= {
601 static const FloatFmt float32_params
= {
605 static const FloatFmt float64_params
= {
609 static const FloatFmt float128_params
= {
610 FLOAT_PARAMS(15, 112)
613 /* Unpack a float to parts, but do not canonicalize. */
614 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
616 const int f_size
= fmt
->frac_size
;
617 const int e_size
= fmt
->exp_size
;
619 *r
= (FloatParts64
) {
620 .cls
= float_class_unclassified
,
621 .sign
= extract64(raw
, f_size
+ e_size
, 1),
622 .exp
= extract64(raw
, f_size
, e_size
),
623 .frac
= extract64(raw
, 0, f_size
)
627 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
629 unpack_raw64(p
, &float16_params
, f
);
632 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
634 unpack_raw64(p
, &bfloat16_params
, f
);
637 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
639 unpack_raw64(p
, &float32_params
, f
);
642 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
644 unpack_raw64(p
, &float64_params
, f
);
647 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
649 const int f_size
= float128_params
.frac_size
- 64;
650 const int e_size
= float128_params
.exp_size
;
652 *p
= (FloatParts128
) {
653 .cls
= float_class_unclassified
,
654 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
655 .exp
= extract64(f
.high
, f_size
, e_size
),
656 .frac_hi
= extract64(f
.high
, 0, f_size
),
661 /* Pack a float from parts, but do not canonicalize. */
662 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
664 const int f_size
= fmt
->frac_size
;
665 const int e_size
= fmt
->exp_size
;
668 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
669 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
670 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
674 static inline float16
float16_pack_raw(const FloatParts64
*p
)
676 return make_float16(pack_raw64(p
, &float16_params
));
679 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
681 return pack_raw64(p
, &bfloat16_params
);
684 static inline float32
float32_pack_raw(const FloatParts64
*p
)
686 return make_float32(pack_raw64(p
, &float32_params
));
689 static inline float64
float64_pack_raw(const FloatParts64
*p
)
691 return make_float64(pack_raw64(p
, &float64_params
));
694 static float128
float128_pack_raw(const FloatParts128
*p
)
696 const int f_size
= float128_params
.frac_size
- 64;
697 const int e_size
= float128_params
.exp_size
;
700 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
701 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
702 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
703 return make_float128(hi
, p
->frac_lo
);
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine: (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output. These details are target-
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
716 #define PARTS_GENERIC_64_128(NAME, P) \
717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
719 #define PARTS_GENERIC_64_128_256(NAME, P) \
720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
721 (FloatParts128 *, parts128_##NAME), parts64_##NAME)
723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
726 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
727 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
731 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
733 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
738 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
739 FloatParts64
*c
, float_status
*s
,
740 int ab_mask
, int abc_mask
);
741 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
745 int ab_mask
, int abc_mask
);
747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
750 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
751 const FloatFmt
*fmt
);
752 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
753 const FloatFmt
*fmt
);
755 #define parts_canonicalize(A, S, F) \
756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
758 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
759 const FloatFmt
*fmt
);
760 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
761 const FloatFmt
*fmt
);
763 #define parts_uncanon(A, S, F) \
764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
766 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
767 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
768 static void parts256_add_normal(FloatParts256
*a
, FloatParts256
*b
);
770 #define parts_add_normal(A, B) \
771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B)
773 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
774 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
775 static bool parts256_sub_normal(FloatParts256
*a
, FloatParts256
*b
);
777 #define parts_sub_normal(A, B) \
778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)
780 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
781 float_status
*s
, bool subtract
);
782 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
783 float_status
*s
, bool subtract
);
785 #define parts_addsub(A, B, S, Z) \
786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
788 static FloatParts64
*parts64_mul(FloatParts64
*a
, FloatParts64
*b
,
790 static FloatParts128
*parts128_mul(FloatParts128
*a
, FloatParts128
*b
,
793 #define parts_mul(A, B, S) \
794 PARTS_GENERIC_64_128(mul, A)(A, B, S)
796 static FloatParts64
*parts64_muladd(FloatParts64
*a
, FloatParts64
*b
,
797 FloatParts64
*c
, int flags
,
799 static FloatParts128
*parts128_muladd(FloatParts128
*a
, FloatParts128
*b
,
800 FloatParts128
*c
, int flags
,
803 #define parts_muladd(A, B, C, Z, S) \
804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
806 static FloatParts64
*parts64_div(FloatParts64
*a
, FloatParts64
*b
,
808 static FloatParts128
*parts128_div(FloatParts128
*a
, FloatParts128
*b
,
811 #define parts_div(A, B, S) \
812 PARTS_GENERIC_64_128(div, A)(A, B, S)
814 static bool parts64_round_to_int_normal(FloatParts64
*a
, FloatRoundMode rm
,
815 int scale
, int frac_size
);
816 static bool parts128_round_to_int_normal(FloatParts128
*a
, FloatRoundMode r
,
817 int scale
, int frac_size
);
819 #define parts_round_to_int_normal(A, R, C, F) \
820 PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)
822 static void parts64_round_to_int(FloatParts64
*a
, FloatRoundMode rm
,
823 int scale
, float_status
*s
,
824 const FloatFmt
*fmt
);
825 static void parts128_round_to_int(FloatParts128
*a
, FloatRoundMode r
,
826 int scale
, float_status
*s
,
827 const FloatFmt
*fmt
);
829 #define parts_round_to_int(A, R, C, S, F) \
830 PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)
832 static int64_t parts64_float_to_sint(FloatParts64
*p
, FloatRoundMode rmode
,
833 int scale
, int64_t min
, int64_t max
,
835 static int64_t parts128_float_to_sint(FloatParts128
*p
, FloatRoundMode rmode
,
836 int scale
, int64_t min
, int64_t max
,
839 #define parts_float_to_sint(P, R, Z, MN, MX, S) \
840 PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)
842 static uint64_t parts64_float_to_uint(FloatParts64
*p
, FloatRoundMode rmode
,
843 int scale
, uint64_t max
,
845 static uint64_t parts128_float_to_uint(FloatParts128
*p
, FloatRoundMode rmode
,
846 int scale
, uint64_t max
,
849 #define parts_float_to_uint(P, R, Z, M, S) \
850 PARTS_GENERIC_64_128(float_to_uint, P)(P, R, Z, M, S)
852 static void parts64_sint_to_float(FloatParts64
*p
, int64_t a
,
853 int scale
, float_status
*s
);
854 static void parts128_sint_to_float(FloatParts128
*p
, int64_t a
,
855 int scale
, float_status
*s
);
857 #define parts_sint_to_float(P, I, Z, S) \
858 PARTS_GENERIC_64_128(sint_to_float, P)(P, I, Z, S)
860 static void parts64_uint_to_float(FloatParts64
*p
, uint64_t a
,
861 int scale
, float_status
*s
);
862 static void parts128_uint_to_float(FloatParts128
*p
, uint64_t a
,
863 int scale
, float_status
*s
);
865 #define parts_uint_to_float(P, I, Z, S) \
866 PARTS_GENERIC_64_128(uint_to_float, P)(P, I, Z, S)
869 * Helper functions for softfloat-parts.c.inc, per-size operations.
872 #define FRAC_GENERIC_64_128(NAME, P) \
873 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
875 #define FRAC_GENERIC_64_128_256(NAME, P) \
876 QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
877 (FloatParts128 *, frac128_##NAME), frac64_##NAME)
879 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
881 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
884 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
887 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
888 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
892 static bool frac256_add(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
895 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
896 r
->frac_lm
= uadd64_carry(a
->frac_lm
, b
->frac_lm
, &c
);
897 r
->frac_hm
= uadd64_carry(a
->frac_hm
, b
->frac_hm
, &c
);
898 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
902 #define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)
904 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
906 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
909 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
911 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
912 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
915 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
917 static void frac64_allones(FloatParts64
*a
)
922 static void frac128_allones(FloatParts128
*a
)
924 a
->frac_hi
= a
->frac_lo
= -1;
927 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
929 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
931 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
934 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
936 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
938 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
943 return ta
< tb
? -1 : 1;
946 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
948 static void frac64_clear(FloatParts64
*a
)
953 static void frac128_clear(FloatParts128
*a
)
955 a
->frac_hi
= a
->frac_lo
= 0;
958 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
960 static bool frac64_div(FloatParts64
*a
, FloatParts64
*b
)
962 uint64_t n1
, n0
, r
, q
;
966 * We want a 2*N / N-bit division to produce exactly an N-bit
967 * result, so that we do not lose any precision and so that we
968 * do not have to renormalize afterward. If A.frac < B.frac,
969 * then division would produce an (N-1)-bit result; shift A left
970 * by one to produce the an N-bit result, and return true to
971 * decrement the exponent to match.
973 * The udiv_qrnnd algorithm that we're using requires normalization,
974 * i.e. the msb of the denominator must be set, which is already true.
976 ret
= a
->frac
< b
->frac
;
984 q
= udiv_qrnnd(&r
, n0
, n1
, b
->frac
);
986 /* Set lsb if there is a remainder, to set inexact. */
987 a
->frac
= q
| (r
!= 0);
992 static bool frac128_div(FloatParts128
*a
, FloatParts128
*b
)
994 uint64_t q0
, q1
, a0
, a1
, b0
, b1
;
995 uint64_t r0
, r1
, r2
, r3
, t0
, t1
, t2
, t3
;
998 a0
= a
->frac_hi
, a1
= a
->frac_lo
;
999 b0
= b
->frac_hi
, b1
= b
->frac_lo
;
1001 ret
= lt128(a0
, a1
, b0
, b1
);
1003 a1
= shr_double(a0
, a1
, 1);
1007 /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
1008 q0
= estimateDiv128To64(a0
, a1
, b0
);
1011 * Estimate is high because B1 was not included (unless B1 == 0).
1012 * Reduce quotient and increase remainder until remainder is non-negative.
1013 * This loop will execute 0 to 2 times.
1015 mul128By64To192(b0
, b1
, q0
, &t0
, &t1
, &t2
);
1016 sub192(a0
, a1
, 0, t0
, t1
, t2
, &r0
, &r1
, &r2
);
1019 add192(r0
, r1
, r2
, 0, b0
, b1
, &r0
, &r1
, &r2
);
1022 /* Repeat using the remainder, producing a second word of quotient. */
1023 q1
= estimateDiv128To64(r1
, r2
, b0
);
1024 mul128By64To192(b0
, b1
, q1
, &t1
, &t2
, &t3
);
1025 sub192(r1
, r2
, 0, t1
, t2
, t3
, &r1
, &r2
, &r3
);
1028 add192(r1
, r2
, r3
, 0, b0
, b1
, &r1
, &r2
, &r3
);
1031 /* Any remainder indicates inexact; set sticky bit. */
1032 q1
|= (r2
| r3
) != 0;
1039 #define frac_div(A, B) FRAC_GENERIC_64_128(div, A)(A, B)
1041 static bool frac64_eqz(FloatParts64
*a
)
1043 return a
->frac
== 0;
1046 static bool frac128_eqz(FloatParts128
*a
)
1048 return (a
->frac_hi
| a
->frac_lo
) == 0;
1051 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
1053 static void frac64_mulw(FloatParts128
*r
, FloatParts64
*a
, FloatParts64
*b
)
1055 mulu64(&r
->frac_lo
, &r
->frac_hi
, a
->frac
, b
->frac
);
1058 static void frac128_mulw(FloatParts256
*r
, FloatParts128
*a
, FloatParts128
*b
)
1060 mul128To256(a
->frac_hi
, a
->frac_lo
, b
->frac_hi
, b
->frac_lo
,
1061 &r
->frac_hi
, &r
->frac_hm
, &r
->frac_lm
, &r
->frac_lo
);
1064 #define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)
1066 static void frac64_neg(FloatParts64
*a
)
1071 static void frac128_neg(FloatParts128
*a
)
1074 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
1075 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
1078 static void frac256_neg(FloatParts256
*a
)
1081 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
1082 a
->frac_lm
= usub64_borrow(0, a
->frac_lm
, &c
);
1083 a
->frac_hm
= usub64_borrow(0, a
->frac_hm
, &c
);
1084 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
1087 #define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)
1089 static int frac64_normalize(FloatParts64
*a
)
1092 int shift
= clz64(a
->frac
);
1099 static int frac128_normalize(FloatParts128
*a
)
1102 int shl
= clz64(a
->frac_hi
);
1103 a
->frac_hi
= shl_double(a
->frac_hi
, a
->frac_lo
, shl
);
1106 } else if (a
->frac_lo
) {
1107 int shl
= clz64(a
->frac_lo
);
1108 a
->frac_hi
= a
->frac_lo
<< shl
;
1115 static int frac256_normalize(FloatParts256
*a
)
1117 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1118 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1130 a0
= a1
, a1
= a2
, a2
= a3
, a3
= 0;
1133 a0
= a2
, a1
= a3
, a2
= 0, a3
= 0;
1136 a0
= a3
, a1
= 0, a2
= 0, a3
= 0;
1139 a0
= 0, a1
= 0, a2
= 0, a3
= 0;
1149 a0
= shl_double(a0
, a1
, shl
);
1150 a1
= shl_double(a1
, a2
, shl
);
1151 a2
= shl_double(a2
, a3
, shl
);
1162 #define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)
1164 static void frac64_shl(FloatParts64
*a
, int c
)
1169 static void frac128_shl(FloatParts128
*a
, int c
)
1171 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1179 a0
= shl_double(a0
, a1
, c
);
1187 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
1189 static void frac64_shr(FloatParts64
*a
, int c
)
1194 static void frac128_shr(FloatParts128
*a
, int c
)
1196 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1204 a1
= shr_double(a0
, a1
, c
);
1212 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
1214 static void frac64_shrjam(FloatParts64
*a
, int c
)
1216 uint64_t a0
= a
->frac
;
1218 if (likely(c
!= 0)) {
1219 if (likely(c
< 64)) {
1220 a0
= (a0
>> c
) | (shr_double(a0
, 0, c
) != 0);
1228 static void frac128_shrjam(FloatParts128
*a
, int c
)
1230 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_lo
;
1231 uint64_t sticky
= 0;
1233 if (unlikely(c
== 0)) {
1235 } else if (likely(c
< 64)) {
1237 } else if (likely(c
< 128)) {
1251 sticky
|= shr_double(a1
, 0, c
);
1252 a1
= shr_double(a0
, a1
, c
);
1256 a
->frac_lo
= a1
| (sticky
!= 0);
1260 static void frac256_shrjam(FloatParts256
*a
, int c
)
1262 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1263 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1264 uint64_t sticky
= 0;
1266 if (unlikely(c
== 0)) {
1268 } else if (likely(c
< 64)) {
1270 } else if (likely(c
< 256)) {
1271 if (unlikely(c
& 128)) {
1273 a3
= a1
, a2
= a0
, a1
= 0, a0
= 0;
1275 if (unlikely(c
& 64)) {
1277 a3
= a2
, a2
= a1
, a1
= a0
, a0
= 0;
1284 sticky
= a0
| a1
| a2
| a3
;
1285 a0
= a1
= a2
= a3
= 0;
1289 sticky
|= shr_double(a3
, 0, c
);
1290 a3
= shr_double(a2
, a3
, c
);
1291 a2
= shr_double(a1
, a2
, c
);
1292 a1
= shr_double(a0
, a1
, c
);
1296 a
->frac_lo
= a3
| (sticky
!= 0);
1302 #define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
1304 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
1306 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
1309 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
1312 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1313 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1317 static bool frac256_sub(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
1320 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1321 r
->frac_lm
= usub64_borrow(a
->frac_lm
, b
->frac_lm
, &c
);
1322 r
->frac_hm
= usub64_borrow(a
->frac_hm
, b
->frac_hm
, &c
);
1323 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1327 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)
1329 static void frac64_truncjam(FloatParts64
*r
, FloatParts128
*a
)
1331 r
->frac
= a
->frac_hi
| (a
->frac_lo
!= 0);
1334 static void frac128_truncjam(FloatParts128
*r
, FloatParts256
*a
)
1336 r
->frac_hi
= a
->frac_hi
;
1337 r
->frac_lo
= a
->frac_hm
| ((a
->frac_lm
| a
->frac_lo
) != 0);
1340 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)
1342 static void frac64_widen(FloatParts128
*r
, FloatParts64
*a
)
1344 r
->frac_hi
= a
->frac
;
1348 static void frac128_widen(FloatParts256
*r
, FloatParts128
*a
)
1350 r
->frac_hi
= a
->frac_hi
;
1351 r
->frac_hm
= a
->frac_lo
;
1356 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
1358 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
1359 #define FloatPartsN glue(FloatParts,N)
1360 #define FloatPartsW glue(FloatParts,W)
1365 #include "softfloat-parts-addsub.c.inc"
1366 #include "softfloat-parts.c.inc"
1373 #include "softfloat-parts-addsub.c.inc"
1374 #include "softfloat-parts.c.inc"
1380 #include "softfloat-parts-addsub.c.inc"
1389 * Pack/unpack routines with a specific FloatFmt.
1392 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
1393 float_status
*s
, const FloatFmt
*params
)
1395 float16_unpack_raw(p
, f
);
1396 parts_canonicalize(p
, s
, params
);
1399 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
1402 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
1405 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
1408 bfloat16_unpack_raw(p
, f
);
1409 parts_canonicalize(p
, s
, &bfloat16_params
);
1412 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1414 const FloatFmt
*params
)
1416 parts_uncanon(p
, s
, params
);
1417 return float16_pack_raw(p
);
1420 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1423 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1426 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1429 parts_uncanon(p
, s
, &bfloat16_params
);
1430 return bfloat16_pack_raw(p
);
1433 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1436 float32_unpack_raw(p
, f
);
1437 parts_canonicalize(p
, s
, &float32_params
);
1440 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1443 parts_uncanon(p
, s
, &float32_params
);
1444 return float32_pack_raw(p
);
1447 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1450 float64_unpack_raw(p
, f
);
1451 parts_canonicalize(p
, s
, &float64_params
);
1454 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1457 parts_uncanon(p
, s
, &float64_params
);
1458 return float64_pack_raw(p
);
1461 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1464 float128_unpack_raw(p
, f
);
1465 parts_canonicalize(p
, s
, &float128_params
);
1468 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1471 parts_uncanon(p
, s
, &float128_params
);
1472 return float128_pack_raw(p
);
1476 * Addition and subtraction
1479 static float16 QEMU_FLATTEN
1480 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1482 FloatParts64 pa
, pb
, *pr
;
1484 float16_unpack_canonical(&pa
, a
, status
);
1485 float16_unpack_canonical(&pb
, b
, status
);
1486 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1488 return float16_round_pack_canonical(pr
, status
);
1491 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1493 return float16_addsub(a
, b
, status
, false);
1496 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1498 return float16_addsub(a
, b
, status
, true);
1501 static float32 QEMU_SOFTFLOAT_ATTR
1502 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1504 FloatParts64 pa
, pb
, *pr
;
1506 float32_unpack_canonical(&pa
, a
, status
);
1507 float32_unpack_canonical(&pb
, b
, status
);
1508 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1510 return float32_round_pack_canonical(pr
, status
);
1513 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1515 return soft_f32_addsub(a
, b
, status
, false);
1518 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1520 return soft_f32_addsub(a
, b
, status
, true);
1523 static float64 QEMU_SOFTFLOAT_ATTR
1524 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1526 FloatParts64 pa
, pb
, *pr
;
1528 float64_unpack_canonical(&pa
, a
, status
);
1529 float64_unpack_canonical(&pb
, b
, status
);
1530 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1532 return float64_round_pack_canonical(pr
, status
);
1535 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1537 return soft_f64_addsub(a
, b
, status
, false);
1540 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1542 return soft_f64_addsub(a
, b
, status
, true);
1545 static float hard_f32_add(float a
, float b
)
1550 static float hard_f32_sub(float a
, float b
)
1555 static double hard_f64_add(double a
, double b
)
1560 static double hard_f64_sub(double a
, double b
)
1565 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1567 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1568 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1570 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1573 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1575 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1576 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1578 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1582 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1583 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1585 return float32_gen2(a
, b
, s
, hard
, soft
,
1586 f32_is_zon2
, f32_addsubmul_post
);
1589 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1590 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1592 return float64_gen2(a
, b
, s
, hard
, soft
,
1593 f64_is_zon2
, f64_addsubmul_post
);
1596 float32 QEMU_FLATTEN
1597 float32_add(float32 a
, float32 b
, float_status
*s
)
1599 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1602 float32 QEMU_FLATTEN
1603 float32_sub(float32 a
, float32 b
, float_status
*s
)
1605 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1608 float64 QEMU_FLATTEN
1609 float64_add(float64 a
, float64 b
, float_status
*s
)
1611 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1614 float64 QEMU_FLATTEN
1615 float64_sub(float64 a
, float64 b
, float_status
*s
)
1617 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1620 static bfloat16 QEMU_FLATTEN
1621 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1623 FloatParts64 pa
, pb
, *pr
;
1625 bfloat16_unpack_canonical(&pa
, a
, status
);
1626 bfloat16_unpack_canonical(&pb
, b
, status
);
1627 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1629 return bfloat16_round_pack_canonical(pr
, status
);
1632 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1634 return bfloat16_addsub(a
, b
, status
, false);
1637 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1639 return bfloat16_addsub(a
, b
, status
, true);
1642 static float128 QEMU_FLATTEN
1643 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1645 FloatParts128 pa
, pb
, *pr
;
1647 float128_unpack_canonical(&pa
, a
, status
);
1648 float128_unpack_canonical(&pb
, b
, status
);
1649 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1651 return float128_round_pack_canonical(pr
, status
);
1654 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
1656 return float128_addsub(a
, b
, status
, false);
1659 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
1661 return float128_addsub(a
, b
, status
, true);
1668 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1670 FloatParts64 pa
, pb
, *pr
;
1672 float16_unpack_canonical(&pa
, a
, status
);
1673 float16_unpack_canonical(&pb
, b
, status
);
1674 pr
= parts_mul(&pa
, &pb
, status
);
1676 return float16_round_pack_canonical(pr
, status
);
1679 static float32 QEMU_SOFTFLOAT_ATTR
1680 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1682 FloatParts64 pa
, pb
, *pr
;
1684 float32_unpack_canonical(&pa
, a
, status
);
1685 float32_unpack_canonical(&pb
, b
, status
);
1686 pr
= parts_mul(&pa
, &pb
, status
);
1688 return float32_round_pack_canonical(pr
, status
);
1691 static float64 QEMU_SOFTFLOAT_ATTR
1692 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1694 FloatParts64 pa
, pb
, *pr
;
1696 float64_unpack_canonical(&pa
, a
, status
);
1697 float64_unpack_canonical(&pb
, b
, status
);
1698 pr
= parts_mul(&pa
, &pb
, status
);
1700 return float64_round_pack_canonical(pr
, status
);
1703 static float hard_f32_mul(float a
, float b
)
1708 static double hard_f64_mul(double a
, double b
)
1713 float32 QEMU_FLATTEN
1714 float32_mul(float32 a
, float32 b
, float_status
*s
)
1716 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1717 f32_is_zon2
, f32_addsubmul_post
);
1720 float64 QEMU_FLATTEN
1721 float64_mul(float64 a
, float64 b
, float_status
*s
)
1723 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1724 f64_is_zon2
, f64_addsubmul_post
);
1727 bfloat16 QEMU_FLATTEN
1728 bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1730 FloatParts64 pa
, pb
, *pr
;
1732 bfloat16_unpack_canonical(&pa
, a
, status
);
1733 bfloat16_unpack_canonical(&pb
, b
, status
);
1734 pr
= parts_mul(&pa
, &pb
, status
);
1736 return bfloat16_round_pack_canonical(pr
, status
);
1739 float128 QEMU_FLATTEN
1740 float128_mul(float128 a
, float128 b
, float_status
*status
)
1742 FloatParts128 pa
, pb
, *pr
;
1744 float128_unpack_canonical(&pa
, a
, status
);
1745 float128_unpack_canonical(&pb
, b
, status
);
1746 pr
= parts_mul(&pa
, &pb
, status
);
1748 return float128_round_pack_canonical(pr
, status
);
1752 * Fused multiply-add
1755 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1756 int flags
, float_status
*status
)
1758 FloatParts64 pa
, pb
, pc
, *pr
;
1760 float16_unpack_canonical(&pa
, a
, status
);
1761 float16_unpack_canonical(&pb
, b
, status
);
1762 float16_unpack_canonical(&pc
, c
, status
);
1763 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1765 return float16_round_pack_canonical(pr
, status
);
1768 static float32 QEMU_SOFTFLOAT_ATTR
1769 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1770 float_status
*status
)
1772 FloatParts64 pa
, pb
, pc
, *pr
;
1774 float32_unpack_canonical(&pa
, a
, status
);
1775 float32_unpack_canonical(&pb
, b
, status
);
1776 float32_unpack_canonical(&pc
, c
, status
);
1777 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1779 return float32_round_pack_canonical(pr
, status
);
1782 static float64 QEMU_SOFTFLOAT_ATTR
1783 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1784 float_status
*status
)
1786 FloatParts64 pa
, pb
, pc
, *pr
;
1788 float64_unpack_canonical(&pa
, a
, status
);
1789 float64_unpack_canonical(&pb
, b
, status
);
1790 float64_unpack_canonical(&pc
, c
, status
);
1791 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1793 return float64_round_pack_canonical(pr
, status
);
1796 static bool force_soft_fma
;
1798 float32 QEMU_FLATTEN
1799 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1801 union_float32 ua
, ub
, uc
, ur
;
1807 if (unlikely(!can_use_fpu(s
))) {
1810 if (unlikely(flags
& float_muladd_halve_result
)) {
1814 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1815 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1819 if (unlikely(force_soft_fma
)) {
1824 * When (a || b) == 0, there's no need to check for under/over flow,
1825 * since we know the addend is (normal || 0) and the product is 0.
1827 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1831 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1832 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1833 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1835 if (flags
& float_muladd_negate_c
) {
1840 union_float32 ua_orig
= ua
;
1841 union_float32 uc_orig
= uc
;
1843 if (flags
& float_muladd_negate_product
) {
1846 if (flags
& float_muladd_negate_c
) {
1850 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1852 if (unlikely(f32_is_inf(ur
))) {
1853 float_raise(float_flag_overflow
, s
);
1854 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1860 if (flags
& float_muladd_negate_result
) {
1861 return float32_chs(ur
.s
);
1866 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1869 float64 QEMU_FLATTEN
1870 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1872 union_float64 ua
, ub
, uc
, ur
;
1878 if (unlikely(!can_use_fpu(s
))) {
1881 if (unlikely(flags
& float_muladd_halve_result
)) {
1885 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1886 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1890 if (unlikely(force_soft_fma
)) {
1895 * When (a || b) == 0, there's no need to check for under/over flow,
1896 * since we know the addend is (normal || 0) and the product is 0.
1898 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1902 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1903 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1904 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1906 if (flags
& float_muladd_negate_c
) {
1911 union_float64 ua_orig
= ua
;
1912 union_float64 uc_orig
= uc
;
1914 if (flags
& float_muladd_negate_product
) {
1917 if (flags
& float_muladd_negate_c
) {
1921 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1923 if (unlikely(f64_is_inf(ur
))) {
1924 float_raise(float_flag_overflow
, s
);
1925 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1931 if (flags
& float_muladd_negate_result
) {
1932 return float64_chs(ur
.s
);
1937 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1940 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1941 int flags
, float_status
*status
)
1943 FloatParts64 pa
, pb
, pc
, *pr
;
1945 bfloat16_unpack_canonical(&pa
, a
, status
);
1946 bfloat16_unpack_canonical(&pb
, b
, status
);
1947 bfloat16_unpack_canonical(&pc
, c
, status
);
1948 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1950 return bfloat16_round_pack_canonical(pr
, status
);
1953 float128 QEMU_FLATTEN
float128_muladd(float128 a
, float128 b
, float128 c
,
1954 int flags
, float_status
*status
)
1956 FloatParts128 pa
, pb
, pc
, *pr
;
1958 float128_unpack_canonical(&pa
, a
, status
);
1959 float128_unpack_canonical(&pb
, b
, status
);
1960 float128_unpack_canonical(&pc
, c
, status
);
1961 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1963 return float128_round_pack_canonical(pr
, status
);
1970 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1972 FloatParts64 pa
, pb
, *pr
;
1974 float16_unpack_canonical(&pa
, a
, status
);
1975 float16_unpack_canonical(&pb
, b
, status
);
1976 pr
= parts_div(&pa
, &pb
, status
);
1978 return float16_round_pack_canonical(pr
, status
);
1981 static float32 QEMU_SOFTFLOAT_ATTR
1982 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1984 FloatParts64 pa
, pb
, *pr
;
1986 float32_unpack_canonical(&pa
, a
, status
);
1987 float32_unpack_canonical(&pb
, b
, status
);
1988 pr
= parts_div(&pa
, &pb
, status
);
1990 return float32_round_pack_canonical(pr
, status
);
1993 static float64 QEMU_SOFTFLOAT_ATTR
1994 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1996 FloatParts64 pa
, pb
, *pr
;
1998 float64_unpack_canonical(&pa
, a
, status
);
1999 float64_unpack_canonical(&pb
, b
, status
);
2000 pr
= parts_div(&pa
, &pb
, status
);
2002 return float64_round_pack_canonical(pr
, status
);
2005 static float hard_f32_div(float a
, float b
)
2010 static double hard_f64_div(double a
, double b
)
2015 static bool f32_div_pre(union_float32 a
, union_float32 b
)
2017 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
2018 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
2019 fpclassify(b
.h
) == FP_NORMAL
;
2021 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
2024 static bool f64_div_pre(union_float64 a
, union_float64 b
)
2026 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
2027 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
2028 fpclassify(b
.h
) == FP_NORMAL
;
2030 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
2033 static bool f32_div_post(union_float32 a
, union_float32 b
)
2035 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
2036 return fpclassify(a
.h
) != FP_ZERO
;
2038 return !float32_is_zero(a
.s
);
2041 static bool f64_div_post(union_float64 a
, union_float64 b
)
2043 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
2044 return fpclassify(a
.h
) != FP_ZERO
;
2046 return !float64_is_zero(a
.s
);
2049 float32 QEMU_FLATTEN
2050 float32_div(float32 a
, float32 b
, float_status
*s
)
2052 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
2053 f32_div_pre
, f32_div_post
);
2056 float64 QEMU_FLATTEN
2057 float64_div(float64 a
, float64 b
, float_status
*s
)
2059 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
2060 f64_div_pre
, f64_div_post
);
2063 bfloat16 QEMU_FLATTEN
2064 bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
2066 FloatParts64 pa
, pb
, *pr
;
2068 bfloat16_unpack_canonical(&pa
, a
, status
);
2069 bfloat16_unpack_canonical(&pb
, b
, status
);
2070 pr
= parts_div(&pa
, &pb
, status
);
2072 return bfloat16_round_pack_canonical(pr
, status
);
2075 float128 QEMU_FLATTEN
2076 float128_div(float128 a
, float128 b
, float_status
*status
)
2078 FloatParts128 pa
, pb
, *pr
;
2080 float128_unpack_canonical(&pa
, a
, status
);
2081 float128_unpack_canonical(&pb
, b
, status
);
2082 pr
= parts_div(&pa
, &pb
, status
);
2084 return float128_round_pack_canonical(pr
, status
);
2088 * Float to Float conversions
2090 * Returns the result of converting one float format to another. The
2091 * conversion is performed according to the IEC/IEEE Standard for
2092 * Binary Floating-Point Arithmetic.
2094 * Usually this only needs to take care of raising invalid exceptions
2095 * and handling the conversion on NaNs.
2098 static void parts_float_to_ahp(FloatParts64
*a
, float_status
*s
)
2101 case float_class_qnan
:
2102 case float_class_snan
:
2104 * There is no NaN in the destination format. Raise Invalid
2105 * and return a zero with the sign of the input NaN.
2107 float_raise(float_flag_invalid
, s
);
2108 a
->cls
= float_class_zero
;
2111 case float_class_inf
:
2113 * There is no Inf in the destination format. Raise Invalid
2114 * and return the maximum normal with the correct sign.
2116 float_raise(float_flag_invalid
, s
);
2117 a
->cls
= float_class_normal
;
2118 a
->exp
= float16_params_ahp
.exp_max
;
2119 a
->frac
= MAKE_64BIT_MASK(float16_params_ahp
.frac_shift
,
2120 float16_params_ahp
.frac_size
+ 1);
2123 case float_class_normal
:
2124 case float_class_zero
:
2128 g_assert_not_reached();
2132 static void parts64_float_to_float(FloatParts64
*a
, float_status
*s
)
2134 if (is_nan(a
->cls
)) {
2135 parts_return_nan(a
, s
);
2139 static void parts128_float_to_float(FloatParts128
*a
, float_status
*s
)
2141 if (is_nan(a
->cls
)) {
2142 parts_return_nan(a
, s
);
2146 #define parts_float_to_float(P, S) \
2147 PARTS_GENERIC_64_128(float_to_float, P)(P, S)
2149 static void parts_float_to_float_narrow(FloatParts64
*a
, FloatParts128
*b
,
2156 if (a
->cls
== float_class_normal
) {
2157 frac_truncjam(a
, b
);
2158 } else if (is_nan(a
->cls
)) {
2159 /* Discard the low bits of the NaN. */
2160 a
->frac
= b
->frac_hi
;
2161 parts_return_nan(a
, s
);
2165 static void parts_float_to_float_widen(FloatParts128
*a
, FloatParts64
*b
,
2173 if (is_nan(a
->cls
)) {
2174 parts_return_nan(a
, s
);
2178 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
2180 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2183 float16a_unpack_canonical(&p
, a
, s
, fmt16
);
2184 parts_float_to_float(&p
, s
);
2185 return float32_round_pack_canonical(&p
, s
);
2188 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2190 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2193 float16a_unpack_canonical(&p
, a
, s
, fmt16
);
2194 parts_float_to_float(&p
, s
);
2195 return float64_round_pack_canonical(&p
, s
);
2198 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2201 const FloatFmt
*fmt
;
2203 float32_unpack_canonical(&p
, a
, s
);
2205 parts_float_to_float(&p
, s
);
2206 fmt
= &float16_params
;
2208 parts_float_to_ahp(&p
, s
);
2209 fmt
= &float16_params_ahp
;
2211 return float16a_round_pack_canonical(&p
, s
, fmt
);
2214 static float64 QEMU_SOFTFLOAT_ATTR
2215 soft_float32_to_float64(float32 a
, float_status
*s
)
2219 float32_unpack_canonical(&p
, a
, s
);
2220 parts_float_to_float(&p
, s
);
2221 return float64_round_pack_canonical(&p
, s
);
2224 float64
float32_to_float64(float32 a
, float_status
*s
)
2226 if (likely(float32_is_normal(a
))) {
2227 /* Widening conversion can never produce inexact results. */
2233 } else if (float32_is_zero(a
)) {
2234 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2236 return soft_float32_to_float64(a
, s
);
2240 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2243 const FloatFmt
*fmt
;
2245 float64_unpack_canonical(&p
, a
, s
);
2247 parts_float_to_float(&p
, s
);
2248 fmt
= &float16_params
;
2250 parts_float_to_ahp(&p
, s
);
2251 fmt
= &float16_params_ahp
;
2253 return float16a_round_pack_canonical(&p
, s
, fmt
);
2256 float32
float64_to_float32(float64 a
, float_status
*s
)
2260 float64_unpack_canonical(&p
, a
, s
);
2261 parts_float_to_float(&p
, s
);
2262 return float32_round_pack_canonical(&p
, s
);
2265 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2269 bfloat16_unpack_canonical(&p
, a
, s
);
2270 parts_float_to_float(&p
, s
);
2271 return float32_round_pack_canonical(&p
, s
);
2274 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2278 bfloat16_unpack_canonical(&p
, a
, s
);
2279 parts_float_to_float(&p
, s
);
2280 return float64_round_pack_canonical(&p
, s
);
2283 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2287 float32_unpack_canonical(&p
, a
, s
);
2288 parts_float_to_float(&p
, s
);
2289 return bfloat16_round_pack_canonical(&p
, s
);
2292 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2296 float64_unpack_canonical(&p
, a
, s
);
2297 parts_float_to_float(&p
, s
);
2298 return bfloat16_round_pack_canonical(&p
, s
);
2301 float32
float128_to_float32(float128 a
, float_status
*s
)
2306 float128_unpack_canonical(&p128
, a
, s
);
2307 parts_float_to_float_narrow(&p64
, &p128
, s
);
2308 return float32_round_pack_canonical(&p64
, s
);
2311 float64
float128_to_float64(float128 a
, float_status
*s
)
2316 float128_unpack_canonical(&p128
, a
, s
);
2317 parts_float_to_float_narrow(&p64
, &p128
, s
);
2318 return float64_round_pack_canonical(&p64
, s
);
2321 float128
float32_to_float128(float32 a
, float_status
*s
)
2326 float32_unpack_canonical(&p64
, a
, s
);
2327 parts_float_to_float_widen(&p128
, &p64
, s
);
2328 return float128_round_pack_canonical(&p128
, s
);
2331 float128
float64_to_float128(float64 a
, float_status
*s
)
2336 float64_unpack_canonical(&p64
, a
, s
);
2337 parts_float_to_float_widen(&p128
, &p64
, s
);
2338 return float128_round_pack_canonical(&p128
, s
);
2342 * Round to integral value
2345 float16
float16_round_to_int(float16 a
, float_status
*s
)
2349 float16_unpack_canonical(&p
, a
, s
);
2350 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float16_params
);
2351 return float16_round_pack_canonical(&p
, s
);
2354 float32
float32_round_to_int(float32 a
, float_status
*s
)
2358 float32_unpack_canonical(&p
, a
, s
);
2359 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float32_params
);
2360 return float32_round_pack_canonical(&p
, s
);
2363 float64
float64_round_to_int(float64 a
, float_status
*s
)
2367 float64_unpack_canonical(&p
, a
, s
);
2368 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float64_params
);
2369 return float64_round_pack_canonical(&p
, s
);
2372 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2376 bfloat16_unpack_canonical(&p
, a
, s
);
2377 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &bfloat16_params
);
2378 return bfloat16_round_pack_canonical(&p
, s
);
2381 float128
float128_round_to_int(float128 a
, float_status
*s
)
2385 float128_unpack_canonical(&p
, a
, s
);
2386 parts_round_to_int(&p
, s
->float_rounding_mode
, 0, s
, &float128_params
);
2387 return float128_round_pack_canonical(&p
, s
);
2391 * Floating-point to signed integer conversions
2394 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2399 float16_unpack_canonical(&p
, a
, s
);
2400 return parts_float_to_sint(&p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2403 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2408 float16_unpack_canonical(&p
, a
, s
);
2409 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2412 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2417 float16_unpack_canonical(&p
, a
, s
);
2418 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2421 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2426 float16_unpack_canonical(&p
, a
, s
);
2427 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2430 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2435 float32_unpack_canonical(&p
, a
, s
);
2436 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2439 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2444 float32_unpack_canonical(&p
, a
, s
);
2445 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2448 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2453 float32_unpack_canonical(&p
, a
, s
);
2454 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2457 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2462 float64_unpack_canonical(&p
, a
, s
);
2463 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2466 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2471 float64_unpack_canonical(&p
, a
, s
);
2472 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2475 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2480 float64_unpack_canonical(&p
, a
, s
);
2481 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2484 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2489 bfloat16_unpack_canonical(&p
, a
, s
);
2490 return parts_float_to_sint(&p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2493 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2498 bfloat16_unpack_canonical(&p
, a
, s
);
2499 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2502 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2507 bfloat16_unpack_canonical(&p
, a
, s
);
2508 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2511 static int32_t float128_to_int32_scalbn(float128 a
, FloatRoundMode rmode
,
2512 int scale
, float_status
*s
)
2516 float128_unpack_canonical(&p
, a
, s
);
2517 return parts_float_to_sint(&p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2520 static int64_t float128_to_int64_scalbn(float128 a
, FloatRoundMode rmode
,
2521 int scale
, float_status
*s
)
2525 float128_unpack_canonical(&p
, a
, s
);
2526 return parts_float_to_sint(&p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2529 int8_t float16_to_int8(float16 a
, float_status
*s
)
2531 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2534 int16_t float16_to_int16(float16 a
, float_status
*s
)
2536 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2539 int32_t float16_to_int32(float16 a
, float_status
*s
)
2541 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2544 int64_t float16_to_int64(float16 a
, float_status
*s
)
2546 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2549 int16_t float32_to_int16(float32 a
, float_status
*s
)
2551 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2554 int32_t float32_to_int32(float32 a
, float_status
*s
)
2556 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2559 int64_t float32_to_int64(float32 a
, float_status
*s
)
2561 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2564 int16_t float64_to_int16(float64 a
, float_status
*s
)
2566 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2569 int32_t float64_to_int32(float64 a
, float_status
*s
)
2571 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2574 int64_t float64_to_int64(float64 a
, float_status
*s
)
2576 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2579 int32_t float128_to_int32(float128 a
, float_status
*s
)
2581 return float128_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2584 int64_t float128_to_int64(float128 a
, float_status
*s
)
2586 return float128_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2589 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2591 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2594 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2596 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2599 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2601 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2604 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2606 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2609 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2611 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2614 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2616 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2619 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2621 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2624 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2626 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2629 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2631 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2634 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*s
)
2636 return float128_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2639 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*s
)
2641 return float128_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2644 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2646 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2649 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2651 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2654 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2656 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2659 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2661 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2664 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2666 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2669 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2671 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2675 * Floating-point to unsigned integer conversions
2678 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2683 float16_unpack_canonical(&p
, a
, s
);
2684 return parts_float_to_uint(&p
, rmode
, scale
, UINT8_MAX
, s
);
2687 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2692 float16_unpack_canonical(&p
, a
, s
);
2693 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2696 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2701 float16_unpack_canonical(&p
, a
, s
);
2702 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2705 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2710 float16_unpack_canonical(&p
, a
, s
);
2711 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2714 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2719 float32_unpack_canonical(&p
, a
, s
);
2720 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2723 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2728 float32_unpack_canonical(&p
, a
, s
);
2729 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2732 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2737 float32_unpack_canonical(&p
, a
, s
);
2738 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2741 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2746 float64_unpack_canonical(&p
, a
, s
);
2747 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2750 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2755 float64_unpack_canonical(&p
, a
, s
);
2756 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2759 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2764 float64_unpack_canonical(&p
, a
, s
);
2765 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2768 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2769 int scale
, float_status
*s
)
2773 bfloat16_unpack_canonical(&p
, a
, s
);
2774 return parts_float_to_uint(&p
, rmode
, scale
, UINT16_MAX
, s
);
2777 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2778 int scale
, float_status
*s
)
2782 bfloat16_unpack_canonical(&p
, a
, s
);
2783 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2786 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2787 int scale
, float_status
*s
)
2791 bfloat16_unpack_canonical(&p
, a
, s
);
2792 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2795 static uint32_t float128_to_uint32_scalbn(float128 a
, FloatRoundMode rmode
,
2796 int scale
, float_status
*s
)
2800 float128_unpack_canonical(&p
, a
, s
);
2801 return parts_float_to_uint(&p
, rmode
, scale
, UINT32_MAX
, s
);
2804 static uint64_t float128_to_uint64_scalbn(float128 a
, FloatRoundMode rmode
,
2805 int scale
, float_status
*s
)
2809 float128_unpack_canonical(&p
, a
, s
);
2810 return parts_float_to_uint(&p
, rmode
, scale
, UINT64_MAX
, s
);
2813 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2815 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2818 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2820 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2823 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2825 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2828 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2830 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2833 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2835 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2838 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2840 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2843 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2845 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2848 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2850 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2853 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2855 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2858 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2860 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2863 uint32_t float128_to_uint32(float128 a
, float_status
*s
)
2865 return float128_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2868 uint64_t float128_to_uint64(float128 a
, float_status
*s
)
2870 return float128_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2873 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2875 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2878 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2880 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2883 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2885 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2888 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2890 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2893 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2895 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2898 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2900 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2903 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2905 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2908 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2910 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2913 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2915 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2918 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*s
)
2920 return float128_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2923 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*s
)
2925 return float128_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2928 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2930 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2933 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2935 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2938 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2940 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2943 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2945 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2948 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2950 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2953 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2955 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2959 * Signed integer to floating-point conversions
2962 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2966 parts_sint_to_float(&p
, a
, scale
, status
);
2967 return float16_round_pack_canonical(&p
, status
);
2970 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2972 return int64_to_float16_scalbn(a
, scale
, status
);
2975 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2977 return int64_to_float16_scalbn(a
, scale
, status
);
2980 float16
int64_to_float16(int64_t a
, float_status
*status
)
2982 return int64_to_float16_scalbn(a
, 0, status
);
2985 float16
int32_to_float16(int32_t a
, float_status
*status
)
2987 return int64_to_float16_scalbn(a
, 0, status
);
2990 float16
int16_to_float16(int16_t a
, float_status
*status
)
2992 return int64_to_float16_scalbn(a
, 0, status
);
2995 float16
int8_to_float16(int8_t a
, float_status
*status
)
2997 return int64_to_float16_scalbn(a
, 0, status
);
3000 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
3004 parts64_sint_to_float(&p
, a
, scale
, status
);
3005 return float32_round_pack_canonical(&p
, status
);
3008 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
3010 return int64_to_float32_scalbn(a
, scale
, status
);
3013 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
3015 return int64_to_float32_scalbn(a
, scale
, status
);
3018 float32
int64_to_float32(int64_t a
, float_status
*status
)
3020 return int64_to_float32_scalbn(a
, 0, status
);
3023 float32
int32_to_float32(int32_t a
, float_status
*status
)
3025 return int64_to_float32_scalbn(a
, 0, status
);
3028 float32
int16_to_float32(int16_t a
, float_status
*status
)
3030 return int64_to_float32_scalbn(a
, 0, status
);
3033 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
3037 parts_sint_to_float(&p
, a
, scale
, status
);
3038 return float64_round_pack_canonical(&p
, status
);
3041 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
3043 return int64_to_float64_scalbn(a
, scale
, status
);
3046 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
3048 return int64_to_float64_scalbn(a
, scale
, status
);
3051 float64
int64_to_float64(int64_t a
, float_status
*status
)
3053 return int64_to_float64_scalbn(a
, 0, status
);
3056 float64
int32_to_float64(int32_t a
, float_status
*status
)
3058 return int64_to_float64_scalbn(a
, 0, status
);
3061 float64
int16_to_float64(int16_t a
, float_status
*status
)
3063 return int64_to_float64_scalbn(a
, 0, status
);
3066 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3070 parts_sint_to_float(&p
, a
, scale
, status
);
3071 return bfloat16_round_pack_canonical(&p
, status
);
3074 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3076 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3079 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3081 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3084 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3086 return int64_to_bfloat16_scalbn(a
, 0, status
);
3089 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3091 return int64_to_bfloat16_scalbn(a
, 0, status
);
3094 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3096 return int64_to_bfloat16_scalbn(a
, 0, status
);
3099 float128
int64_to_float128(int64_t a
, float_status
*status
)
3103 parts_sint_to_float(&p
, a
, 0, status
);
3104 return float128_round_pack_canonical(&p
, status
);
3107 float128
int32_to_float128(int32_t a
, float_status
*status
)
3109 return int64_to_float128(a
, status
);
3113 * Unsigned Integer to floating-point conversions
3116 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3120 parts_uint_to_float(&p
, a
, scale
, status
);
3121 return float16_round_pack_canonical(&p
, status
);
3124 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3126 return uint64_to_float16_scalbn(a
, scale
, status
);
3129 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3131 return uint64_to_float16_scalbn(a
, scale
, status
);
3134 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3136 return uint64_to_float16_scalbn(a
, 0, status
);
3139 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3141 return uint64_to_float16_scalbn(a
, 0, status
);
3144 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3146 return uint64_to_float16_scalbn(a
, 0, status
);
3149 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3151 return uint64_to_float16_scalbn(a
, 0, status
);
3154 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3158 parts_uint_to_float(&p
, a
, scale
, status
);
3159 return float32_round_pack_canonical(&p
, status
);
3162 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3164 return uint64_to_float32_scalbn(a
, scale
, status
);
3167 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3169 return uint64_to_float32_scalbn(a
, scale
, status
);
3172 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3174 return uint64_to_float32_scalbn(a
, 0, status
);
3177 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3179 return uint64_to_float32_scalbn(a
, 0, status
);
3182 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3184 return uint64_to_float32_scalbn(a
, 0, status
);
3187 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3191 parts_uint_to_float(&p
, a
, scale
, status
);
3192 return float64_round_pack_canonical(&p
, status
);
3195 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3197 return uint64_to_float64_scalbn(a
, scale
, status
);
3200 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3202 return uint64_to_float64_scalbn(a
, scale
, status
);
3205 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3207 return uint64_to_float64_scalbn(a
, 0, status
);
3210 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3212 return uint64_to_float64_scalbn(a
, 0, status
);
3215 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3217 return uint64_to_float64_scalbn(a
, 0, status
);
3220 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3224 parts_uint_to_float(&p
, a
, scale
, status
);
3225 return bfloat16_round_pack_canonical(&p
, status
);
3228 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3230 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3233 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3235 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3238 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3240 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3243 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3245 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3248 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3250 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3253 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3257 parts_uint_to_float(&p
, a
, 0, status
);
3258 return float128_round_pack_canonical(&p
, status
);
3262 /* min() and max() functions. These can't be implemented as
3263 * 'compare and pick one input' because that would mishandle
3264 * NaNs and +0 vs -0.
3266 * minnum() and maxnum() functions. These are similar to the min()
3267 * and max() functions but if one of the arguments is a QNaN and
3268 * the other is numerical then the numerical argument is returned.
3269 * SNaNs will get quietened before being returned.
3270 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3271 * and maxNum() operations. min() and max() are the typical min/max
3272 * semantics provided by many CPUs which predate that specification.
3274  * minnummag() and maxnummag() functions correspond to minNumMag()
3275  * and maxNumMag() from the IEEE-754 2008.
3277 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3278 bool ieee
, bool ismag
, float_status
*s
)
3280 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3282 /* Takes two floating-point values `a' and `b', one of
3283 * which is a NaN, and returns the appropriate NaN
3284 * result. If either `a' or `b' is a signaling NaN,
3285 * the invalid exception is raised.
3287 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3288 return *parts_pick_nan(&a
, &b
, s
);
3289 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3291 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3295 return *parts_pick_nan(&a
, &b
, s
);
3300 case float_class_normal
:
3303 case float_class_inf
:
3306 case float_class_zero
:
3310 g_assert_not_reached();
3314 case float_class_normal
:
3317 case float_class_inf
:
3320 case float_class_zero
:
3324 g_assert_not_reached();
3328 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3329 bool a_less
= a_exp
< b_exp
;
3330 if (a_exp
== b_exp
) {
3331 a_less
= a
.frac
< b
.frac
;
3333 return a_less
^ ismin
? b
: a
;
3336 if (a
.sign
== b
.sign
) {
3337 bool a_less
= a_exp
< b_exp
;
3338 if (a_exp
== b_exp
) {
3339 a_less
= a
.frac
< b
.frac
;
3341 return a
.sign
^ a_less
^ ismin
? b
: a
;
3343 return a
.sign
^ ismin
? b
: a
;
3348 #define MINMAX(sz, name, ismin, isiee, ismag) \
3349 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3352 FloatParts64 pa, pb, pr; \
3353 float ## sz ## _unpack_canonical(&pa, a, s); \
3354 float ## sz ## _unpack_canonical(&pb, b, s); \
3355 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3356 return float ## sz ## _round_pack_canonical(&pr, s); \
3359 MINMAX(16, min
, true, false, false)
3360 MINMAX(16, minnum
, true, true, false)
3361 MINMAX(16, minnummag
, true, true, true)
3362 MINMAX(16, max
, false, false, false)
3363 MINMAX(16, maxnum
, false, true, false)
3364 MINMAX(16, maxnummag
, false, true, true)
3366 MINMAX(32, min
, true, false, false)
3367 MINMAX(32, minnum
, true, true, false)
3368 MINMAX(32, minnummag
, true, true, true)
3369 MINMAX(32, max
, false, false, false)
3370 MINMAX(32, maxnum
, false, true, false)
3371 MINMAX(32, maxnummag
, false, true, true)
3373 MINMAX(64, min
, true, false, false)
3374 MINMAX(64, minnum
, true, true, false)
3375 MINMAX(64, minnummag
, true, true, true)
3376 MINMAX(64, max
, false, false, false)
3377 MINMAX(64, maxnum
, false, true, false)
3378 MINMAX(64, maxnummag
, false, true, true)
3382 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3383 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3385 FloatParts64 pa, pb, pr; \
3386 bfloat16_unpack_canonical(&pa, a, s); \
3387 bfloat16_unpack_canonical(&pb, b, s); \
3388 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3389 return bfloat16_round_pack_canonical(&pr, s); \
3392 BF16_MINMAX(min
, true, false, false)
3393 BF16_MINMAX(minnum
, true, true, false)
3394 BF16_MINMAX(minnummag
, true, true, true)
3395 BF16_MINMAX(max
, false, false, false)
3396 BF16_MINMAX(maxnum
, false, true, false)
3397 BF16_MINMAX(maxnummag
, false, true, true)
3401 /* Floating point compare */
3402 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3405 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3407 a
.cls
== float_class_snan
||
3408 b
.cls
== float_class_snan
) {
3409 float_raise(float_flag_invalid
, s
);
3411 return float_relation_unordered
;
3414 if (a
.cls
== float_class_zero
) {
3415 if (b
.cls
== float_class_zero
) {
3416 return float_relation_equal
;
3418 return b
.sign
? float_relation_greater
: float_relation_less
;
3419 } else if (b
.cls
== float_class_zero
) {
3420 return a
.sign
? float_relation_less
: float_relation_greater
;
3423 /* The only really important thing about infinity is its sign. If
3424 * both are infinities the sign marks the smallest of the two.
3426 if (a
.cls
== float_class_inf
) {
3427 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3428 return float_relation_equal
;
3430 return a
.sign
? float_relation_less
: float_relation_greater
;
3431 } else if (b
.cls
== float_class_inf
) {
3432 return b
.sign
? float_relation_greater
: float_relation_less
;
3435 if (a
.sign
!= b
.sign
) {
3436 return a
.sign
? float_relation_less
: float_relation_greater
;
3439 if (a
.exp
== b
.exp
) {
3440 if (a
.frac
== b
.frac
) {
3441 return float_relation_equal
;
3444 return a
.frac
> b
.frac
?
3445 float_relation_less
: float_relation_greater
;
3447 return a
.frac
> b
.frac
?
3448 float_relation_greater
: float_relation_less
;
3452 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3454 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3459 #define COMPARE(name, attr, sz) \
3461 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3463 FloatParts64 pa, pb; \
3464 float ## sz ## _unpack_canonical(&pa, a, s); \
3465 float ## sz ## _unpack_canonical(&pb, b, s); \
3466 return compare_floats(pa, pb, is_quiet, s); \
3469 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3470 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3471 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3475 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3477 return soft_f16_compare(a
, b
, false, s
);
3480 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3482 return soft_f16_compare(a
, b
, true, s
);
3485 static FloatRelation QEMU_FLATTEN
3486 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3488 union_float32 ua
, ub
;
3493 if (QEMU_NO_HARDFLOAT
) {
3497 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3498 if (isgreaterequal(ua
.h
, ub
.h
)) {
3499 if (isgreater(ua
.h
, ub
.h
)) {
3500 return float_relation_greater
;
3502 return float_relation_equal
;
3504 if (likely(isless(ua
.h
, ub
.h
))) {
3505 return float_relation_less
;
3507 /* The only condition remaining is unordered.
3508 * Fall through to set flags.
3511 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3514 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3516 return f32_compare(a
, b
, false, s
);
3519 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3521 return f32_compare(a
, b
, true, s
);
3524 static FloatRelation QEMU_FLATTEN
3525 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3527 union_float64 ua
, ub
;
3532 if (QEMU_NO_HARDFLOAT
) {
3536 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3537 if (isgreaterequal(ua
.h
, ub
.h
)) {
3538 if (isgreater(ua
.h
, ub
.h
)) {
3539 return float_relation_greater
;
3541 return float_relation_equal
;
3543 if (likely(isless(ua
.h
, ub
.h
))) {
3544 return float_relation_less
;
3546 /* The only condition remaining is unordered.
3547 * Fall through to set flags.
3550 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3553 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3555 return f64_compare(a
, b
, false, s
);
3558 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3560 return f64_compare(a
, b
, true, s
);
3563 static FloatRelation QEMU_FLATTEN
3564 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3566 FloatParts64 pa
, pb
;
3568 bfloat16_unpack_canonical(&pa
, a
, s
);
3569 bfloat16_unpack_canonical(&pb
, b
, s
);
3570 return compare_floats(pa
, pb
, is_quiet
, s
);
3573 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3575 return soft_bf16_compare(a
, b
, false, s
);
3578 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3580 return soft_bf16_compare(a
, b
, true, s
);
3583 /* Multiply A by 2 raised to the power N. */
3584 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
3586 if (unlikely(is_nan(a
.cls
))) {
3587 parts_return_nan(&a
, s
);
3589 if (a
.cls
== float_class_normal
) {
3590 /* The largest float type (even though not supported by FloatParts64)
3591 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3592 * still allows rounding to infinity, without allowing overflow
3593 * within the int32_t that backs FloatParts64.exp.
3595 n
= MIN(MAX(n
, -0x10000), 0x10000);
3601 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3603 FloatParts64 pa
, pr
;
3605 float16_unpack_canonical(&pa
, a
, status
);
3606 pr
= scalbn_decomposed(pa
, n
, status
);
3607 return float16_round_pack_canonical(&pr
, status
);
3610 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3612 FloatParts64 pa
, pr
;
3614 float32_unpack_canonical(&pa
, a
, status
);
3615 pr
= scalbn_decomposed(pa
, n
, status
);
3616 return float32_round_pack_canonical(&pr
, status
);
3619 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3621 FloatParts64 pa
, pr
;
3623 float64_unpack_canonical(&pa
, a
, status
);
3624 pr
= scalbn_decomposed(pa
, n
, status
);
3625 return float64_round_pack_canonical(&pr
, status
);
3628 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3630 FloatParts64 pa
, pr
;
3632 bfloat16_unpack_canonical(&pa
, a
, status
);
3633 pr
= scalbn_decomposed(pa
, n
, status
);
3634 return bfloat16_round_pack_canonical(&pr
, status
);
3640 * The old softfloat code did an approximation step before zeroing in
3641 * on the final result. However for simpleness we just compute the
3642 * square root by iterating down from the implicit bit to enough extra
3643 * bits to ensure we get a correctly rounded result.
3645 * This does mean however the calculation is slower than before,
3646 * especially for 64 bit floats.
3649 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3651 uint64_t a_frac
, r_frac
, s_frac
;
3654 if (is_nan(a
.cls
)) {
3655 parts_return_nan(&a
, s
);
3658 if (a
.cls
== float_class_zero
) {
3659 return a
; /* sqrt(+-0) = +-0 */
3662 float_raise(float_flag_invalid
, s
);
3663 parts_default_nan(&a
, s
);
3666 if (a
.cls
== float_class_inf
) {
3667 return a
; /* sqrt(+inf) = +inf */
3670 assert(a
.cls
== float_class_normal
);
3672 /* We need two overflow bits at the top. Adding room for that is a
3673 * right shift. If the exponent is odd, we can discard the low bit
3674 * by multiplying the fraction by 2; that's a left shift. Combine
3675 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3677 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3680 /* Bit-by-bit computation of sqrt. */
3684 /* Iterate from implicit bit down to the 3 extra bits to compute a
3685 * properly rounded result. Remember we've inserted two more bits
3686 * at the top, so these positions are two less.
3688 bit
= DECOMPOSED_BINARY_POINT
- 2;
3689 last_bit
= MAX(p
->frac_shift
- 4, 0);
3691 uint64_t q
= 1ULL << bit
;
3692 uint64_t t_frac
= s_frac
+ q
;
3693 if (t_frac
<= a_frac
) {
3694 s_frac
= t_frac
+ q
;
3699 } while (--bit
>= last_bit
);
3701 /* Undo the right shift done above. If there is any remaining
3702 * fraction, the result is inexact. Set the sticky bit.
3704 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3709 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3711 FloatParts64 pa
, pr
;
3713 float16_unpack_canonical(&pa
, a
, status
);
3714 pr
= sqrt_float(pa
, status
, &float16_params
);
3715 return float16_round_pack_canonical(&pr
, status
);
3718 static float32 QEMU_SOFTFLOAT_ATTR
3719 soft_f32_sqrt(float32 a
, float_status
*status
)
3721 FloatParts64 pa
, pr
;
3723 float32_unpack_canonical(&pa
, a
, status
);
3724 pr
= sqrt_float(pa
, status
, &float32_params
);
3725 return float32_round_pack_canonical(&pr
, status
);
3728 static float64 QEMU_SOFTFLOAT_ATTR
3729 soft_f64_sqrt(float64 a
, float_status
*status
)
3731 FloatParts64 pa
, pr
;
3733 float64_unpack_canonical(&pa
, a
, status
);
3734 pr
= sqrt_float(pa
, status
, &float64_params
);
3735 return float64_round_pack_canonical(&pr
, status
);
3738 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3740 union_float32 ua
, ur
;
3743 if (unlikely(!can_use_fpu(s
))) {
3747 float32_input_flush1(&ua
.s
, s
);
3748 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3749 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3750 fpclassify(ua
.h
) == FP_ZERO
) ||
3754 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3755 float32_is_neg(ua
.s
))) {
3762 return soft_f32_sqrt(ua
.s
, s
);
3765 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3767 union_float64 ua
, ur
;
3770 if (unlikely(!can_use_fpu(s
))) {
3774 float64_input_flush1(&ua
.s
, s
);
3775 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3776 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3777 fpclassify(ua
.h
) == FP_ZERO
) ||
3781 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3782 float64_is_neg(ua
.s
))) {
3789 return soft_f64_sqrt(ua
.s
, s
);
3792 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3794 FloatParts64 pa
, pr
;
3796 bfloat16_unpack_canonical(&pa
, a
, status
);
3797 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3798 return bfloat16_round_pack_canonical(&pr
, status
);
3801 /*----------------------------------------------------------------------------
3802 | The pattern for a default generated NaN.
3803 *----------------------------------------------------------------------------*/
3805 float16
float16_default_nan(float_status
*status
)
3809 parts_default_nan(&p
, status
);
3810 p
.frac
>>= float16_params
.frac_shift
;
3811 return float16_pack_raw(&p
);
3814 float32
float32_default_nan(float_status
*status
)
3818 parts_default_nan(&p
, status
);
3819 p
.frac
>>= float32_params
.frac_shift
;
3820 return float32_pack_raw(&p
);
3823 float64
float64_default_nan(float_status
*status
)
3827 parts_default_nan(&p
, status
);
3828 p
.frac
>>= float64_params
.frac_shift
;
3829 return float64_pack_raw(&p
);
3832 float128
float128_default_nan(float_status
*status
)
3836 parts_default_nan(&p
, status
);
3837 frac_shr(&p
, float128_params
.frac_shift
);
3838 return float128_pack_raw(&p
);
3841 bfloat16
bfloat16_default_nan(float_status
*status
)
3845 parts_default_nan(&p
, status
);
3846 p
.frac
>>= bfloat16_params
.frac_shift
;
3847 return bfloat16_pack_raw(&p
);
3850 /*----------------------------------------------------------------------------
3851 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3852 *----------------------------------------------------------------------------*/
3854 float16
float16_silence_nan(float16 a
, float_status
*status
)
3858 float16_unpack_raw(&p
, a
);
3859 p
.frac
<<= float16_params
.frac_shift
;
3860 parts_silence_nan(&p
, status
);
3861 p
.frac
>>= float16_params
.frac_shift
;
3862 return float16_pack_raw(&p
);
3865 float32
float32_silence_nan(float32 a
, float_status
*status
)
3869 float32_unpack_raw(&p
, a
);
3870 p
.frac
<<= float32_params
.frac_shift
;
3871 parts_silence_nan(&p
, status
);
3872 p
.frac
>>= float32_params
.frac_shift
;
3873 return float32_pack_raw(&p
);
3876 float64
float64_silence_nan(float64 a
, float_status
*status
)
3880 float64_unpack_raw(&p
, a
);
3881 p
.frac
<<= float64_params
.frac_shift
;
3882 parts_silence_nan(&p
, status
);
3883 p
.frac
>>= float64_params
.frac_shift
;
3884 return float64_pack_raw(&p
);
3887 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3891 bfloat16_unpack_raw(&p
, a
);
3892 p
.frac
<<= bfloat16_params
.frac_shift
;
3893 parts_silence_nan(&p
, status
);
3894 p
.frac
>>= bfloat16_params
.frac_shift
;
3895 return bfloat16_pack_raw(&p
);
3898 float128
float128_silence_nan(float128 a
, float_status
*status
)
3902 float128_unpack_raw(&p
, a
);
3903 frac_shl(&p
, float128_params
.frac_shift
);
3904 parts_silence_nan(&p
, status
);
3905 frac_shr(&p
, float128_params
.frac_shift
);
3906 return float128_pack_raw(&p
);
3909 /*----------------------------------------------------------------------------
3910 | If `a' is denormal and we are in flush-to-zero mode then set the
3911 | input-denormal exception and return zero. Otherwise just return the value.
3912 *----------------------------------------------------------------------------*/
3914 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3916 if (p
.exp
== 0 && p
.frac
!= 0) {
3917 float_raise(float_flag_input_denormal
, status
);
3924 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3926 if (status
->flush_inputs_to_zero
) {
3929 float16_unpack_raw(&p
, a
);
3930 if (parts_squash_denormal(p
, status
)) {
3931 return float16_set_sign(float16_zero
, p
.sign
);
3937 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3939 if (status
->flush_inputs_to_zero
) {
3942 float32_unpack_raw(&p
, a
);
3943 if (parts_squash_denormal(p
, status
)) {
3944 return float32_set_sign(float32_zero
, p
.sign
);
3950 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3952 if (status
->flush_inputs_to_zero
) {
3955 float64_unpack_raw(&p
, a
);
3956 if (parts_squash_denormal(p
, status
)) {
3957 return float64_set_sign(float64_zero
, p
.sign
);
3963 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3965 if (status
->flush_inputs_to_zero
) {
3968 bfloat16_unpack_raw(&p
, a
);
3969 if (parts_squash_denormal(p
, status
)) {
3970 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3976 /*----------------------------------------------------------------------------
3977 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3978 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3979 | input. If `zSign' is 1, the input is negated before being converted to an
3980 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3981 | is simply rounded to an integer, with the inexact exception raised if the
3982 | input cannot be represented exactly as an integer. However, if the fixed-
3983 | point input is too large, the invalid exception is raised and the largest
3984 | positive or negative integer is returned.
3985 *----------------------------------------------------------------------------*/
3987 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3988 float_status
*status
)
3990 int8_t roundingMode
;
3991 bool roundNearestEven
;
3992 int8_t roundIncrement
, roundBits
;
3995 roundingMode
= status
->float_rounding_mode
;
3996 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3997 switch (roundingMode
) {
3998 case float_round_nearest_even
:
3999 case float_round_ties_away
:
4000 roundIncrement
= 0x40;
4002 case float_round_to_zero
:
4005 case float_round_up
:
4006 roundIncrement
= zSign
? 0 : 0x7f;
4008 case float_round_down
:
4009 roundIncrement
= zSign
? 0x7f : 0;
4011 case float_round_to_odd
:
4012 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
4017 roundBits
= absZ
& 0x7F;
4018 absZ
= ( absZ
+ roundIncrement
)>>7;
4019 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4023 if ( zSign
) z
= - z
;
4024 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
4025 float_raise(float_flag_invalid
, status
);
4026 return zSign
? INT32_MIN
: INT32_MAX
;
4029 float_raise(float_flag_inexact
, status
);
4035 /*----------------------------------------------------------------------------
4036 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4037 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4038 | and returns the properly rounded 64-bit integer corresponding to the input.
4039 | If `zSign' is 1, the input is negated before being converted to an integer.
4040 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4041 | the inexact exception raised if the input cannot be represented exactly as
4042 | an integer. However, if the fixed-point input is too large, the invalid
4043 | exception is raised and the largest positive or negative integer is
4045 *----------------------------------------------------------------------------*/
4047 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
4048 float_status
*status
)
4050 int8_t roundingMode
;
4051 bool roundNearestEven
, increment
;
4054 roundingMode
= status
->float_rounding_mode
;
4055 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4056 switch (roundingMode
) {
4057 case float_round_nearest_even
:
4058 case float_round_ties_away
:
4059 increment
= ((int64_t) absZ1
< 0);
4061 case float_round_to_zero
:
4064 case float_round_up
:
4065 increment
= !zSign
&& absZ1
;
4067 case float_round_down
:
4068 increment
= zSign
&& absZ1
;
4070 case float_round_to_odd
:
4071 increment
= !(absZ0
& 1) && absZ1
;
4078 if ( absZ0
== 0 ) goto overflow
;
4079 if (!(absZ1
<< 1) && roundNearestEven
) {
4084 if ( zSign
) z
= - z
;
4085 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
4087 float_raise(float_flag_invalid
, status
);
4088 return zSign
? INT64_MIN
: INT64_MAX
;
4091 float_raise(float_flag_inexact
, status
);
4097 /*----------------------------------------------------------------------------
4098 | Normalizes the subnormal single-precision floating-point value represented
4099 | by the denormalized significand `aSig'. The normalized exponent and
4100 | significand are stored at the locations pointed to by `zExpPtr' and
4101 | `zSigPtr', respectively.
4102 *----------------------------------------------------------------------------*/
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 32 bits - 24-bit significand; shift until the MSB of the
     * significand lands in bit 23. */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4115 /*----------------------------------------------------------------------------
4116 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4117 | and significand `zSig', and returns the proper single-precision floating-
4118 | point value corresponding to the abstract input. Ordinarily, the abstract
4119 | value is simply rounded and packed into the single-precision format, with
4120 | the inexact exception raised if the abstract input cannot be represented
4121 | exactly. However, if the abstract value is too large, the overflow and
4122 | inexact exceptions are raised and an infinity or maximal finite value is
4123 | returned. If the abstract value is too small, the input value is rounded to
4124 | a subnormal number, and the underflow and inexact exceptions are raised if
4125 | the abstract input cannot be represented exactly as a subnormal single-
4126 | precision floating-point number.
4127 | The input significand `zSig' has its binary point between bits 30
4128 | and 29, which is 7 bits to the left of the usual location. This shifted
4129 | significand must be normalized or smaller. If `zSig' is not normalized,
4130 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4131 | and it must not require rounding. In the usual case that `zSig' is
4132 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4133 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4134 | Binary Floating-Point Arithmetic.
4135 *----------------------------------------------------------------------------*/
4137 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4138 float_status
*status
)
4140 int8_t roundingMode
;
4141 bool roundNearestEven
;
4142 int8_t roundIncrement
, roundBits
;
4145 roundingMode
= status
->float_rounding_mode
;
4146 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4147 switch (roundingMode
) {
4148 case float_round_nearest_even
:
4149 case float_round_ties_away
:
4150 roundIncrement
= 0x40;
4152 case float_round_to_zero
:
4155 case float_round_up
:
4156 roundIncrement
= zSign
? 0 : 0x7f;
4158 case float_round_down
:
4159 roundIncrement
= zSign
? 0x7f : 0;
4161 case float_round_to_odd
:
4162 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4168 roundBits
= zSig
& 0x7F;
4169 if ( 0xFD <= (uint16_t) zExp
) {
4170 if ( ( 0xFD < zExp
)
4171 || ( ( zExp
== 0xFD )
4172 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4174 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4175 roundIncrement
!= 0;
4176 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4177 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4180 if (status
->flush_to_zero
) {
4181 float_raise(float_flag_output_denormal
, status
);
4182 return packFloat32(zSign
, 0, 0);
4184 isTiny
= status
->tininess_before_rounding
4186 || (zSig
+ roundIncrement
< 0x80000000);
4187 shift32RightJamming( zSig
, - zExp
, &zSig
);
4189 roundBits
= zSig
& 0x7F;
4190 if (isTiny
&& roundBits
) {
4191 float_raise(float_flag_underflow
, status
);
4193 if (roundingMode
== float_round_to_odd
) {
4195 * For round-to-odd case, the roundIncrement depends on
4196 * zSig which just changed.
4198 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4203 float_raise(float_flag_inexact
, status
);
4205 zSig
= ( zSig
+ roundIncrement
)>>7;
4206 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4209 if ( zSig
== 0 ) zExp
= 0;
4210 return packFloat32( zSign
, zExp
, zSig
);
4214 /*----------------------------------------------------------------------------
4215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4216 | and significand `zSig', and returns the proper single-precision floating-
4217 | point value corresponding to the abstract input. This routine is just like
4218 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4219 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4220 | floating-point exponent.
4221 *----------------------------------------------------------------------------*/
4224 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4225 float_status
*status
)
4229 shiftCount
= clz32(zSig
) - 1;
4230 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4235 /*----------------------------------------------------------------------------
4236 | Normalizes the subnormal double-precision floating-point value represented
4237 | by the denormalized significand `aSig'. The normalized exponent and
4238 | significand are stored at the locations pointed to by `zExpPtr' and
4239 | `zSigPtr', respectively.
4240 *----------------------------------------------------------------------------*/
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 64 bits - 53-bit significand; shift until the MSB of the
     * significand lands in bit 52. */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4253 /*----------------------------------------------------------------------------
4254 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4255 | double-precision floating-point value, returning the result. After being
4256 | shifted into the proper positions, the three fields are simply added
4257 | together to form the result. This means that any integer portion of `zSig'
4258 | will be added into the exponent. Since a properly normalized significand
4259 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4260 | than the desired result exponent whenever `zSig' is a complete, normalized
4262 *----------------------------------------------------------------------------*/
4264 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4267 return make_float64(
4268 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4272 /*----------------------------------------------------------------------------
4273 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4274 | and significand `zSig', and returns the proper double-precision floating-
4275 | point value corresponding to the abstract input. Ordinarily, the abstract
4276 | value is simply rounded and packed into the double-precision format, with
4277 | the inexact exception raised if the abstract input cannot be represented
4278 | exactly. However, if the abstract value is too large, the overflow and
4279 | inexact exceptions are raised and an infinity or maximal finite value is
4280 | returned. If the abstract value is too small, the input value is rounded to
4281 | a subnormal number, and the underflow and inexact exceptions are raised if
4282 | the abstract input cannot be represented exactly as a subnormal double-
4283 | precision floating-point number.
4284 | The input significand `zSig' has its binary point between bits 62
4285 | and 61, which is 10 bits to the left of the usual location. This shifted
4286 | significand must be normalized or smaller. If `zSig' is not normalized,
4287 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4288 | and it must not require rounding. In the usual case that `zSig' is
4289 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4290 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4291 | Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4294 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4295 float_status
*status
)
4297 int8_t roundingMode
;
4298 bool roundNearestEven
;
4299 int roundIncrement
, roundBits
;
4302 roundingMode
= status
->float_rounding_mode
;
4303 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4304 switch (roundingMode
) {
4305 case float_round_nearest_even
:
4306 case float_round_ties_away
:
4307 roundIncrement
= 0x200;
4309 case float_round_to_zero
:
4312 case float_round_up
:
4313 roundIncrement
= zSign
? 0 : 0x3ff;
4315 case float_round_down
:
4316 roundIncrement
= zSign
? 0x3ff : 0;
4318 case float_round_to_odd
:
4319 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4324 roundBits
= zSig
& 0x3FF;
4325 if ( 0x7FD <= (uint16_t) zExp
) {
4326 if ( ( 0x7FD < zExp
)
4327 || ( ( zExp
== 0x7FD )
4328 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4330 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4331 roundIncrement
!= 0;
4332 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4333 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4336 if (status
->flush_to_zero
) {
4337 float_raise(float_flag_output_denormal
, status
);
4338 return packFloat64(zSign
, 0, 0);
4340 isTiny
= status
->tininess_before_rounding
4342 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4343 shift64RightJamming( zSig
, - zExp
, &zSig
);
4345 roundBits
= zSig
& 0x3FF;
4346 if (isTiny
&& roundBits
) {
4347 float_raise(float_flag_underflow
, status
);
4349 if (roundingMode
== float_round_to_odd
) {
4351 * For round-to-odd case, the roundIncrement depends on
4352 * zSig which just changed.
4354 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4359 float_raise(float_flag_inexact
, status
);
4361 zSig
= ( zSig
+ roundIncrement
)>>10;
4362 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4365 if ( zSig
== 0 ) zExp
= 0;
4366 return packFloat64( zSign
, zExp
, zSig
);
4370 /*----------------------------------------------------------------------------
4371 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4372 | and significand `zSig', and returns the proper double-precision floating-
4373 | point value corresponding to the abstract input. This routine is just like
4374 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4375 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4376 | floating-point exponent.
4377 *----------------------------------------------------------------------------*/
4380 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4381 float_status
*status
)
4385 shiftCount
= clz64(zSig
) - 1;
4386 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4391 /*----------------------------------------------------------------------------
4392 | Normalizes the subnormal extended double-precision floating-point value
4393 | represented by the denormalized significand `aSig'. The normalized exponent
4394 | and significand are stored at the locations pointed to by `zExpPtr' and
4395 | `zSigPtr', respectively.
4396 *----------------------------------------------------------------------------*/
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* The x80 significand has an explicit integer bit, so normalize the
     * MSB all the way to bit 63. */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4408 /*----------------------------------------------------------------------------
4409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4410 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4411 | and returns the proper extended double-precision floating-point value
4412 | corresponding to the abstract input. Ordinarily, the abstract value is
4413 | rounded and packed into the extended double-precision format, with the
4414 | inexact exception raised if the abstract input cannot be represented
4415 | exactly. However, if the abstract value is too large, the overflow and
4416 | inexact exceptions are raised and an infinity or maximal finite value is
4417 | returned. If the abstract value is too small, the input value is rounded to
4418 | a subnormal number, and the underflow and inexact exceptions are raised if
4419 | the abstract input cannot be represented exactly as a subnormal extended
4420 | double-precision floating-point number.
4421 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4422 | number of bits as single or double precision, respectively. Otherwise, the
4423 | result is rounded to the full precision of the extended double-precision
4425 | The input significand must be normalized or smaller. If the input
4426 | significand is not normalized, `zExp' must be 0; in that case, the result
4427 | returned is a subnormal number, and it must not require rounding. The
4428 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4432 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4433 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4434 float_status
*status
)
4436 int8_t roundingMode
;
4437 bool roundNearestEven
, increment
, isTiny
;
4438 int64_t roundIncrement
, roundMask
, roundBits
;
4440 roundingMode
= status
->float_rounding_mode
;
4441 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4442 if ( roundingPrecision
== 80 ) goto precision80
;
4443 if ( roundingPrecision
== 64 ) {
4444 roundIncrement
= UINT64_C(0x0000000000000400);
4445 roundMask
= UINT64_C(0x00000000000007FF);
4447 else if ( roundingPrecision
== 32 ) {
4448 roundIncrement
= UINT64_C(0x0000008000000000);
4449 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4454 zSig0
|= ( zSig1
!= 0 );
4455 switch (roundingMode
) {
4456 case float_round_nearest_even
:
4457 case float_round_ties_away
:
4459 case float_round_to_zero
:
4462 case float_round_up
:
4463 roundIncrement
= zSign
? 0 : roundMask
;
4465 case float_round_down
:
4466 roundIncrement
= zSign
? roundMask
: 0;
4471 roundBits
= zSig0
& roundMask
;
4472 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4473 if ( ( 0x7FFE < zExp
)
4474 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4479 if (status
->flush_to_zero
) {
4480 float_raise(float_flag_output_denormal
, status
);
4481 return packFloatx80(zSign
, 0, 0);
4483 isTiny
= status
->tininess_before_rounding
4485 || (zSig0
<= zSig0
+ roundIncrement
);
4486 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4488 roundBits
= zSig0
& roundMask
;
4489 if (isTiny
&& roundBits
) {
4490 float_raise(float_flag_underflow
, status
);
4493 float_raise(float_flag_inexact
, status
);
4495 zSig0
+= roundIncrement
;
4496 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4497 roundIncrement
= roundMask
+ 1;
4498 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4499 roundMask
|= roundIncrement
;
4501 zSig0
&= ~ roundMask
;
4502 return packFloatx80( zSign
, zExp
, zSig0
);
4506 float_raise(float_flag_inexact
, status
);
4508 zSig0
+= roundIncrement
;
4509 if ( zSig0
< roundIncrement
) {
4511 zSig0
= UINT64_C(0x8000000000000000);
4513 roundIncrement
= roundMask
+ 1;
4514 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4515 roundMask
|= roundIncrement
;
4517 zSig0
&= ~ roundMask
;
4518 if ( zSig0
== 0 ) zExp
= 0;
4519 return packFloatx80( zSign
, zExp
, zSig0
);
4521 switch (roundingMode
) {
4522 case float_round_nearest_even
:
4523 case float_round_ties_away
:
4524 increment
= ((int64_t)zSig1
< 0);
4526 case float_round_to_zero
:
4529 case float_round_up
:
4530 increment
= !zSign
&& zSig1
;
4532 case float_round_down
:
4533 increment
= zSign
&& zSig1
;
4538 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4539 if ( ( 0x7FFE < zExp
)
4540 || ( ( zExp
== 0x7FFE )
4541 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4547 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4548 if ( ( roundingMode
== float_round_to_zero
)
4549 || ( zSign
&& ( roundingMode
== float_round_up
) )
4550 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4552 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4554 return packFloatx80(zSign
,
4555 floatx80_infinity_high
,
4556 floatx80_infinity_low
);
4559 isTiny
= status
->tininess_before_rounding
4562 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4563 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4565 if (isTiny
&& zSig1
) {
4566 float_raise(float_flag_underflow
, status
);
4569 float_raise(float_flag_inexact
, status
);
4571 switch (roundingMode
) {
4572 case float_round_nearest_even
:
4573 case float_round_ties_away
:
4574 increment
= ((int64_t)zSig1
< 0);
4576 case float_round_to_zero
:
4579 case float_round_up
:
4580 increment
= !zSign
&& zSig1
;
4582 case float_round_down
:
4583 increment
= zSign
&& zSig1
;
4590 if (!(zSig1
<< 1) && roundNearestEven
) {
4593 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4595 return packFloatx80( zSign
, zExp
, zSig0
);
4599 float_raise(float_flag_inexact
, status
);
4605 zSig0
= UINT64_C(0x8000000000000000);
4608 if (!(zSig1
<< 1) && roundNearestEven
) {
4614 if ( zSig0
== 0 ) zExp
= 0;
4616 return packFloatx80( zSign
, zExp
, zSig0
);
4620 /*----------------------------------------------------------------------------
4621 | Takes an abstract floating-point value having sign `zSign', exponent
4622 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4623 | and returns the proper extended double-precision floating-point value
4624 | corresponding to the abstract input. This routine is just like
4625 | `roundAndPackFloatx80' except that the input significand does not have to be
4627 *----------------------------------------------------------------------------*/
4629 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4630 bool zSign
, int32_t zExp
,
4631 uint64_t zSig0
, uint64_t zSig1
,
4632 float_status
*status
)
4641 shiftCount
= clz64(zSig0
);
4642 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4644 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4645 zSig0
, zSig1
, status
);
4649 /*----------------------------------------------------------------------------
4650 | Returns the least-significant 64 fraction bits of the quadruple-precision
4651 | floating-point value `a'.
4652 *----------------------------------------------------------------------------*/
4654 static inline uint64_t extractFloat128Frac1( float128 a
)
4661 /*----------------------------------------------------------------------------
4662 | Returns the most-significant 48 fraction bits of the quadruple-precision
4663 | floating-point value `a'.
4664 *----------------------------------------------------------------------------*/
4666 static inline uint64_t extractFloat128Frac0( float128 a
)
4669 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4673 /*----------------------------------------------------------------------------
4674 | Returns the exponent bits of the quadruple-precision floating-point value
4676 *----------------------------------------------------------------------------*/
4678 static inline int32_t extractFloat128Exp( float128 a
)
4681 return ( a
.high
>>48 ) & 0x7FFF;
4685 /*----------------------------------------------------------------------------
4686 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4687 *----------------------------------------------------------------------------*/
4689 static inline bool extractFloat128Sign(float128 a
)
4691 return a
.high
>> 63;
4694 /*----------------------------------------------------------------------------
4695 | Normalizes the subnormal quadruple-precision floating-point value
4696 | represented by the denormalized significand formed by the concatenation of
4697 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4698 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4699 | significand are stored at the location pointed to by `zSig0Ptr', and the
4700 | least significant 64 bits of the normalized significand are stored at the
4701 | location pointed to by `zSig1Ptr'.
4702 *----------------------------------------------------------------------------*/
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* 15 = 64 - 49 high significand bits. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }
}
4735 /*----------------------------------------------------------------------------
4736 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4737 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4738 | floating-point value, returning the result. After being shifted into the
4739 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4740 | added together to form the most significant 32 bits of the result. This
4741 | means that any integer portion of `zSig0' will be added into the exponent.
4742 | Since a properly normalized significand will have an integer portion equal
4743 | to 1, the `zExp' input should be 1 less than the desired result exponent
4744 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4746 *----------------------------------------------------------------------------*/
4748 static inline float128
4749 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4754 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* `zSig2' holds the bits shifted out below the 113-bit significand;
     * decide whether they force an increment of the retained bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);   /* top discarded bit set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Exponent at or beyond the representable range (the unsigned
         * compare also catches negative zExp, i.e. possible underflow). */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128( UINT64_C(0x0001FFFFFFFFFFFF),
                            UINT64_C(0xFFFFFFFFFFFFFFFF),
                            zSig0, zSig1 )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed rounding toward zero yields the maximal finite
             * value instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* Result is subnormal (or zero after flushing). */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The denormalizing shift changed the discarded bits, so the
             * increment decision must be recomputed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: a tie (discarded bits exactly half) clears the
         * low bit of the result. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/

static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    /* If the high word is empty, promote the low word and adjust the
     * exponent by a full word. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Bring the most significant set bit to position 48 of zSig0
     * (113-bit significand position). */
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int32_to_floatx80(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    /* Left-justify the magnitude in a 64-bit significand; the exponent
     * 0x403E corresponds to a significand integer bit at position 63. */
    shiftCount = clz32(absA) + 32;
    zSig = absA;
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int64_to_floatx80(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    /* Left-justify the magnitude; conversion from int64 is always exact
     * in the 64-bit extended significand. */
    shiftCount = clz64(absA);
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
}
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert via the canonical form, then silence it,
             * since floatx80 always carries a quiet result here. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    aSig |= 0x00800000;  /* make the integer bit explicit */
    /* Rebias from float32 (127) to floatx80 (16383): 0x3F80 = 16383-127,
     * and shift the 24-bit significand up to the top of 64 bits. */
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
}
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special cases: NaN operands propagate; rem(inf, x) and rem(x, 0)
     * are invalid; rem(x, inf) == x. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if ( bSig ) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: a single 32/64-bit division step
         * produces the quotient. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;   /* |a| < |b|/2: a is the remainder */
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively with 64-bit
         * estimated divisions, 62 quotient bits per pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;   /* estimate may be 2 too high */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past zero to find the remainder closest to zero (IEEE rem
     * rounds the quotient to nearest-even). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, as float64 constants. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if ( aSig ) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^(-inf) = 0, 2^(+inf) = +inf */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        if (aSig == 0) return float32_one;   /* 2^(+/-0) = 1, exactly */
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* Compute e^(x*ln2) via the 15-term Taylor series above. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );   /* log2(0) = -inf */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;   /* log2(+inf) = +inf */
    }

    aExp -= 0x7F;              /* unbiased exponent = integer part of log2 */
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    /* Generate fraction bits one at a time: squaring doubles the log,
     * so each squaring shifts one result bit out of the significand. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert via the canonical form and silence it. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias from float64 (1023) to floatx80 (16383): 0x3C00 = 16383-1023;
     * make the integer bit explicit and left-justify the significand. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
}
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* Special cases: NaN operands propagate; rem(inf, x) and rem(x, 0)
     * are invalid; rem(x, inf) == x. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;   /* |a| < |b|/2: a is the remainder */
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce iteratively with estimated 128/64 divisions, 62 quotient
     * bits per pass. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;   /* estimate may be 2 too high */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past zero to find the remainder closest to zero (IEEE rem
     * rounds the quotient to nearest-even). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
}
/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;

    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );   /* log2(0) = -inf */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;   /* log2(+inf) = +inf */
    }

    aExp -= 0x3FF;             /* unbiased exponent = integer part of log2 */
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    /* Generate fraction bits one at a time: squaring doubles the log,
     * so each squaring shifts one result bit out of the significand. */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* NaN converts as if positive so the largest positive int results. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    if ( shiftCount <= 0 ) shiftCount = 1;   /* keep overflow detectable */
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN is treated as positive). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: truncates to 0; inexact if nonzero. */
        if ( aExp || aSig ) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    /* Inexact if any bits were shifted out by the truncation. */
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN,
| the largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* Magnitude >= 2^64: definite overflow (or NaN). */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude >= 2^63: only -2^63 itself converts exactly. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: truncates to 0; inexact if nonzero. */
        if ( aExp | aSig ) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    /* Inexact if any truncated bits were nonzero. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert via the canonical form and silence it. */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand to float32 width, jamming shifted-out
     * bits; 0x3F81 rebias accounts for the explicit integer bit. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert via the canonical form and silence it. */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Narrow the 64-bit significand to float64 width, jamming the
     * shifted-out bit; 0x3C01 rebias accounts for the integer bit. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* NaN: convert via the canonical form and silence it. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Widening is exact: drop the explicit integer bit (aSig<<1) and
     * position the remaining 63 fraction bits in the 112-bit field. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );
}
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    /* Re-round the unpacked fields at the currently selected significand
     * precision (the extra-bits argument is 0: the input is exact). */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended quadruple-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Already integral (magnitude >= 2^63), or NaN/infinity. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: result is 0 or +/-1 depending on rounding mode. */
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Rounds to 1 only when strictly above one half. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: mask off the fraction bits below the units place. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* On a tie, clear the low result bit to round to even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* Significand carried out: bump the exponent and restore the
         * explicit integer bit. */
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* |a| has the larger exponent: align b's significand to it. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;   /* subnormal exponent reads as one too low */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* |b| has the larger exponent: align a's significand to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* If the top bit is set there was no carry and the sum is already
     * normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the significand: shift right one and bump the
     * exponent, re-asserting the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf - inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero is negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;   /* subnormal exponent reads as one too low */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* |b| > |a| flips the result sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5957 /*----------------------------------------------------------------------------
5958 | Returns the result of adding the extended double-precision floating-point
5959 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5960 | Standard for Binary Floating-Point Arithmetic.
5961 *----------------------------------------------------------------------------*/
5963 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5967 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5968 float_raise(float_flag_invalid
, status
);
5969 return floatx80_default_nan(status
);
5971 aSign
= extractFloatx80Sign( a
);
5972 bSign
= extractFloatx80Sign( b
);
5973 if ( aSign
== bSign
) {
5974 return addFloatx80Sigs(a
, b
, aSign
, status
);
5977 return subFloatx80Sigs(a
, b
, aSign
, status
);
5982 /*----------------------------------------------------------------------------
5983 | Returns the result of subtracting the extended double-precision floating-
5984 | point values `a' and `b'. The operation is performed according to the
5985 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5986 *----------------------------------------------------------------------------*/
5988 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5992 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5993 float_raise(float_flag_invalid
, status
);
5994 return floatx80_default_nan(status
);
5996 aSign
= extractFloatx80Sign( a
);
5997 bSign
= extractFloatx80Sign( b
);
5998 if ( aSign
== bSign
) {
5999 return subFloatx80Sigs(a
, b
, aSign
, status
);
6002 return addFloatx80Sigs(a
, b
, aSign
, status
);
6007 /*----------------------------------------------------------------------------
6008 | Returns the result of multiplying the extended double-precision floating-
6009 | point values `a' and `b'. The operation is performed according to the
6010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6011 *----------------------------------------------------------------------------*/
6013 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6015 bool aSign
, bSign
, zSign
;
6016 int32_t aExp
, bExp
, zExp
;
6017 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6019 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6020 float_raise(float_flag_invalid
, status
);
6021 return floatx80_default_nan(status
);
6023 aSig
= extractFloatx80Frac( a
);
6024 aExp
= extractFloatx80Exp( a
);
6025 aSign
= extractFloatx80Sign( a
);
6026 bSig
= extractFloatx80Frac( b
);
6027 bExp
= extractFloatx80Exp( b
);
6028 bSign
= extractFloatx80Sign( b
);
6029 zSign
= aSign
^ bSign
;
6030 if ( aExp
== 0x7FFF ) {
6031 if ( (uint64_t) ( aSig
<<1 )
6032 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6033 return propagateFloatx80NaN(a
, b
, status
);
6035 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6036 return packFloatx80(zSign
, floatx80_infinity_high
,
6037 floatx80_infinity_low
);
6039 if ( bExp
== 0x7FFF ) {
6040 if ((uint64_t)(bSig
<< 1)) {
6041 return propagateFloatx80NaN(a
, b
, status
);
6043 if ( ( aExp
| aSig
) == 0 ) {
6045 float_raise(float_flag_invalid
, status
);
6046 return floatx80_default_nan(status
);
6048 return packFloatx80(zSign
, floatx80_infinity_high
,
6049 floatx80_infinity_low
);
6052 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6053 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6056 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6057 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6059 zExp
= aExp
+ bExp
- 0x3FFE;
6060 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6061 if ( 0 < (int64_t) zSig0
) {
6062 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6065 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6066 zSign
, zExp
, zSig0
, zSig1
, status
);
6069 /*----------------------------------------------------------------------------
6070 | Returns the result of dividing the extended double-precision floating-point
6071 | value `a' by the corresponding value `b'. The operation is performed
6072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6073 *----------------------------------------------------------------------------*/
6075 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6077 bool aSign
, bSign
, zSign
;
6078 int32_t aExp
, bExp
, zExp
;
6079 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6080 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6082 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6083 float_raise(float_flag_invalid
, status
);
6084 return floatx80_default_nan(status
);
6086 aSig
= extractFloatx80Frac( a
);
6087 aExp
= extractFloatx80Exp( a
);
6088 aSign
= extractFloatx80Sign( a
);
6089 bSig
= extractFloatx80Frac( b
);
6090 bExp
= extractFloatx80Exp( b
);
6091 bSign
= extractFloatx80Sign( b
);
6092 zSign
= aSign
^ bSign
;
6093 if ( aExp
== 0x7FFF ) {
6094 if ((uint64_t)(aSig
<< 1)) {
6095 return propagateFloatx80NaN(a
, b
, status
);
6097 if ( bExp
== 0x7FFF ) {
6098 if ((uint64_t)(bSig
<< 1)) {
6099 return propagateFloatx80NaN(a
, b
, status
);
6103 return packFloatx80(zSign
, floatx80_infinity_high
,
6104 floatx80_infinity_low
);
6106 if ( bExp
== 0x7FFF ) {
6107 if ((uint64_t)(bSig
<< 1)) {
6108 return propagateFloatx80NaN(a
, b
, status
);
6110 return packFloatx80( zSign
, 0, 0 );
6114 if ( ( aExp
| aSig
) == 0 ) {
6116 float_raise(float_flag_invalid
, status
);
6117 return floatx80_default_nan(status
);
6119 float_raise(float_flag_divbyzero
, status
);
6120 return packFloatx80(zSign
, floatx80_infinity_high
,
6121 floatx80_infinity_low
);
6123 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6126 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6127 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6129 zExp
= aExp
- bExp
+ 0x3FFE;
6131 if ( bSig
<= aSig
) {
6132 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6135 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6136 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6137 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6138 while ( (int64_t) rem0
< 0 ) {
6140 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6142 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6143 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6144 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6145 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6146 while ( (int64_t) rem1
< 0 ) {
6148 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6150 zSig1
|= ( ( rem1
| rem2
) != 0 );
6152 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6153 zSign
, zExp
, zSig0
, zSig1
, status
);
6156 /*----------------------------------------------------------------------------
6157 | Returns the remainder of the extended double-precision floating-point value
6158 | `a' with respect to the corresponding value `b'. The operation is performed
6159 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6160 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6161 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6162 | the absolute value of the integer quotient.
6163 *----------------------------------------------------------------------------*/
6165 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6166 float_status
*status
)
6169 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6170 uint64_t aSig0
, aSig1
, bSig
;
6171 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6174 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6175 float_raise(float_flag_invalid
, status
);
6176 return floatx80_default_nan(status
);
6178 aSig0
= extractFloatx80Frac( a
);
6179 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6180 aSign
= extractFloatx80Sign( a
);
6181 bSig
= extractFloatx80Frac( b
);
6182 bExp
= extractFloatx80Exp( b
);
6183 if ( aExp
== 0x7FFF ) {
6184 if ( (uint64_t) ( aSig0
<<1 )
6185 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6186 return propagateFloatx80NaN(a
, b
, status
);
6190 if ( bExp
== 0x7FFF ) {
6191 if ((uint64_t)(bSig
<< 1)) {
6192 return propagateFloatx80NaN(a
, b
, status
);
6194 if (aExp
== 0 && aSig0
>> 63) {
6196 * Pseudo-denormal argument must be returned in normalized
6199 return packFloatx80(aSign
, 1, aSig0
);
6206 float_raise(float_flag_invalid
, status
);
6207 return floatx80_default_nan(status
);
6209 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6212 if ( aSig0
== 0 ) return a
;
6213 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6216 expDiff
= aExp
- bExp
;
6218 if ( expDiff
< 0 ) {
6219 if ( mod
|| expDiff
< -1 ) {
6220 if (aExp
== 1 && aExpOrig
== 0) {
6222 * Pseudo-denormal argument must be returned in
6225 return packFloatx80(aSign
, aExp
, aSig0
);
6229 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6232 *quotient
= q
= ( bSig
<= aSig0
);
6233 if ( q
) aSig0
-= bSig
;
6235 while ( 0 < expDiff
) {
6236 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6237 q
= ( 2 < q
) ? q
- 2 : 0;
6238 mul64To128( bSig
, q
, &term0
, &term1
);
6239 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6240 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6246 if ( 0 < expDiff
) {
6247 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6248 q
= ( 2 < q
) ? q
- 2 : 0;
6250 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6251 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6252 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6253 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6255 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6258 *quotient
<<= expDiff
;
6269 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6270 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6271 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6274 aSig0
= alternateASig0
;
6275 aSig1
= alternateASig1
;
6281 normalizeRoundAndPackFloatx80(
6282 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6286 /*----------------------------------------------------------------------------
6287 | Returns the remainder of the extended double-precision floating-point value
6288 | `a' with respect to the corresponding value `b'. The operation is performed
6289 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6290 *----------------------------------------------------------------------------*/
6292 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6295 return floatx80_modrem(a
, b
, false, "ient
, status
);
6298 /*----------------------------------------------------------------------------
6299 | Returns the remainder of the extended double-precision floating-point value
6300 | `a' with respect to the corresponding value `b', with the quotient truncated
6302 *----------------------------------------------------------------------------*/
6304 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6307 return floatx80_modrem(a
, b
, true, "ient
, status
);
6310 /*----------------------------------------------------------------------------
6311 | Returns the square root of the extended double-precision floating-point
6312 | value `a'. The operation is performed according to the IEC/IEEE Standard
6313 | for Binary Floating-Point Arithmetic.
6314 *----------------------------------------------------------------------------*/
6316 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6320 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6321 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6323 if (floatx80_invalid_encoding(a
)) {
6324 float_raise(float_flag_invalid
, status
);
6325 return floatx80_default_nan(status
);
6327 aSig0
= extractFloatx80Frac( a
);
6328 aExp
= extractFloatx80Exp( a
);
6329 aSign
= extractFloatx80Sign( a
);
6330 if ( aExp
== 0x7FFF ) {
6331 if ((uint64_t)(aSig0
<< 1)) {
6332 return propagateFloatx80NaN(a
, a
, status
);
6334 if ( ! aSign
) return a
;
6338 if ( ( aExp
| aSig0
) == 0 ) return a
;
6340 float_raise(float_flag_invalid
, status
);
6341 return floatx80_default_nan(status
);
6344 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6345 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6347 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6348 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6349 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6350 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6351 doubleZSig0
= zSig0
<<1;
6352 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6353 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6354 while ( (int64_t) rem0
< 0 ) {
6357 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6359 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6360 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6361 if ( zSig1
== 0 ) zSig1
= 1;
6362 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6363 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6364 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6365 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6366 while ( (int64_t) rem1
< 0 ) {
6368 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6370 term2
|= doubleZSig0
;
6371 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6373 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6375 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6376 zSig0
|= doubleZSig0
;
6377 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6378 0, zExp
, zSig0
, zSig1
, status
);
6381 /*----------------------------------------------------------------------------
6382 | Returns the result of converting the quadruple-precision floating-point
6383 | value `a' to the extended double-precision floating-point format. The
6384 | conversion is performed according to the IEC/IEEE Standard for Binary
6385 | Floating-Point Arithmetic.
6386 *----------------------------------------------------------------------------*/
6388 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6392 uint64_t aSig0
, aSig1
;
6394 aSig1
= extractFloat128Frac1( a
);
6395 aSig0
= extractFloat128Frac0( a
);
6396 aExp
= extractFloat128Exp( a
);
6397 aSign
= extractFloat128Sign( a
);
6398 if ( aExp
== 0x7FFF ) {
6399 if ( aSig0
| aSig1
) {
6400 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6402 return floatx80_silence_nan(res
, status
);
6404 return packFloatx80(aSign
, floatx80_infinity_high
,
6405 floatx80_infinity_low
);
6408 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6409 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6412 aSig0
|= UINT64_C(0x0001000000000000);
6414 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6415 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6419 /*----------------------------------------------------------------------------
6420 | Returns the remainder of the quadruple-precision floating-point value `a'
6421 | with respect to the corresponding value `b'. The operation is performed
6422 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6423 *----------------------------------------------------------------------------*/
6425 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6428 int32_t aExp
, bExp
, expDiff
;
6429 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6430 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6433 aSig1
= extractFloat128Frac1( a
);
6434 aSig0
= extractFloat128Frac0( a
);
6435 aExp
= extractFloat128Exp( a
);
6436 aSign
= extractFloat128Sign( a
);
6437 bSig1
= extractFloat128Frac1( b
);
6438 bSig0
= extractFloat128Frac0( b
);
6439 bExp
= extractFloat128Exp( b
);
6440 if ( aExp
== 0x7FFF ) {
6441 if ( ( aSig0
| aSig1
)
6442 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6443 return propagateFloat128NaN(a
, b
, status
);
6447 if ( bExp
== 0x7FFF ) {
6448 if (bSig0
| bSig1
) {
6449 return propagateFloat128NaN(a
, b
, status
);
6454 if ( ( bSig0
| bSig1
) == 0 ) {
6456 float_raise(float_flag_invalid
, status
);
6457 return float128_default_nan(status
);
6459 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6462 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6463 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6465 expDiff
= aExp
- bExp
;
6466 if ( expDiff
< -1 ) return a
;
6468 aSig0
| UINT64_C(0x0001000000000000),
6470 15 - ( expDiff
< 0 ),
6475 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
6476 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6477 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6479 while ( 0 < expDiff
) {
6480 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6481 q
= ( 4 < q
) ? q
- 4 : 0;
6482 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6483 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6484 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6485 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6488 if ( -64 < expDiff
) {
6489 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6490 q
= ( 4 < q
) ? q
- 4 : 0;
6492 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6494 if ( expDiff
< 0 ) {
6495 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6498 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6500 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6501 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6504 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6505 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6508 alternateASig0
= aSig0
;
6509 alternateASig1
= aSig1
;
6511 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6512 } while ( 0 <= (int64_t) aSig0
);
6514 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6515 if ( ( sigMean0
< 0 )
6516 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6517 aSig0
= alternateASig0
;
6518 aSig1
= alternateASig1
;
6520 zSign
= ( (int64_t) aSig0
< 0 );
6521 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6522 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6526 /*----------------------------------------------------------------------------
6527 | Returns the square root of the quadruple-precision floating-point value `a'.
6528 | The operation is performed according to the IEC/IEEE Standard for Binary
6529 | Floating-Point Arithmetic.
6530 *----------------------------------------------------------------------------*/
6532 float128
float128_sqrt(float128 a
, float_status
*status
)
6536 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6537 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6539 aSig1
= extractFloat128Frac1( a
);
6540 aSig0
= extractFloat128Frac0( a
);
6541 aExp
= extractFloat128Exp( a
);
6542 aSign
= extractFloat128Sign( a
);
6543 if ( aExp
== 0x7FFF ) {
6544 if (aSig0
| aSig1
) {
6545 return propagateFloat128NaN(a
, a
, status
);
6547 if ( ! aSign
) return a
;
6551 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6553 float_raise(float_flag_invalid
, status
);
6554 return float128_default_nan(status
);
6557 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6558 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6560 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6561 aSig0
|= UINT64_C(0x0001000000000000);
6562 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6563 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6564 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6565 doubleZSig0
= zSig0
<<1;
6566 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6567 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6568 while ( (int64_t) rem0
< 0 ) {
6571 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6573 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6574 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6575 if ( zSig1
== 0 ) zSig1
= 1;
6576 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6577 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6578 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6579 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6580 while ( (int64_t) rem1
< 0 ) {
6582 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6584 term2
|= doubleZSig0
;
6585 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6587 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6589 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6590 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6594 static inline FloatRelation
6595 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
6596 float_status
*status
)
6600 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6601 float_raise(float_flag_invalid
, status
);
6602 return float_relation_unordered
;
6604 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6605 ( extractFloatx80Frac( a
)<<1 ) ) ||
6606 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6607 ( extractFloatx80Frac( b
)<<1 ) )) {
6609 floatx80_is_signaling_nan(a
, status
) ||
6610 floatx80_is_signaling_nan(b
, status
)) {
6611 float_raise(float_flag_invalid
, status
);
6613 return float_relation_unordered
;
6615 aSign
= extractFloatx80Sign( a
);
6616 bSign
= extractFloatx80Sign( b
);
6617 if ( aSign
!= bSign
) {
6619 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6620 ( ( a
.low
| b
.low
) == 0 ) ) {
6622 return float_relation_equal
;
6624 return 1 - (2 * aSign
);
6627 /* Normalize pseudo-denormals before comparison. */
6628 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
6631 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
6634 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6635 return float_relation_equal
;
6637 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6642 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6644 return floatx80_compare_internal(a
, b
, 0, status
);
6647 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
6648 float_status
*status
)
6650 return floatx80_compare_internal(a
, b
, 1, status
);
6653 static inline FloatRelation
6654 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
6655 float_status
*status
)
6659 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6660 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6661 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6662 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6664 float128_is_signaling_nan(a
, status
) ||
6665 float128_is_signaling_nan(b
, status
)) {
6666 float_raise(float_flag_invalid
, status
);
6668 return float_relation_unordered
;
6670 aSign
= extractFloat128Sign( a
);
6671 bSign
= extractFloat128Sign( b
);
6672 if ( aSign
!= bSign
) {
6673 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6675 return float_relation_equal
;
6677 return 1 - (2 * aSign
);
6680 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6681 return float_relation_equal
;
6683 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6688 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
6690 return float128_compare_internal(a
, b
, 0, status
);
6693 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
6694 float_status
*status
)
6696 return float128_compare_internal(a
, b
, 1, status
);
6699 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
6705 if (floatx80_invalid_encoding(a
)) {
6706 float_raise(float_flag_invalid
, status
);
6707 return floatx80_default_nan(status
);
6709 aSig
= extractFloatx80Frac( a
);
6710 aExp
= extractFloatx80Exp( a
);
6711 aSign
= extractFloatx80Sign( a
);
6713 if ( aExp
== 0x7FFF ) {
6715 return propagateFloatx80NaN(a
, a
, status
);
6729 } else if (n
< -0x10000) {
6734 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6735 aSign
, aExp
, aSig
, 0, status
);
6738 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
6742 uint64_t aSig0
, aSig1
;
6744 aSig1
= extractFloat128Frac1( a
);
6745 aSig0
= extractFloat128Frac0( a
);
6746 aExp
= extractFloat128Exp( a
);
6747 aSign
= extractFloat128Sign( a
);
6748 if ( aExp
== 0x7FFF ) {
6749 if ( aSig0
| aSig1
) {
6750 return propagateFloat128NaN(a
, a
, status
);
6755 aSig0
|= UINT64_C(0x0001000000000000);
6756 } else if (aSig0
== 0 && aSig1
== 0) {
6764 } else if (n
< -0x10000) {
6769 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
6774 static void __attribute__((constructor
)) softfloat_init(void)
6776 union_float64 ua
, ub
, uc
, ur
;
6778 if (QEMU_NO_HARDFLOAT
) {
6782 * Test that the host's FMA is not obviously broken. For example,
6783 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
6784 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
6786 ua
.s
= 0x0020000000000001ULL
;
6787 ub
.s
= 0x3ca0000000000000ULL
;
6788 uc
.s
= 0x0020000000000000ULL
;
6789 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
6790 if (ur
.s
!= 0x0020000000000001ULL
) {
6791 force_soft_fma
= true;