4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
/*
 * Input-flush helper generators.
 * GEN_INPUT_FLUSH__NOCHECK emits a helper that replaces a denormal input
 * with a signed zero and raises float_flag_input_denormal, without first
 * consulting float_status.  GEN_INPUT_FLUSH1/2/3 emit wrappers that skip
 * the flush entirely unless s->flush_inputs_to_zero is set, for 1, 2 and
 * 3 operands respectively.
 * NOTE(review): this extraction has dropped several macro body lines
 * (opening/closing braces and their '\' continuations); restore from
 * upstream before editing the macros themselves.
 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
/*
 * Per-host tuning of the hardfloat fast paths, driven by fp-bench results:
 * the *_USE_FP macros pick fpclassify() vs float32/64_* primitives per
 * operand count/size, QEMU_HARDFLOAT_USE_ISINF picks isinf() vs
 * float{32,64}_is_infinity, and QEMU_NO_HARDFLOAT disables the fast paths
 * entirely (TARGET_PPC clears flags before most FP ops; -ffast-math makes
 * the host inexact flag unreliable).
 * NOTE(review): the '#else'/'#endif' lines of these conditionals were
 * dropped by this extraction -- restore from upstream before editing.
 */
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
/*
 * Return true when the host-FPU fast path may be used: hardfloat must be
 * compiled in, the guest's inexact flag must already be set (so we need not
 * re-detect it -- see the optimization note above), and rounding must be
 * round-to-nearest-even.
 * NOTE(review): the body of the QEMU_NO_HARDFLOAT branch and the brace
 * lines are missing from this extraction -- compare with upstream before
 * editing.
 */
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
/*
 * Function-pointer types used by the float{32,64}_gen2 hardfloat wrappers:
 * *_check_fn are the pre/post operand checks, soft_*_op2_fn the softfloat
 * fallbacks, hard_*_op2_fn the native host-FP operations.
 * NOTE(review): the union_float32/union_float64 typedefs these rely on are
 * missing from this extraction.
 */
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
/*
 * Generic 2-operand float32 hardfloat wrapper.  When can_use_fpu() and the
 * 'pre' check allow it, compute with the host op 'hard'; a host infinity
 * result raises float_flag_overflow, and a tiny (<= FLT_MIN) result that
 * passes 'post' falls back to the softfloat op 'soft' (which also serves
 * as the general fallback).
 * NOTE(review): this extraction is missing several body lines (the
 * ua.s/ub.s operand assignments, the goto/soft labels, braces and the
 * fast-path return) -- restore from upstream before modifying.
 */
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
/*
 * Generic 2-operand float64 hardfloat wrapper; double-precision twin of
 * float32_gen2 above (fabs/DBL_MIN instead of fabsf/FLT_MIN).
 * NOTE(review): same extraction gaps as float32_gen2 (operand assignments,
 * goto labels, braces, fast-path return are missing).
 */
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
/*
 * FloatClass and the float_cmask bit masks built from it.
 * NOTE(review): this extraction dropped several enumerators
 * (float_class_zero/normal/inf/snan), the enum's closing name, and the
 * cmask enum's opening line -- restore from upstream before editing.
 */
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
/*
 * Decomposed-float structures (FloatParts64/128), the DECOMPOSED_* layout
 * constants, the FLOAT_PARAMS field-expander and the per-format FloatFmt
 * tables.
 * NOTE(review): this extraction dropped the struct bodies, the .exp_size
 * and .frac_size lines of FLOAT_PARAMS, and the initializer bodies of all
 * the *_params tables below -- restore from upstream before editing.
 */
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT 63
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
540 /* Structure holding all of the relevant parameters for a format.
541 * exp_size: the size of the exponent field
542 * exp_bias: the offset applied to the exponent field
543 * exp_max: the maximum normalised exponent
544 * frac_size: the size of the fraction field
545 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546 * The following are computed based the size of fraction
547 * frac_lsb: least significant bit of fraction
548 * frac_lsbm1: the bit below the least significant bit (for rounding)
549 * round_mask/roundeven_mask: masks used for rounding
550 * The following optional modifiers are available:
551 * arm_althp: handle ARM Alternative Half Precision
562 uint64_t roundeven_mask
;
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F) \
569 .exp_bias = ((1 << E) - 1) >> 1, \
570 .exp_max = (1 << E) - 1, \
572 .frac_shift = (-F - 1) & 63, \
573 .frac_lsb = 1ull << ((-F - 1) & 63), \
574 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \
575 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \
576 .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
578 static const FloatFmt float16_params
= {
582 static const FloatFmt float16_params_ahp
= {
587 static const FloatFmt bfloat16_params
= {
591 static const FloatFmt float32_params
= {
595 static const FloatFmt float64_params
= {
599 static const FloatFmt float128_params
= {
600 FLOAT_PARAMS(15, 112)
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
606 const int f_size
= fmt
->frac_size
;
607 const int e_size
= fmt
->exp_size
;
609 *r
= (FloatParts64
) {
610 .cls
= float_class_unclassified
,
611 .sign
= extract64(raw
, f_size
+ e_size
, 1),
612 .exp
= extract64(raw
, f_size
, e_size
),
613 .frac
= extract64(raw
, 0, f_size
)
617 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
619 unpack_raw64(p
, &float16_params
, f
);
622 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
624 unpack_raw64(p
, &bfloat16_params
, f
);
627 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
629 unpack_raw64(p
, &float32_params
, f
);
632 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
634 unpack_raw64(p
, &float64_params
, f
);
/*
 * Unpack a float128 to parts without canonicalizing; sign/exp/frac_hi come
 * from the high word.
 * NOTE(review): the .frac_lo initializer, the closing of the compound
 * literal and the function's braces are missing from this extraction --
 * restore from upstream before editing.
 */
637 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
639 const int f_size
= float128_params
.frac_size
- 64;
640 const int e_size
= float128_params
.exp_size
;
642 *p
= (FloatParts128
) {
643 .cls
= float_class_unclassified
,
644 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
645 .exp
= extract64(f
.high
, f_size
, e_size
),
646 .frac_hi
= extract64(f
.high
, 0, f_size
),
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
654 const int f_size
= fmt
->frac_size
;
655 const int e_size
= fmt
->exp_size
;
658 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
659 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
660 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
664 static inline float16
float16_pack_raw(const FloatParts64
*p
)
666 return make_float16(pack_raw64(p
, &float16_params
));
669 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
671 return pack_raw64(p
, &bfloat16_params
);
674 static inline float32
float32_pack_raw(const FloatParts64
*p
)
676 return make_float32(pack_raw64(p
, &float32_params
));
679 static inline float64
float64_pack_raw(const FloatParts64
*p
)
681 return make_float64(pack_raw64(p
, &float64_params
));
684 static float128
float128_pack_raw(const FloatParts128
*p
)
686 const int f_size
= float128_params
.frac_size
- 64;
687 const int e_size
= float128_params
.exp_size
;
690 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
691 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
692 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
693 return make_float128(hi
, p
->frac_lo
);
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
/*
 * Dispatch a parts operation to its 64- or 128-bit implementation based
 * on the static type of the FloatParts pointer P.
 */
706 #define PARTS_GENERIC_64_128(NAME, P) \
707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
712 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
713 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
717 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
719 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
724 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
725 FloatParts64
*c
, float_status
*s
,
726 int ab_mask
, int abc_mask
);
727 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
731 int ab_mask
, int abc_mask
);
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
736 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
737 const FloatFmt
*fmt
);
738 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
739 const FloatFmt
*fmt
);
741 #define parts_canonicalize(A, S, F) \
742 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
744 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
745 const FloatFmt
*fmt
);
746 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
747 const FloatFmt
*fmt
);
749 #define parts_uncanon(A, S, F) \
750 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
752 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
753 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
755 #define parts_add_normal(A, B) \
756 PARTS_GENERIC_64_128(add_normal, A)(A, B)
758 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
759 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
761 #define parts_sub_normal(A, B) \
762 PARTS_GENERIC_64_128(sub_normal, A)(A, B)
764 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
765 float_status
*s
, bool subtract
);
766 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
767 float_status
*s
, bool subtract
);
769 #define parts_addsub(A, B, S, Z) \
770 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
/*
 * Dispatch a fraction operation to frac64_* or frac128_* based on the
 * static type of the FloatParts pointer P.
 */
776 #define FRAC_GENERIC_64_128(NAME, P) \
777 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
779 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
781 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
784 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
787 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
788 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
794 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
796 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
799 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
801 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
802 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
805 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
807 static void frac64_allones(FloatParts64
*a
)
812 static void frac128_allones(FloatParts128
*a
)
814 a
->frac_hi
= a
->frac_lo
= -1;
817 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
819 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
821 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
/*
 * Three-way compare of 128-bit fractions: high words first, low words as
 * tie-break; returns -1, 0 or 1.
 * NOTE(review): the nested 'if (ta == tb)' lines, the 'return 0' branch
 * and the braces are missing from this extraction -- restore from
 * upstream before editing.
 */
824 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
826 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
828 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
833 return ta
< tb
? -1 : 1;
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
838 static void frac64_clear(FloatParts64
*a
)
843 static void frac128_clear(FloatParts128
*a
)
845 a
->frac_hi
= a
->frac_lo
= 0;
848 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
850 static bool frac64_eqz(FloatParts64
*a
)
855 static bool frac128_eqz(FloatParts128
*a
)
857 return (a
->frac_hi
| a
->frac_lo
) == 0;
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
862 static void frac64_neg(FloatParts64
*a
)
867 static void frac128_neg(FloatParts128
*a
)
870 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
871 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
874 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A)
/*
 * Shift the fraction left until its most significant bit is set, and
 * return the shift amount (via clz64).
 * NOTE(review): this extraction is missing the shift application and
 * return in frac64_normalize, and much of frac128_normalize's control
 * flow (the shl/shr setup, the all-zero case, returns and braces) --
 * restore from upstream before editing.
 */
876 static int frac64_normalize(FloatParts64
*a
)
879 int shift
= clz64(a
->frac
);
886 static int frac128_normalize(FloatParts128
*a
)
889 int shl
= clz64(a
->frac_hi
);
892 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
893 a
->frac_lo
= (a
->frac_lo
<< shl
);
896 } else if (a
->frac_lo
) {
897 int shl
= clz64(a
->frac_lo
);
898 a
->frac_hi
= (a
->frac_lo
<< shl
);
905 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
907 static void frac64_shl(FloatParts64
*a
, int c
)
912 static void frac128_shl(FloatParts128
*a
, int c
)
914 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
917 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
919 static void frac64_shr(FloatParts64
*a
, int c
)
924 static void frac128_shr(FloatParts128
*a
, int c
)
926 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
929 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
931 static void frac64_shrjam(FloatParts64
*a
, int c
)
933 shift64RightJamming(a
->frac
, c
, &a
->frac
);
936 static void frac128_shrjam(FloatParts128
*a
, int c
)
938 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
941 #define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C)
943 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
945 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
948 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
951 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
952 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
956 #define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B)
/*
 * Name generators for the N-expanded parts code, then two expansions of
 * the shared softfloat-parts templates.
 * NOTE(review): the '#define N 64' / '#undef N' / '#define N 128' lines
 * that select each expansion were dropped by this extraction -- restore
 * from upstream before editing.
 */
958 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
959 #define FloatPartsN glue(FloatParts,N)
963 #include "softfloat-parts-addsub.c.inc"
964 #include "softfloat-parts.c.inc"
969 #include "softfloat-parts-addsub.c.inc"
970 #include "softfloat-parts.c.inc"
977 * Pack/unpack routines with a specific FloatFmt.
980 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
981 float_status
*s
, const FloatFmt
*params
)
983 float16_unpack_raw(p
, f
);
984 parts_canonicalize(p
, s
, params
);
987 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
990 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
993 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
996 bfloat16_unpack_raw(p
, f
);
997 parts_canonicalize(p
, s
, &bfloat16_params
);
1000 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1002 const FloatFmt
*params
)
1004 parts_uncanon(p
, s
, params
);
1005 return float16_pack_raw(p
);
1008 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1011 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1014 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1017 parts_uncanon(p
, s
, &bfloat16_params
);
1018 return bfloat16_pack_raw(p
);
1021 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1024 float32_unpack_raw(p
, f
);
1025 parts_canonicalize(p
, s
, &float32_params
);
1028 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1031 parts_uncanon(p
, s
, &float32_params
);
1032 return float32_pack_raw(p
);
1035 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1038 float64_unpack_raw(p
, f
);
1039 parts_canonicalize(p
, s
, &float64_params
);
1042 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1045 parts_uncanon(p
, s
, &float64_params
);
1046 return float64_pack_raw(p
);
1050 * Addition and subtraction
1053 static float16 QEMU_FLATTEN
1054 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1056 FloatParts64 pa
, pb
, *pr
;
1058 float16_unpack_canonical(&pa
, a
, status
);
1059 float16_unpack_canonical(&pb
, b
, status
);
1060 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1062 return float16_round_pack_canonical(pr
, status
);
1065 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1067 return float16_addsub(a
, b
, status
, false);
1070 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1072 return float16_addsub(a
, b
, status
, true);
1075 static float32 QEMU_SOFTFLOAT_ATTR
1076 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1078 FloatParts64 pa
, pb
, *pr
;
1080 float32_unpack_canonical(&pa
, a
, status
);
1081 float32_unpack_canonical(&pb
, b
, status
);
1082 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1084 return float32_round_pack_canonical(pr
, status
);
1087 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1089 return soft_f32_addsub(a
, b
, status
, false);
1092 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1094 return soft_f32_addsub(a
, b
, status
, true);
1097 static float64 QEMU_SOFTFLOAT_ATTR
1098 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1100 FloatParts64 pa
, pb
, *pr
;
1102 float64_unpack_canonical(&pa
, a
, status
);
1103 float64_unpack_canonical(&pb
, b
, status
);
1104 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1106 return float64_round_pack_canonical(pr
, status
);
1109 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1111 return soft_f64_addsub(a
, b
, status
, false);
1114 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1116 return soft_f64_addsub(a
, b
, status
, true);
/* Host-FPU fast-path primitives used by the hardfloat dispatchers. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1139 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1141 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1142 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1144 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1147 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1149 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1150 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1152 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1156 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1157 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1159 return float32_gen2(a
, b
, s
, hard
, soft
,
1160 f32_is_zon2
, f32_addsubmul_post
);
1163 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1164 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1166 return float64_gen2(a
, b
, s
, hard
, soft
,
1167 f64_is_zon2
, f64_addsubmul_post
);
1170 float32 QEMU_FLATTEN
1171 float32_add(float32 a
, float32 b
, float_status
*s
)
1173 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1176 float32 QEMU_FLATTEN
1177 float32_sub(float32 a
, float32 b
, float_status
*s
)
1179 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1182 float64 QEMU_FLATTEN
1183 float64_add(float64 a
, float64 b
, float_status
*s
)
1185 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1188 float64 QEMU_FLATTEN
1189 float64_sub(float64 a
, float64 b
, float_status
*s
)
1191 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1194 static bfloat16 QEMU_FLATTEN
1195 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1197 FloatParts64 pa
, pb
, *pr
;
1199 bfloat16_unpack_canonical(&pa
, a
, status
);
1200 bfloat16_unpack_canonical(&pb
, b
, status
);
1201 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1203 return bfloat16_round_pack_canonical(pr
, status
);
1206 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1208 return bfloat16_addsub(a
, b
, status
, false);
1211 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1213 return bfloat16_addsub(a
, b
, status
, true);
1217 * Returns the result of multiplying the floating-point values `a' and
1218 * `b'. The operation is performed according to the IEC/IEEE Standard
1219 * for Binary Floating-Point Arithmetic.
1222 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1224 bool sign
= a
.sign
^ b
.sign
;
1226 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1228 int exp
= a
.exp
+ b
.exp
;
1230 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1231 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1244 /* handle all the NaN cases */
1245 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1246 return *parts_pick_nan(&a
, &b
, s
);
1248 /* Inf * Zero == NaN */
1249 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1250 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1251 float_raise(float_flag_invalid
, s
);
1252 parts_default_nan(&a
, s
);
1255 /* Multiply by 0 or Inf */
1256 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1260 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1264 g_assert_not_reached();
1267 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1269 FloatParts64 pa
, pb
, pr
;
1271 float16_unpack_canonical(&pa
, a
, status
);
1272 float16_unpack_canonical(&pb
, b
, status
);
1273 pr
= mul_floats(pa
, pb
, status
);
1275 return float16_round_pack_canonical(&pr
, status
);
1278 static float32 QEMU_SOFTFLOAT_ATTR
1279 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1281 FloatParts64 pa
, pb
, pr
;
1283 float32_unpack_canonical(&pa
, a
, status
);
1284 float32_unpack_canonical(&pb
, b
, status
);
1285 pr
= mul_floats(pa
, pb
, status
);
1287 return float32_round_pack_canonical(&pr
, status
);
1290 static float64 QEMU_SOFTFLOAT_ATTR
1291 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1293 FloatParts64 pa
, pb
, pr
;
1295 float64_unpack_canonical(&pa
, a
, status
);
1296 float64_unpack_canonical(&pb
, b
, status
);
1297 pr
= mul_floats(pa
, pb
, status
);
1299 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU multiply primitives for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1312 float32 QEMU_FLATTEN
1313 float32_mul(float32 a
, float32 b
, float_status
*s
)
1315 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1316 f32_is_zon2
, f32_addsubmul_post
);
1319 float64 QEMU_FLATTEN
1320 float64_mul(float64 a
, float64 b
, float_status
*s
)
1322 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1323 f64_is_zon2
, f64_addsubmul_post
);
1327 * Returns the result of multiplying the bfloat16
1328 * values `a' and `b'.
1331 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1333 FloatParts64 pa
, pb
, pr
;
1335 bfloat16_unpack_canonical(&pa
, a
, status
);
1336 bfloat16_unpack_canonical(&pb
, b
, status
);
1337 pr
= mul_floats(pa
, pb
, status
);
1339 return bfloat16_round_pack_canonical(&pr
, status
);
1343 * Returns the result of multiplying the floating-point values `a' and
1344 * `b' then adding 'c', with no intermediate rounding step after the
1345 * multiplication. The operation is performed according to the
1346 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1347 * The flags argument allows the caller to select negation of the
1348 * addend, the intermediate product, or the final result. (The
1349 * difference between this and having the caller do a separate
1350 * negation is that negating externally will flip the sign bit on
1354 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1355 int flags
, float_status
*s
)
1357 bool inf_zero
, p_sign
;
1358 bool sign_flip
= flags
& float_muladd_negate_result
;
1362 int ab_mask
, abc_mask
;
1364 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1365 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1366 inf_zero
= ab_mask
== float_cmask_infzero
;
1368 /* It is implementation-defined whether the cases of (0,inf,qnan)
1369 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1370 * they return if they do), so we have to hand this information
1371 * off to the target-specific pick-a-NaN routine.
1373 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1374 return *parts_pick_nan_muladd(&a
, &b
, &c
, s
, ab_mask
, abc_mask
);
1378 float_raise(float_flag_invalid
, s
);
1379 parts_default_nan(&a
, s
);
1383 if (flags
& float_muladd_negate_c
) {
1387 p_sign
= a
.sign
^ b
.sign
;
1389 if (flags
& float_muladd_negate_product
) {
1393 if (ab_mask
& float_cmask_inf
) {
1394 p_class
= float_class_inf
;
1395 } else if (ab_mask
& float_cmask_zero
) {
1396 p_class
= float_class_zero
;
1398 p_class
= float_class_normal
;
1401 if (c
.cls
== float_class_inf
) {
1402 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1403 float_raise(float_flag_invalid
, s
);
1404 parts_default_nan(&c
, s
);
1406 c
.sign
^= sign_flip
;
1411 if (p_class
== float_class_inf
) {
1412 a
.cls
= float_class_inf
;
1413 a
.sign
= p_sign
^ sign_flip
;
1417 if (p_class
== float_class_zero
) {
1418 if (c
.cls
== float_class_zero
) {
1419 if (p_sign
!= c
.sign
) {
1420 p_sign
= s
->float_rounding_mode
== float_round_down
;
1423 } else if (flags
& float_muladd_halve_result
) {
1426 c
.sign
^= sign_flip
;
1430 /* a & b should be normals now... */
1431 assert(a
.cls
== float_class_normal
&&
1432 b
.cls
== float_class_normal
);
1434 p_exp
= a
.exp
+ b
.exp
;
1436 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1438 /* Renormalize to the msb. */
1439 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1442 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1446 if (c
.cls
!= float_class_zero
) {
1447 int exp_diff
= p_exp
- c
.exp
;
1448 if (p_sign
== c
.sign
) {
1450 if (exp_diff
<= 0) {
1451 shift64RightJamming(hi
, -exp_diff
, &hi
);
1453 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1454 shift64RightJamming(hi
, 1, &hi
);
1455 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1459 uint64_t c_hi
, c_lo
, over
;
1460 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1461 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1463 shift64RightJamming(hi
, 1, &hi
);
1464 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1470 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1472 if (exp_diff
<= 0) {
1473 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1476 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1477 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1479 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1484 shift128RightJamming(c_hi
, c_lo
,
1487 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1490 if (hi
== 0 && lo
== 0) {
1491 a
.cls
= float_class_zero
;
1492 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1493 a
.sign
^= sign_flip
;
1500 shift
= clz64(lo
) + 64;
1502 /* Normalizing to a binary point of 124 is the
1503 correct adjust for the exponent. However since we're
1504 shifting, we might as well put the binary point back
1505 at 63 where we really want it. Therefore shift as
1506 if we're leaving 1 bit at the top of the word, but
1507 adjust the exponent as if we're leaving 3 bits. */
1508 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1515 if (flags
& float_muladd_halve_result
) {
1519 /* finally prepare our result */
1520 a
.cls
= float_class_normal
;
1521 a
.sign
= p_sign
^ sign_flip
;
1528 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1529 int flags
, float_status
*status
)
1531 FloatParts64 pa
, pb
, pc
, pr
;
1533 float16_unpack_canonical(&pa
, a
, status
);
1534 float16_unpack_canonical(&pb
, b
, status
);
1535 float16_unpack_canonical(&pc
, c
, status
);
1536 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1538 return float16_round_pack_canonical(&pr
, status
);
1541 static float32 QEMU_SOFTFLOAT_ATTR
1542 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1543 float_status
*status
)
1545 FloatParts64 pa
, pb
, pc
, pr
;
1547 float32_unpack_canonical(&pa
, a
, status
);
1548 float32_unpack_canonical(&pb
, b
, status
);
1549 float32_unpack_canonical(&pc
, c
, status
);
1550 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1552 return float32_round_pack_canonical(&pr
, status
);
1555 static float64 QEMU_SOFTFLOAT_ATTR
1556 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1557 float_status
*status
)
1559 FloatParts64 pa
, pb
, pc
, pr
;
1561 float64_unpack_canonical(&pa
, a
, status
);
1562 float64_unpack_canonical(&pb
, b
, status
);
1563 float64_unpack_canonical(&pc
, c
, status
);
1564 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1566 return float64_round_pack_canonical(&pr
, status
);
1569 static bool force_soft_fma
;
1571 float32 QEMU_FLATTEN
1572 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1574 union_float32 ua
, ub
, uc
, ur
;
1580 if (unlikely(!can_use_fpu(s
))) {
1583 if (unlikely(flags
& float_muladd_halve_result
)) {
1587 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1588 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1592 if (unlikely(force_soft_fma
)) {
1597 * When (a || b) == 0, there's no need to check for under/over flow,
1598 * since we know the addend is (normal || 0) and the product is 0.
1600 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1604 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1605 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1606 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1608 if (flags
& float_muladd_negate_c
) {
1613 union_float32 ua_orig
= ua
;
1614 union_float32 uc_orig
= uc
;
1616 if (flags
& float_muladd_negate_product
) {
1619 if (flags
& float_muladd_negate_c
) {
1623 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1625 if (unlikely(f32_is_inf(ur
))) {
1626 float_raise(float_flag_overflow
, s
);
1627 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1633 if (flags
& float_muladd_negate_result
) {
1634 return float32_chs(ur
.s
);
1639 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1642 float64 QEMU_FLATTEN
1643 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1645 union_float64 ua
, ub
, uc
, ur
;
1651 if (unlikely(!can_use_fpu(s
))) {
1654 if (unlikely(flags
& float_muladd_halve_result
)) {
1658 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1659 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1663 if (unlikely(force_soft_fma
)) {
1668 * When (a || b) == 0, there's no need to check for under/over flow,
1669 * since we know the addend is (normal || 0) and the product is 0.
1671 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1675 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1676 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1677 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1679 if (flags
& float_muladd_negate_c
) {
1684 union_float64 ua_orig
= ua
;
1685 union_float64 uc_orig
= uc
;
1687 if (flags
& float_muladd_negate_product
) {
1690 if (flags
& float_muladd_negate_c
) {
1694 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1696 if (unlikely(f64_is_inf(ur
))) {
1697 float_raise(float_flag_overflow
, s
);
1698 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1704 if (flags
& float_muladd_negate_result
) {
1705 return float64_chs(ur
.s
);
1710 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1714 * Returns the result of multiplying the bfloat16 values `a'
1715 * and `b' then adding 'c', with no intermediate rounding step after the
1719 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1720 int flags
, float_status
*status
)
1722 FloatParts64 pa
, pb
, pc
, pr
;
1724 bfloat16_unpack_canonical(&pa
, a
, status
);
1725 bfloat16_unpack_canonical(&pb
, b
, status
);
1726 bfloat16_unpack_canonical(&pc
, c
, status
);
1727 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1729 return bfloat16_round_pack_canonical(&pr
, status
);
1733 * Returns the result of dividing the floating-point value `a' by the
1734 * corresponding value `b'. The operation is performed according to
1735 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1738 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1740 bool sign
= a
.sign
^ b
.sign
;
1742 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1743 uint64_t n0
, n1
, q
, r
;
1744 int exp
= a
.exp
- b
.exp
;
1747 * We want a 2*N / N-bit division to produce exactly an N-bit
1748 * result, so that we do not lose any precision and so that we
1749 * do not have to renormalize afterward. If A.frac < B.frac,
1750 * then division would produce an (N-1)-bit result; shift A left
1751 * by one to produce the an N-bit result, and decrement the
1752 * exponent to match.
1754 * The udiv_qrnnd algorithm that we're using requires normalization,
1755 * i.e. the msb of the denominator must be set, which is already true.
1757 if (a
.frac
< b
.frac
) {
1759 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1761 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1763 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1765 /* Set lsb if there is a remainder, to set inexact. */
1766 a
.frac
= q
| (r
!= 0);
1771 /* handle all the NaN cases */
1772 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1773 return *parts_pick_nan(&a
, &b
, s
);
1775 /* 0/0 or Inf/Inf */
1778 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1779 float_raise(float_flag_invalid
, s
);
1780 parts_default_nan(&a
, s
);
1783 /* Inf / x or 0 / x */
1784 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1789 if (b
.cls
== float_class_zero
) {
1790 float_raise(float_flag_divbyzero
, s
);
1791 a
.cls
= float_class_inf
;
1796 if (b
.cls
== float_class_inf
) {
1797 a
.cls
= float_class_zero
;
1801 g_assert_not_reached();
1804 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1806 FloatParts64 pa
, pb
, pr
;
1808 float16_unpack_canonical(&pa
, a
, status
);
1809 float16_unpack_canonical(&pb
, b
, status
);
1810 pr
= div_floats(pa
, pb
, status
);
1812 return float16_round_pack_canonical(&pr
, status
);
1815 static float32 QEMU_SOFTFLOAT_ATTR
1816 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1818 FloatParts64 pa
, pb
, pr
;
1820 float32_unpack_canonical(&pa
, a
, status
);
1821 float32_unpack_canonical(&pb
, b
, status
);
1822 pr
= div_floats(pa
, pb
, status
);
1824 return float32_round_pack_canonical(&pr
, status
);
1827 static float64 QEMU_SOFTFLOAT_ATTR
1828 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1830 FloatParts64 pa
, pb
, pr
;
1832 float64_unpack_canonical(&pa
, a
, status
);
1833 float64_unpack_canonical(&pb
, b
, status
);
1834 pr
= div_floats(pa
, pb
, status
);
1836 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU divide primitives for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1849 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1851 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1852 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1853 fpclassify(b
.h
) == FP_NORMAL
;
1855 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1858 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1860 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1861 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1862 fpclassify(b
.h
) == FP_NORMAL
;
1864 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1867 static bool f32_div_post(union_float32 a
, union_float32 b
)
1869 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1870 return fpclassify(a
.h
) != FP_ZERO
;
1872 return !float32_is_zero(a
.s
);
1875 static bool f64_div_post(union_float64 a
, union_float64 b
)
1877 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1878 return fpclassify(a
.h
) != FP_ZERO
;
1880 return !float64_is_zero(a
.s
);
1883 float32 QEMU_FLATTEN
1884 float32_div(float32 a
, float32 b
, float_status
*s
)
1886 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1887 f32_div_pre
, f32_div_post
);
1890 float64 QEMU_FLATTEN
1891 float64_div(float64 a
, float64 b
, float_status
*s
)
1893 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1894 f64_div_pre
, f64_div_post
);
1898 * Returns the result of dividing the bfloat16
1899 * value `a' by the corresponding value `b'.
1902 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1904 FloatParts64 pa
, pb
, pr
;
1906 bfloat16_unpack_canonical(&pa
, a
, status
);
1907 bfloat16_unpack_canonical(&pb
, b
, status
);
1908 pr
= div_floats(pa
, pb
, status
);
1910 return bfloat16_round_pack_canonical(&pr
, status
);
1914 * Float to Float conversions
1916 * Returns the result of converting one float format to another. The
1917 * conversion is performed according to the IEC/IEEE Standard for
1918 * Binary Floating-Point Arithmetic.
1920 * The float_to_float helper only needs to take care of raising
1921 * invalid exceptions and handling the conversion on NaNs.
1924 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1927 if (dstf
->arm_althp
) {
1929 case float_class_qnan
:
1930 case float_class_snan
:
1931 /* There is no NaN in the destination format. Raise Invalid
1932 * and return a zero with the sign of the input NaN.
1934 float_raise(float_flag_invalid
, s
);
1935 a
.cls
= float_class_zero
;
1940 case float_class_inf
:
1941 /* There is no Inf in the destination format. Raise Invalid
1942 * and return the maximum normal with the correct sign.
1944 float_raise(float_flag_invalid
, s
);
1945 a
.cls
= float_class_normal
;
1946 a
.exp
= dstf
->exp_max
;
1947 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1953 } else if (is_nan(a
.cls
)) {
1954 parts_return_nan(&a
, s
);
1959 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1961 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1962 FloatParts64 pa
, pr
;
1964 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
1965 pr
= float_to_float(pa
, &float32_params
, s
);
1966 return float32_round_pack_canonical(&pr
, s
);
1969 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1971 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1972 FloatParts64 pa
, pr
;
1974 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
1975 pr
= float_to_float(pa
, &float64_params
, s
);
1976 return float64_round_pack_canonical(&pr
, s
);
1979 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1981 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1982 FloatParts64 pa
, pr
;
1984 float32_unpack_canonical(&pa
, a
, s
);
1985 pr
= float_to_float(pa
, fmt16
, s
);
1986 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
1989 static float64 QEMU_SOFTFLOAT_ATTR
1990 soft_float32_to_float64(float32 a
, float_status
*s
)
1992 FloatParts64 pa
, pr
;
1994 float32_unpack_canonical(&pa
, a
, s
);
1995 pr
= float_to_float(pa
, &float64_params
, s
);
1996 return float64_round_pack_canonical(&pr
, s
);
1999 float64
float32_to_float64(float32 a
, float_status
*s
)
2001 if (likely(float32_is_normal(a
))) {
2002 /* Widening conversion can never produce inexact results. */
2008 } else if (float32_is_zero(a
)) {
2009 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2011 return soft_float32_to_float64(a
, s
);
2015 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2017 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2018 FloatParts64 pa
, pr
;
2020 float64_unpack_canonical(&pa
, a
, s
);
2021 pr
= float_to_float(pa
, fmt16
, s
);
2022 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2025 float32
float64_to_float32(float64 a
, float_status
*s
)
2027 FloatParts64 pa
, pr
;
2029 float64_unpack_canonical(&pa
, a
, s
);
2030 pr
= float_to_float(pa
, &float32_params
, s
);
2031 return float32_round_pack_canonical(&pr
, s
);
2034 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2036 FloatParts64 pa
, pr
;
2038 bfloat16_unpack_canonical(&pa
, a
, s
);
2039 pr
= float_to_float(pa
, &float32_params
, s
);
2040 return float32_round_pack_canonical(&pr
, s
);
2043 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2045 FloatParts64 pa
, pr
;
2047 bfloat16_unpack_canonical(&pa
, a
, s
);
2048 pr
= float_to_float(pa
, &float64_params
, s
);
2049 return float64_round_pack_canonical(&pr
, s
);
2052 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2054 FloatParts64 pa
, pr
;
2056 float32_unpack_canonical(&pa
, a
, s
);
2057 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2058 return bfloat16_round_pack_canonical(&pr
, s
);
2061 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2063 FloatParts64 pa
, pr
;
2065 float64_unpack_canonical(&pa
, a
, s
);
2066 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2067 return bfloat16_round_pack_canonical(&pr
, s
);
2071 * Rounds the floating-point value `a' to an integer, and returns the
2072 * result as a floating-point value. The operation is performed
2073 * according to the IEC/IEEE Standard for Binary Floating-Point
2077 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2078 int scale
, float_status
*s
)
2081 case float_class_qnan
:
2082 case float_class_snan
:
2083 parts_return_nan(&a
, s
);
2086 case float_class_zero
:
2087 case float_class_inf
:
2088 /* already "integral" */
2091 case float_class_normal
:
2092 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2095 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2096 /* already integral */
2101 /* all fractional */
2102 float_raise(float_flag_inexact
, s
);
2104 case float_round_nearest_even
:
2105 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2107 case float_round_ties_away
:
2108 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2110 case float_round_to_zero
:
2113 case float_round_up
:
2116 case float_round_down
:
2119 case float_round_to_odd
:
2123 g_assert_not_reached();
2127 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2130 a
.cls
= float_class_zero
;
2133 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2134 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2135 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2136 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2140 case float_round_nearest_even
:
2141 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2143 case float_round_ties_away
:
2146 case float_round_to_zero
:
2149 case float_round_up
:
2150 inc
= a
.sign
? 0 : rnd_mask
;
2152 case float_round_down
:
2153 inc
= a
.sign
? rnd_mask
: 0;
2155 case float_round_to_odd
:
2156 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2159 g_assert_not_reached();
2162 if (a
.frac
& rnd_mask
) {
2163 float_raise(float_flag_inexact
, s
);
2164 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2166 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2169 a
.frac
&= ~rnd_mask
;
2174 g_assert_not_reached();
2179 float16
float16_round_to_int(float16 a
, float_status
*s
)
2181 FloatParts64 pa
, pr
;
2183 float16_unpack_canonical(&pa
, a
, s
);
2184 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2185 return float16_round_pack_canonical(&pr
, s
);
2188 float32
float32_round_to_int(float32 a
, float_status
*s
)
2190 FloatParts64 pa
, pr
;
2192 float32_unpack_canonical(&pa
, a
, s
);
2193 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2194 return float32_round_pack_canonical(&pr
, s
);
2197 float64
float64_round_to_int(float64 a
, float_status
*s
)
2199 FloatParts64 pa
, pr
;
2201 float64_unpack_canonical(&pa
, a
, s
);
2202 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2203 return float64_round_pack_canonical(&pr
, s
);
2207 * Rounds the bfloat16 value `a' to an integer, and returns the
2208 * result as a bfloat16 value.
2211 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2213 FloatParts64 pa
, pr
;
2215 bfloat16_unpack_canonical(&pa
, a
, s
);
2216 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2217 return bfloat16_round_pack_canonical(&pr
, s
);
2221 * Returns the result of converting the floating-point value `a' to
2222 * the two's complement integer format. The conversion is performed
2223 * according to the IEC/IEEE Standard for Binary Floating-Point
2224 * Arithmetic---which means in particular that the conversion is
2225 * rounded according to the current rounding mode. If `a' is a NaN,
2226 * the largest positive integer is returned. Otherwise, if the
2227 * conversion overflows, the largest integer with the same sign as `a'
2231 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2232 int scale
, int64_t min
, int64_t max
,
2236 int orig_flags
= get_float_exception_flags(s
);
2237 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2240 case float_class_snan
:
2241 case float_class_qnan
:
2242 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2244 case float_class_inf
:
2245 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2246 return p
.sign
? min
: max
;
2247 case float_class_zero
:
2249 case float_class_normal
:
2250 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2251 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2256 if (r
<= -(uint64_t) min
) {
2259 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2266 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2271 g_assert_not_reached();
2275 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2280 float16_unpack_canonical(&p
, a
, s
);
2281 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2284 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2289 float16_unpack_canonical(&p
, a
, s
);
2290 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2293 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2298 float16_unpack_canonical(&p
, a
, s
);
2299 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2302 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2307 float16_unpack_canonical(&p
, a
, s
);
2308 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2311 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2316 float32_unpack_canonical(&p
, a
, s
);
2317 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2320 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2325 float32_unpack_canonical(&p
, a
, s
);
2326 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2329 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2334 float32_unpack_canonical(&p
, a
, s
);
2335 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2338 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2343 float64_unpack_canonical(&p
, a
, s
);
2344 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2347 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2352 float64_unpack_canonical(&p
, a
, s
);
2353 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2356 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2361 float64_unpack_canonical(&p
, a
, s
);
2362 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2365 int8_t float16_to_int8(float16 a
, float_status
*s
)
2367 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2370 int16_t float16_to_int16(float16 a
, float_status
*s
)
2372 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2375 int32_t float16_to_int32(float16 a
, float_status
*s
)
2377 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2380 int64_t float16_to_int64(float16 a
, float_status
*s
)
2382 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2385 int16_t float32_to_int16(float32 a
, float_status
*s
)
2387 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2390 int32_t float32_to_int32(float32 a
, float_status
*s
)
2392 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2395 int64_t float32_to_int64(float32 a
, float_status
*s
)
2397 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2400 int16_t float64_to_int16(float64 a
, float_status
*s
)
2402 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2405 int32_t float64_to_int32(float64 a
, float_status
*s
)
2407 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2410 int64_t float64_to_int64(float64 a
, float_status
*s
)
2412 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Conversions to signed integers with round-to-zero (truncation),
 * ignoring the rounding mode held in the float_status.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2461 * Returns the result of converting the floating-point value `a' to
2462 * the two's complement integer format.
/*
 * bfloat16 -> signed integer conversions.  The _scalbn variants unpack
 * to the canonical FloatParts64 form and round/pack with saturation at
 * the destination type's limits; the plain and _round_to_zero variants
 * delegate with scale 0 and the appropriate rounding mode.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2523 * Returns the result of converting the floating-point value `a' to
2524 * the unsigned integer format. The conversion is performed according
2525 * to the IEC/IEEE Standard for Binary Floating-Point
2526 * Arithmetic---which means in particular that the conversion is
2527 * rounded according to the current rounding mode. If `a' is a NaN,
2528 * the largest unsigned integer is returned. Otherwise, if the
2529 * conversion overflows, the largest unsigned integer is returned. If
2530 * `a' is negative, the result is rounded and zero is returned;
2531 * values that do not round to zero will raise the inexact exception
/*
 * Round the decomposed value IN to an integer (scaled by 2**scale) and
 * saturate into [0, max].  NaNs and +inf saturate at max (raising
 * invalid); -inf and rounded-negative values yield 0 (raising invalid
 * for nonzero magnitudes).  Any inexact flag raised by the rounding
 * step is suppressed when the result subsequently overflows, hence the
 * save/restore of the original exception flags.
 *
 * NOTE(review): several skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Remember the flags so an overflow can discard a spurious inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative after rounding: invalid, result clamps to 0. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            r = 0;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
/*
 * float16/32/64 -> unsigned integer conversions with explicit rounding
 * mode and exponent scale.  Each unpacks to canonical parts and
 * saturates at the destination type's maximum via
 * round_to_uint_and_pack().
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2670 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2672 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2675 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2677 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2680 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2682 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2685 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2687 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2690 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2692 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2695 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2697 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2700 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2702 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2705 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2707 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2710 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2712 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2715 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2717 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Conversions to unsigned integers with round-to-zero (truncation),
 * ignoring the rounding mode held in the float_status.
 */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2766 * Returns the result of converting the bfloat16 value `a' to
2767 * the unsigned integer format.
/*
 * bfloat16 -> unsigned integer conversions.  Mirrors the float16/32/64
 * families above: _scalbn variants take explicit rounding mode and
 * scale; the rest delegate with scale 0.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2828 * Integer to float conversions
2830 * Returns the result of converting the two's complement integer `a'
2831 * to the floating-point format. The conversion is performed according
2832 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Decompose a two's-complement integer (scaled by 2**scale) into
 * FloatParts64.  The result is exact; rounding happens later when the
 * parts are packed to a concrete format.  `status' is unused here.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* Negate via unsigned arithmetic; safe even for INT64_MIN. */
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        /* Bound the scale so exp cannot overflow its int32_t backing. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
/*
 * Signed integer -> float16 conversions.  All funnel through the
 * int64 _scalbn variant; narrower inputs are widened implicitly.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
/*
 * Signed integer -> float32 conversions, funneled through the int64
 * _scalbn variant.
 */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
/*
 * Signed integer -> float64 conversions, funneled through the int64
 * _scalbn variant.
 */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
2959 * Returns the result of converting the two's complement integer `a'
2960 * to the bfloat16 format.
/*
 * Signed integer -> bfloat16 conversions, funneled through the int64
 * _scalbn variant.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
2995 * Unsigned Integer to float conversions
2997 * Returns the result of converting the unsigned integer `a' to the
2998 * floating-point format. The conversion is performed according to the
2999 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Decompose an unsigned integer (scaled by 2**scale) into FloatParts64.
 * The result is exact; rounding happens when the parts are packed.
 * `status' is unused here.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Bound the scale so exp cannot overflow its int32_t backing. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
/*
 * Unsigned integer -> float16 conversions, funneled through the uint64
 * _scalbn variant.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
/*
 * Unsigned integer -> float32 conversions, funneled through the uint64
 * _scalbn variant.
 */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
/*
 * Unsigned integer -> float64 conversions, funneled through the uint64
 * _scalbn variant.
 */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3119 * Returns the result of converting the unsigned integer `a' to the
3119 * bfloat16 format.
/*
 * Unsigned integer -> bfloat16 conversions, funneled through the uint64
 * _scalbn variant.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3155 /* min() and max() functions. These can't be implemented as
3156 * 'compare and pick one input' because that would mishandle
3157 * NaNs and +0 vs -0.
3159 * minnum() and maxnum() functions. These are similar to the min()
3160 * and max() functions but if one of the arguments is a QNaN and
3161 * the other is numerical then the numerical argument is returned.
3162 * SNaNs will get quietened before being returned.
3163 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3164 * and maxNum() operations. min() and max() are the typical min/max
3165 * semantics provided by many CPUs which predate that specification.
3167 * minnummag() and maxnummag() functions correspond to minNumMag()
3168 * and maxNumMag() from the IEEE-754 2008.
/*
 * Common core for min/max/minnum/maxnum/minnummag/maxnummag.
 * ismin selects min vs max; ieee selects IEEE-754-2008 minNum/maxNum
 * quiet-NaN handling (a single QNaN loses to a number); ismag compares
 * by magnitude first, falling back to the signed ordering on ties.
 *
 * NOTE(review): the exponent-classification switches were lost in
 * extraction and have been reconstructed; verify against the canonical
 * file.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map zero/inf onto exponent extremes so one comparison works
         * across classes. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison first when requested and unequal. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the sign alone decides (handles +0/-0). */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/*
 * Expand a min/max entry point for a given float width.  The generated
 * function unpacks both operands, delegates to minmax_floats() and
 * repacks the result.  Fix: the third flag parameter was misspelled
 * `isiee'; renamed to `isieee' (internal to the macro, no caller impact).
 */
#define MINMAX(sz, name, ismin, isieee, ismag)                          \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}
/* Instantiate min/max variants for float16, float32 and float64:
 * (name, ismin, ieee, ismag). */
MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)
/*
 * Expand a bfloat16 min/max entry point; same shape as MINMAX above.
 * Fix: the second flag parameter was misspelled `isiee'; renamed to
 * `isieee' (internal to the macro, no caller impact).
 */
#define BF16_MINMAX(name, ismin, isieee, ismag)                         \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}
/* Instantiate min/max variants for bfloat16: (name, ismin, ieee, ismag). */
BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)
3294 /* Floating point compare */
/*
 * Totally order two decomposed values, returning a FloatRelation.
 * Unordered (NaN) results raise invalid unless is_quiet and both NaNs
 * are quiet.  Zeros compare equal regardless of sign.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* A signaling NaN always raises invalid, even for quiet compares. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            /* Both negative: larger magnitude is the smaller value. */
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ?
                float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}
/*
 * Expand a soft compare helper for a given float width: unpack both
 * operands and delegate to compare_floats().
 */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                               \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

/* Public float16 compare entry points (signaling and quiet). */
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
/*
 * float32 compare with a hardfloat fast path: use the host FPU's
 * ordered comparison macros for the common cases and fall back to the
 * softfloat implementation for unordered operands (to set flags
 * correctly) or when hardfloat is disabled.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
/*
 * float64 compare with a hardfloat fast path; mirrors f32_compare().
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
/*
 * bfloat16 compare: unpack both operands and delegate to
 * compare_floats().  No hardfloat fast path for this format.
 */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
3476 /* Multiply A by 2 raised to the power N. */
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        /* Quiet a signaling NaN (and possibly substitute the default NaN). */
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
/*
 * Public scalbn entry points: unpack, scale the exponent via
 * scalbn_decomposed(), and round/pack back to the target format.
 */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}
3533 * The old softfloat code did an approximation step before zeroing in
3534 * on the final result. However for simpleness we just compute the
3535 * square root by iterating down from the implicit bit to enough extra
3536 * bits to ensure we get a correctly rounded result.
3538 * This does mean however the calculation is slower than before,
3539 * especially for 64 bit floats.
/*
 * Square root of a decomposed value.  Special cases: NaN propagates
 * (quietened), sqrt(+-0) = +-0, sqrt(negative) raises invalid and
 * returns the default NaN, sqrt(+inf) = +inf.  For normals the root is
 * computed bit by bit down to enough extra bits for correct rounding.
 *
 * NOTE(review): several skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
/*
 * Format-specific sqrt wrappers: unpack, run the decomposed sqrt with
 * the format's parameters, and round/pack the result.
 */

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}
/*
 * float32 sqrt with a hardfloat fast path: use the host sqrtf() when
 * the status flags allow it and the input is a non-negative zero or
 * normal; otherwise fall back to the softfloat path.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
/*
 * float64 sqrt with a hardfloat fast path; mirrors float32_sqrt().
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3685 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3687 FloatParts64 pa
, pr
;
3689 bfloat16_unpack_canonical(&pa
, a
, status
);
3690 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3691 return bfloat16_round_pack_canonical(&pr
, status
);
3694 /*----------------------------------------------------------------------------
3695 | The pattern for a default generated NaN.
3696 *----------------------------------------------------------------------------*/
3698 float16
float16_default_nan(float_status
*status
)
3702 parts_default_nan(&p
, status
);
3703 p
.frac
>>= float16_params
.frac_shift
;
3704 return float16_pack_raw(&p
);
3707 float32
float32_default_nan(float_status
*status
)
3711 parts_default_nan(&p
, status
);
3712 p
.frac
>>= float32_params
.frac_shift
;
3713 return float32_pack_raw(&p
);
3716 float64
float64_default_nan(float_status
*status
)
3720 parts_default_nan(&p
, status
);
3721 p
.frac
>>= float64_params
.frac_shift
;
3722 return float64_pack_raw(&p
);
3725 float128
float128_default_nan(float_status
*status
)
3729 parts_default_nan(&p
, status
);
3730 frac_shr(&p
, float128_params
.frac_shift
);
3731 return float128_pack_raw(&p
);
3734 bfloat16
bfloat16_default_nan(float_status
*status
)
3738 parts_default_nan(&p
, status
);
3739 p
.frac
>>= bfloat16_params
.frac_shift
;
3740 return bfloat16_pack_raw(&p
);
3743 /*----------------------------------------------------------------------------
3744 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3745 *----------------------------------------------------------------------------*/
3747 float16
float16_silence_nan(float16 a
, float_status
*status
)
3751 float16_unpack_raw(&p
, a
);
3752 p
.frac
<<= float16_params
.frac_shift
;
3753 parts_silence_nan(&p
, status
);
3754 p
.frac
>>= float16_params
.frac_shift
;
3755 return float16_pack_raw(&p
);
3758 float32
float32_silence_nan(float32 a
, float_status
*status
)
3762 float32_unpack_raw(&p
, a
);
3763 p
.frac
<<= float32_params
.frac_shift
;
3764 parts_silence_nan(&p
, status
);
3765 p
.frac
>>= float32_params
.frac_shift
;
3766 return float32_pack_raw(&p
);
3769 float64
float64_silence_nan(float64 a
, float_status
*status
)
3773 float64_unpack_raw(&p
, a
);
3774 p
.frac
<<= float64_params
.frac_shift
;
3775 parts_silence_nan(&p
, status
);
3776 p
.frac
>>= float64_params
.frac_shift
;
3777 return float64_pack_raw(&p
);
3780 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3784 bfloat16_unpack_raw(&p
, a
);
3785 p
.frac
<<= bfloat16_params
.frac_shift
;
3786 parts_silence_nan(&p
, status
);
3787 p
.frac
>>= bfloat16_params
.frac_shift
;
3788 return bfloat16_pack_raw(&p
);
3791 float128
float128_silence_nan(float128 a
, float_status
*status
)
3795 float128_unpack_raw(&p
, a
);
3796 frac_shl(&p
, float128_params
.frac_shift
);
3797 parts_silence_nan(&p
, status
);
3798 frac_shr(&p
, float128_params
.frac_shift
);
3799 return float128_pack_raw(&p
);
3802 /*----------------------------------------------------------------------------
3803 | If `a' is denormal and we are in flush-to-zero mode then set the
3804 | input-denormal exception and return zero. Otherwise just return the value.
3805 *----------------------------------------------------------------------------*/
3807 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3809 if (p
.exp
== 0 && p
.frac
!= 0) {
3810 float_raise(float_flag_input_denormal
, status
);
3817 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3819 if (status
->flush_inputs_to_zero
) {
3822 float16_unpack_raw(&p
, a
);
3823 if (parts_squash_denormal(p
, status
)) {
3824 return float16_set_sign(float16_zero
, p
.sign
);
3830 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3832 if (status
->flush_inputs_to_zero
) {
3835 float32_unpack_raw(&p
, a
);
3836 if (parts_squash_denormal(p
, status
)) {
3837 return float32_set_sign(float32_zero
, p
.sign
);
3843 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3845 if (status
->flush_inputs_to_zero
) {
3848 float64_unpack_raw(&p
, a
);
3849 if (parts_squash_denormal(p
, status
)) {
3850 return float64_set_sign(float64_zero
, p
.sign
);
3856 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3858 if (status
->flush_inputs_to_zero
) {
3861 bfloat16_unpack_raw(&p
, a
);
3862 if (parts_squash_denormal(p
, status
)) {
3863 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3869 /*----------------------------------------------------------------------------
3870 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3871 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3872 | input. If `zSign' is 1, the input is negated before being converted to an
3873 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3874 | is simply rounded to an integer, with the inexact exception raised if the
3875 | input cannot be represented exactly as an integer. However, if the fixed-
3876 | point input is too large, the invalid exception is raised and the largest
3877 | positive or negative integer is returned.
3878 *----------------------------------------------------------------------------*/
3880 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3881 float_status
*status
)
3883 int8_t roundingMode
;
3884 bool roundNearestEven
;
3885 int8_t roundIncrement
, roundBits
;
3888 roundingMode
= status
->float_rounding_mode
;
3889 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3890 switch (roundingMode
) {
3891 case float_round_nearest_even
:
3892 case float_round_ties_away
:
3893 roundIncrement
= 0x40;
3895 case float_round_to_zero
:
3898 case float_round_up
:
3899 roundIncrement
= zSign
? 0 : 0x7f;
3901 case float_round_down
:
3902 roundIncrement
= zSign
? 0x7f : 0;
3904 case float_round_to_odd
:
3905 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3910 roundBits
= absZ
& 0x7F;
3911 absZ
= ( absZ
+ roundIncrement
)>>7;
3912 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3916 if ( zSign
) z
= - z
;
3917 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3918 float_raise(float_flag_invalid
, status
);
3919 return zSign
? INT32_MIN
: INT32_MAX
;
3922 float_raise(float_flag_inexact
, status
);
3928 /*----------------------------------------------------------------------------
3929 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3930 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3931 | and returns the properly rounded 64-bit integer corresponding to the input.
3932 | If `zSign' is 1, the input is negated before being converted to an integer.
3933 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3934 | the inexact exception raised if the input cannot be represented exactly as
3935 | an integer. However, if the fixed-point input is too large, the invalid
3936 | exception is raised and the largest positive or negative integer is
3938 *----------------------------------------------------------------------------*/
3940 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3941 float_status
*status
)
3943 int8_t roundingMode
;
3944 bool roundNearestEven
, increment
;
3947 roundingMode
= status
->float_rounding_mode
;
3948 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3949 switch (roundingMode
) {
3950 case float_round_nearest_even
:
3951 case float_round_ties_away
:
3952 increment
= ((int64_t) absZ1
< 0);
3954 case float_round_to_zero
:
3957 case float_round_up
:
3958 increment
= !zSign
&& absZ1
;
3960 case float_round_down
:
3961 increment
= zSign
&& absZ1
;
3963 case float_round_to_odd
:
3964 increment
= !(absZ0
& 1) && absZ1
;
3971 if ( absZ0
== 0 ) goto overflow
;
3972 if (!(absZ1
<< 1) && roundNearestEven
) {
3977 if ( zSign
) z
= - z
;
3978 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3980 float_raise(float_flag_invalid
, status
);
3981 return zSign
? INT64_MIN
: INT64_MAX
;
3984 float_raise(float_flag_inexact
, status
);
3990 /*----------------------------------------------------------------------------
3991 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3992 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3993 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3994 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3995 | with the inexact exception raised if the input cannot be represented exactly
3996 | as an integer. However, if the fixed-point input is too large, the invalid
3997 | exception is raised and the largest unsigned integer is returned.
3998 *----------------------------------------------------------------------------*/
4000 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
4001 uint64_t absZ1
, float_status
*status
)
4003 int8_t roundingMode
;
4004 bool roundNearestEven
, increment
;
4006 roundingMode
= status
->float_rounding_mode
;
4007 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
4008 switch (roundingMode
) {
4009 case float_round_nearest_even
:
4010 case float_round_ties_away
:
4011 increment
= ((int64_t)absZ1
< 0);
4013 case float_round_to_zero
:
4016 case float_round_up
:
4017 increment
= !zSign
&& absZ1
;
4019 case float_round_down
:
4020 increment
= zSign
&& absZ1
;
4022 case float_round_to_odd
:
4023 increment
= !(absZ0
& 1) && absZ1
;
4031 float_raise(float_flag_invalid
, status
);
4034 if (!(absZ1
<< 1) && roundNearestEven
) {
4039 if (zSign
&& absZ0
) {
4040 float_raise(float_flag_invalid
, status
);
4045 float_raise(float_flag_inexact
, status
);
4050 /*----------------------------------------------------------------------------
4051 | Normalizes the subnormal single-precision floating-point value represented
4052 | by the denormalized significand `aSig'. The normalized exponent and
4053 | significand are stored at the locations pointed to by `zExpPtr' and
4054 | `zSigPtr', respectively.
4055 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal single-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 */
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 32 - (23 fraction bits + 1 hidden bit). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4068 /*----------------------------------------------------------------------------
4069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4070 | and significand `zSig', and returns the proper single-precision floating-
4071 | point value corresponding to the abstract input. Ordinarily, the abstract
4072 | value is simply rounded and packed into the single-precision format, with
4073 | the inexact exception raised if the abstract input cannot be represented
4074 | exactly. However, if the abstract value is too large, the overflow and
4075 | inexact exceptions are raised and an infinity or maximal finite value is
4076 | returned. If the abstract value is too small, the input value is rounded to
4077 | a subnormal number, and the underflow and inexact exceptions are raised if
4078 | the abstract input cannot be represented exactly as a subnormal single-
4079 | precision floating-point number.
4080 | The input significand `zSig' has its binary point between bits 30
4081 | and 29, which is 7 bits to the left of the usual location. This shifted
4082 | significand must be normalized or smaller. If `zSig' is not normalized,
4083 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4084 | and it must not require rounding. In the usual case that `zSig' is
4085 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4086 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4087 | Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4090 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4091 float_status
*status
)
4093 int8_t roundingMode
;
4094 bool roundNearestEven
;
4095 int8_t roundIncrement
, roundBits
;
4098 roundingMode
= status
->float_rounding_mode
;
4099 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4100 switch (roundingMode
) {
4101 case float_round_nearest_even
:
4102 case float_round_ties_away
:
4103 roundIncrement
= 0x40;
4105 case float_round_to_zero
:
4108 case float_round_up
:
4109 roundIncrement
= zSign
? 0 : 0x7f;
4111 case float_round_down
:
4112 roundIncrement
= zSign
? 0x7f : 0;
4114 case float_round_to_odd
:
4115 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4121 roundBits
= zSig
& 0x7F;
4122 if ( 0xFD <= (uint16_t) zExp
) {
4123 if ( ( 0xFD < zExp
)
4124 || ( ( zExp
== 0xFD )
4125 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4127 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4128 roundIncrement
!= 0;
4129 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4130 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4133 if (status
->flush_to_zero
) {
4134 float_raise(float_flag_output_denormal
, status
);
4135 return packFloat32(zSign
, 0, 0);
4137 isTiny
= status
->tininess_before_rounding
4139 || (zSig
+ roundIncrement
< 0x80000000);
4140 shift32RightJamming( zSig
, - zExp
, &zSig
);
4142 roundBits
= zSig
& 0x7F;
4143 if (isTiny
&& roundBits
) {
4144 float_raise(float_flag_underflow
, status
);
4146 if (roundingMode
== float_round_to_odd
) {
4148 * For round-to-odd case, the roundIncrement depends on
4149 * zSig which just changed.
4151 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4156 float_raise(float_flag_inexact
, status
);
4158 zSig
= ( zSig
+ roundIncrement
)>>7;
4159 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4162 if ( zSig
== 0 ) zExp
= 0;
4163 return packFloat32( zSign
, zExp
, zSig
);
4167 /*----------------------------------------------------------------------------
4168 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4169 | and significand `zSig', and returns the proper single-precision floating-
4170 | point value corresponding to the abstract input. This routine is just like
4171 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4172 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4173 | floating-point exponent.
4174 *----------------------------------------------------------------------------*/
4177 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4178 float_status
*status
)
4182 shiftCount
= clz32(zSig
) - 1;
4183 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4188 /*----------------------------------------------------------------------------
4189 | Normalizes the subnormal double-precision floating-point value represented
4190 | by the denormalized significand `aSig'. The normalized exponent and
4191 | significand are stored at the locations pointed to by `zExpPtr' and
4192 | `zSigPtr', respectively.
4193 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal double-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 */
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 64 - (52 fraction bits + 1 hidden bit). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4206 /*----------------------------------------------------------------------------
4207 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4208 | double-precision floating-point value, returning the result. After being
4209 | shifted into the proper positions, the three fields are simply added
4210 | together to form the result. This means that any integer portion of `zSig'
4211 | will be added into the exponent. Since a properly normalized significand
4212 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4213 | than the desired result exponent whenever `zSig' is a complete, normalized
4215 *----------------------------------------------------------------------------*/
4217 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4220 return make_float64(
4221 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4225 /*----------------------------------------------------------------------------
4226 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4227 | and significand `zSig', and returns the proper double-precision floating-
4228 | point value corresponding to the abstract input. Ordinarily, the abstract
4229 | value is simply rounded and packed into the double-precision format, with
4230 | the inexact exception raised if the abstract input cannot be represented
4231 | exactly. However, if the abstract value is too large, the overflow and
4232 | inexact exceptions are raised and an infinity or maximal finite value is
4233 | returned. If the abstract value is too small, the input value is rounded to
4234 | a subnormal number, and the underflow and inexact exceptions are raised if
4235 | the abstract input cannot be represented exactly as a subnormal double-
4236 | precision floating-point number.
4237 | The input significand `zSig' has its binary point between bits 62
4238 | and 61, which is 10 bits to the left of the usual location. This shifted
4239 | significand must be normalized or smaller. If `zSig' is not normalized,
4240 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4241 | and it must not require rounding. In the usual case that `zSig' is
4242 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4243 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4244 | Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4247 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4248 float_status
*status
)
4250 int8_t roundingMode
;
4251 bool roundNearestEven
;
4252 int roundIncrement
, roundBits
;
4255 roundingMode
= status
->float_rounding_mode
;
4256 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4257 switch (roundingMode
) {
4258 case float_round_nearest_even
:
4259 case float_round_ties_away
:
4260 roundIncrement
= 0x200;
4262 case float_round_to_zero
:
4265 case float_round_up
:
4266 roundIncrement
= zSign
? 0 : 0x3ff;
4268 case float_round_down
:
4269 roundIncrement
= zSign
? 0x3ff : 0;
4271 case float_round_to_odd
:
4272 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4277 roundBits
= zSig
& 0x3FF;
4278 if ( 0x7FD <= (uint16_t) zExp
) {
4279 if ( ( 0x7FD < zExp
)
4280 || ( ( zExp
== 0x7FD )
4281 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4283 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4284 roundIncrement
!= 0;
4285 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4286 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4289 if (status
->flush_to_zero
) {
4290 float_raise(float_flag_output_denormal
, status
);
4291 return packFloat64(zSign
, 0, 0);
4293 isTiny
= status
->tininess_before_rounding
4295 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4296 shift64RightJamming( zSig
, - zExp
, &zSig
);
4298 roundBits
= zSig
& 0x3FF;
4299 if (isTiny
&& roundBits
) {
4300 float_raise(float_flag_underflow
, status
);
4302 if (roundingMode
== float_round_to_odd
) {
4304 * For round-to-odd case, the roundIncrement depends on
4305 * zSig which just changed.
4307 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4312 float_raise(float_flag_inexact
, status
);
4314 zSig
= ( zSig
+ roundIncrement
)>>10;
4315 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4318 if ( zSig
== 0 ) zExp
= 0;
4319 return packFloat64( zSign
, zExp
, zSig
);
4323 /*----------------------------------------------------------------------------
4324 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4325 | and significand `zSig', and returns the proper double-precision floating-
4326 | point value corresponding to the abstract input. This routine is just like
4327 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4328 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4329 | floating-point exponent.
4330 *----------------------------------------------------------------------------*/
4333 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4334 float_status
*status
)
4338 shiftCount
= clz64(zSig
) - 1;
4339 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4344 /*----------------------------------------------------------------------------
4345 | Normalizes the subnormal extended double-precision floating-point value
4346 | represented by the denormalized significand `aSig'. The normalized exponent
4347 | and significand are stored at the locations pointed to by `zExpPtr' and
4348 | `zSigPtr', respectively.
4349 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal extended-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 * floatx80 has an explicit integer bit, so the full clz is used.
 */
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz64(aSig);
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4361 /*----------------------------------------------------------------------------
4362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4363 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4364 | and returns the proper extended double-precision floating-point value
4365 | corresponding to the abstract input. Ordinarily, the abstract value is
4366 | rounded and packed into the extended double-precision format, with the
4367 | inexact exception raised if the abstract input cannot be represented
4368 | exactly. However, if the abstract value is too large, the overflow and
4369 | inexact exceptions are raised and an infinity or maximal finite value is
4370 | returned. If the abstract value is too small, the input value is rounded to
4371 | a subnormal number, and the underflow and inexact exceptions are raised if
4372 | the abstract input cannot be represented exactly as a subnormal extended
4373 | double-precision floating-point number.
4374 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4375 | number of bits as single or double precision, respectively. Otherwise, the
4376 | result is rounded to the full precision of the extended double-precision
4378 | The input significand must be normalized or smaller. If the input
4379 | significand is not normalized, `zExp' must be 0; in that case, the result
4380 | returned is a subnormal number, and it must not require rounding. The
4381 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4382 | Floating-Point Arithmetic.
4383 *----------------------------------------------------------------------------*/
4385 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4386 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4387 float_status
*status
)
4389 int8_t roundingMode
;
4390 bool roundNearestEven
, increment
, isTiny
;
4391 int64_t roundIncrement
, roundMask
, roundBits
;
4393 roundingMode
= status
->float_rounding_mode
;
4394 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4395 if ( roundingPrecision
== 80 ) goto precision80
;
4396 if ( roundingPrecision
== 64 ) {
4397 roundIncrement
= UINT64_C(0x0000000000000400);
4398 roundMask
= UINT64_C(0x00000000000007FF);
4400 else if ( roundingPrecision
== 32 ) {
4401 roundIncrement
= UINT64_C(0x0000008000000000);
4402 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4407 zSig0
|= ( zSig1
!= 0 );
4408 switch (roundingMode
) {
4409 case float_round_nearest_even
:
4410 case float_round_ties_away
:
4412 case float_round_to_zero
:
4415 case float_round_up
:
4416 roundIncrement
= zSign
? 0 : roundMask
;
4418 case float_round_down
:
4419 roundIncrement
= zSign
? roundMask
: 0;
4424 roundBits
= zSig0
& roundMask
;
4425 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4426 if ( ( 0x7FFE < zExp
)
4427 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4432 if (status
->flush_to_zero
) {
4433 float_raise(float_flag_output_denormal
, status
);
4434 return packFloatx80(zSign
, 0, 0);
4436 isTiny
= status
->tininess_before_rounding
4438 || (zSig0
<= zSig0
+ roundIncrement
);
4439 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4441 roundBits
= zSig0
& roundMask
;
4442 if (isTiny
&& roundBits
) {
4443 float_raise(float_flag_underflow
, status
);
4446 float_raise(float_flag_inexact
, status
);
4448 zSig0
+= roundIncrement
;
4449 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4450 roundIncrement
= roundMask
+ 1;
4451 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4452 roundMask
|= roundIncrement
;
4454 zSig0
&= ~ roundMask
;
4455 return packFloatx80( zSign
, zExp
, zSig0
);
4459 float_raise(float_flag_inexact
, status
);
4461 zSig0
+= roundIncrement
;
4462 if ( zSig0
< roundIncrement
) {
4464 zSig0
= UINT64_C(0x8000000000000000);
4466 roundIncrement
= roundMask
+ 1;
4467 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4468 roundMask
|= roundIncrement
;
4470 zSig0
&= ~ roundMask
;
4471 if ( zSig0
== 0 ) zExp
= 0;
4472 return packFloatx80( zSign
, zExp
, zSig0
);
4474 switch (roundingMode
) {
4475 case float_round_nearest_even
:
4476 case float_round_ties_away
:
4477 increment
= ((int64_t)zSig1
< 0);
4479 case float_round_to_zero
:
4482 case float_round_up
:
4483 increment
= !zSign
&& zSig1
;
4485 case float_round_down
:
4486 increment
= zSign
&& zSig1
;
4491 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4492 if ( ( 0x7FFE < zExp
)
4493 || ( ( zExp
== 0x7FFE )
4494 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4500 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4501 if ( ( roundingMode
== float_round_to_zero
)
4502 || ( zSign
&& ( roundingMode
== float_round_up
) )
4503 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4505 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4507 return packFloatx80(zSign
,
4508 floatx80_infinity_high
,
4509 floatx80_infinity_low
);
4512 isTiny
= status
->tininess_before_rounding
4515 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4516 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4518 if (isTiny
&& zSig1
) {
4519 float_raise(float_flag_underflow
, status
);
4522 float_raise(float_flag_inexact
, status
);
4524 switch (roundingMode
) {
4525 case float_round_nearest_even
:
4526 case float_round_ties_away
:
4527 increment
= ((int64_t)zSig1
< 0);
4529 case float_round_to_zero
:
4532 case float_round_up
:
4533 increment
= !zSign
&& zSig1
;
4535 case float_round_down
:
4536 increment
= zSign
&& zSig1
;
4543 if (!(zSig1
<< 1) && roundNearestEven
) {
4546 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4548 return packFloatx80( zSign
, zExp
, zSig0
);
4552 float_raise(float_flag_inexact
, status
);
4558 zSig0
= UINT64_C(0x8000000000000000);
4561 if (!(zSig1
<< 1) && roundNearestEven
) {
4567 if ( zSig0
== 0 ) zExp
= 0;
4569 return packFloatx80( zSign
, zExp
, zSig0
);
4573 /*----------------------------------------------------------------------------
4574 | Takes an abstract floating-point value having sign `zSign', exponent
4575 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4576 | and returns the proper extended double-precision floating-point value
4577 | corresponding to the abstract input. This routine is just like
4578 | `roundAndPackFloatx80' except that the input significand does not have to be
4580 *----------------------------------------------------------------------------*/
4582 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4583 bool zSign
, int32_t zExp
,
4584 uint64_t zSig0
, uint64_t zSig1
,
4585 float_status
*status
)
4594 shiftCount
= clz64(zSig0
);
4595 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4597 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4598 zSig0
, zSig1
, status
);
4602 /*----------------------------------------------------------------------------
4603 | Returns the least-significant 64 fraction bits of the quadruple-precision
4604 | floating-point value `a'.
4605 *----------------------------------------------------------------------------*/
4607 static inline uint64_t extractFloat128Frac1( float128 a
)
4614 /*----------------------------------------------------------------------------
4615 | Returns the most-significant 48 fraction bits of the quadruple-precision
4616 | floating-point value `a'.
4617 *----------------------------------------------------------------------------*/
4619 static inline uint64_t extractFloat128Frac0( float128 a
)
4622 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4626 /*----------------------------------------------------------------------------
4627 | Returns the exponent bits of the quadruple-precision floating-point value
4629 *----------------------------------------------------------------------------*/
4631 static inline int32_t extractFloat128Exp( float128 a
)
4634 return ( a
.high
>>48 ) & 0x7FFF;
4638 /*----------------------------------------------------------------------------
4639 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
4642 static inline bool extractFloat128Sign(float128 a
)
4644 return a
.high
>> 63;
4647 /*----------------------------------------------------------------------------
4648 | Normalizes the subnormal quadruple-precision floating-point value
4649 | represented by the denormalized significand formed by the concatenation of
4650 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4651 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4652 | significand are stored at the location pointed to by `zSig0Ptr', and the
4653 | least significant 64 bits of the normalized significand are stored at the
4654 | location pointed to by `zSig1Ptr'.
4655 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal quad-precision significand `aSig0:aSig1',
 * writing the normalized significand halves and the exponent through
 * the output pointers.
 */
static void
 normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                            int32_t *zExpPtr, uint64_t *zSig0Ptr,
                            uint64_t *zSig1Ptr)
{
    int8_t shiftCount;

    if (aSig0 == 0) {
        /* All significance is in the low word; 15 = 64 - 49. */
        shiftCount = clz64(aSig1) - 15;
        if (shiftCount < 0) {
            *zSig0Ptr = aSig1 >> (-shiftCount);
            *zSig1Ptr = aSig1 << (shiftCount & 63);
        } else {
            *zSig0Ptr = aSig1 << shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = -shiftCount - 63;
    } else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
    }
}
4688 /*----------------------------------------------------------------------------
4689 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4690 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4691 | floating-point value, returning the result. After being shifted into the
4692 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4693 | added together to form the most significant 32 bits of the result. This
4694 | means that any integer portion of `zSig0' will be added into the exponent.
4695 | Since a properly normalized significand will have an integer portion equal
4696 | to 1, the `zExp' input should be 1 less than the desired result exponent
4697 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4699 *----------------------------------------------------------------------------*/
4701 static inline float128
4702 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4707 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4711 /*----------------------------------------------------------------------------
4712 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4713 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4714 | and `zSig2', and returns the proper quadruple-precision floating-point value
4715 | corresponding to the abstract input. Ordinarily, the abstract value is
4716 | simply rounded and packed into the quadruple-precision format, with the
4717 | inexact exception raised if the abstract input cannot be represented
4718 | exactly. However, if the abstract value is too large, the overflow and
4719 | inexact exceptions are raised and an infinity or maximal finite value is
4720 | returned. If the abstract value is too small, the input value is rounded to
4721 | a subnormal number, and the underflow and inexact exceptions are raised if
4722 | the abstract input cannot be represented exactly as a subnormal quadruple-
4723 | precision floating-point number.
4724 | The input significand must be normalized or smaller. If the input
4725 | significand is not normalized, `zExp' must be 0; in that case, the result
4726 | returned is a subnormal number, and it must not require rounding. In the
4727 | usual case that the input significand is normalized, `zExp' must be 1 less
4728 | than the ``true'' floating-point exponent. The handling of underflow and
4729 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4730 *----------------------------------------------------------------------------*/
4732 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4733 uint64_t zSig0
, uint64_t zSig1
,
4734 uint64_t zSig2
, float_status
*status
)
4736 int8_t roundingMode
;
4737 bool roundNearestEven
, increment
, isTiny
;
4739 roundingMode
= status
->float_rounding_mode
;
4740 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4741 switch (roundingMode
) {
4742 case float_round_nearest_even
:
4743 case float_round_ties_away
:
4744 increment
= ((int64_t)zSig2
< 0);
4746 case float_round_to_zero
:
4749 case float_round_up
:
4750 increment
= !zSign
&& zSig2
;
4752 case float_round_down
:
4753 increment
= zSign
&& zSig2
;
4755 case float_round_to_odd
:
4756 increment
= !(zSig1
& 0x1) && zSig2
;
4761 if ( 0x7FFD <= (uint32_t) zExp
) {
4762 if ( ( 0x7FFD < zExp
)
4763 || ( ( zExp
== 0x7FFD )
4765 UINT64_C(0x0001FFFFFFFFFFFF),
4766 UINT64_C(0xFFFFFFFFFFFFFFFF),
4773 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4774 if ( ( roundingMode
== float_round_to_zero
)
4775 || ( zSign
&& ( roundingMode
== float_round_up
) )
4776 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4777 || (roundingMode
== float_round_to_odd
)
4783 UINT64_C(0x0000FFFFFFFFFFFF),
4784 UINT64_C(0xFFFFFFFFFFFFFFFF)
4787 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4790 if (status
->flush_to_zero
) {
4791 float_raise(float_flag_output_denormal
, status
);
4792 return packFloat128(zSign
, 0, 0, 0);
4794 isTiny
= status
->tininess_before_rounding
4797 || lt128(zSig0
, zSig1
,
4798 UINT64_C(0x0001FFFFFFFFFFFF),
4799 UINT64_C(0xFFFFFFFFFFFFFFFF));
4800 shift128ExtraRightJamming(
4801 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4803 if (isTiny
&& zSig2
) {
4804 float_raise(float_flag_underflow
, status
);
4806 switch (roundingMode
) {
4807 case float_round_nearest_even
:
4808 case float_round_ties_away
:
4809 increment
= ((int64_t)zSig2
< 0);
4811 case float_round_to_zero
:
4814 case float_round_up
:
4815 increment
= !zSign
&& zSig2
;
4817 case float_round_down
:
4818 increment
= zSign
&& zSig2
;
4820 case float_round_to_odd
:
4821 increment
= !(zSig1
& 0x1) && zSig2
;
4829 float_raise(float_flag_inexact
, status
);
4832 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4833 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4838 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4840 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4844 /*----------------------------------------------------------------------------
4845 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4846 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4847 | returns the proper quadruple-precision floating-point value corresponding
4848 | to the abstract input. This routine is just like `roundAndPackFloat128'
4849 | except that the input significand has fewer bits and does not have to be
4850 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4852 *----------------------------------------------------------------------------*/
4854 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4855 uint64_t zSig0
, uint64_t zSig1
,
4856 float_status
*status
)
4866 shiftCount
= clz64(zSig0
) - 15;
4867 if ( 0 <= shiftCount
) {
4869 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4872 shift128ExtraRightJamming(
4873 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4876 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the 32-bit two's complement integer `a'
4883 | to the extended double-precision floating-point format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4886 *----------------------------------------------------------------------------*/
4888 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4895 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4897 absA
= zSign
? - a
: a
;
4898 shiftCount
= clz32(absA
) + 32;
4900 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4904 /*----------------------------------------------------------------------------
4905 | Returns the result of converting the 32-bit two's complement integer `a' to
4906 | the quadruple-precision floating-point format. The conversion is performed
4907 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4908 *----------------------------------------------------------------------------*/
4910 float128
int32_to_float128(int32_t a
, float_status
*status
)
4917 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4919 absA
= zSign
? - a
: a
;
4920 shiftCount
= clz32(absA
) + 17;
4922 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4926 /*----------------------------------------------------------------------------
4927 | Returns the result of converting the 64-bit two's complement integer `a'
4928 | to the extended double-precision floating-point format. The conversion
4929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4931 *----------------------------------------------------------------------------*/
4933 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4939 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4941 absA
= zSign
? - a
: a
;
4942 shiftCount
= clz64(absA
);
4943 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4947 /*----------------------------------------------------------------------------
4948 | Returns the result of converting the 64-bit two's complement integer `a' to
4949 | the quadruple-precision floating-point format. The conversion is performed
4950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4951 *----------------------------------------------------------------------------*/
4953 float128
int64_to_float128(int64_t a
, float_status
*status
)
4959 uint64_t zSig0
, zSig1
;
4961 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4963 absA
= zSign
? - a
: a
;
4964 shiftCount
= clz64(absA
) + 49;
4965 zExp
= 0x406E - shiftCount
;
4966 if ( 64 <= shiftCount
) {
4975 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4976 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4980 /*----------------------------------------------------------------------------
4981 | Returns the result of converting the 64-bit unsigned integer `a'
4982 | to the quadruple-precision floating-point format. The conversion is performed
4983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4984 *----------------------------------------------------------------------------*/
4986 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4989 return float128_zero
;
4991 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4994 /*----------------------------------------------------------------------------
4995 | Returns the result of converting the single-precision floating-point value
4996 | `a' to the extended double-precision floating-point format. The conversion
4997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4999 *----------------------------------------------------------------------------*/
5001 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5007 a
= float32_squash_input_denormal(a
, status
);
5008 aSig
= extractFloat32Frac( a
);
5009 aExp
= extractFloat32Exp( a
);
5010 aSign
= extractFloat32Sign( a
);
5011 if ( aExp
== 0xFF ) {
5013 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5015 return floatx80_silence_nan(res
, status
);
5017 return packFloatx80(aSign
,
5018 floatx80_infinity_high
,
5019 floatx80_infinity_low
);
5022 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5023 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5026 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the single-precision floating-point value
5032 | `a' to the double-precision floating-point format. The conversion is
5033 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5035 *----------------------------------------------------------------------------*/
5037 float128
float32_to_float128(float32 a
, float_status
*status
)
5043 a
= float32_squash_input_denormal(a
, status
);
5044 aSig
= extractFloat32Frac( a
);
5045 aExp
= extractFloat32Exp( a
);
5046 aSign
= extractFloat32Sign( a
);
5047 if ( aExp
== 0xFF ) {
5049 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5051 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5054 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5055 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5058 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5062 /*----------------------------------------------------------------------------
5063 | Returns the remainder of the single-precision floating-point value `a'
5064 | with respect to the corresponding value `b'. The operation is performed
5065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5066 *----------------------------------------------------------------------------*/
5068 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5071 int aExp
, bExp
, expDiff
;
5072 uint32_t aSig
, bSig
;
5074 uint64_t aSig64
, bSig64
, q64
;
5075 uint32_t alternateASig
;
5077 a
= float32_squash_input_denormal(a
, status
);
5078 b
= float32_squash_input_denormal(b
, status
);
5080 aSig
= extractFloat32Frac( a
);
5081 aExp
= extractFloat32Exp( a
);
5082 aSign
= extractFloat32Sign( a
);
5083 bSig
= extractFloat32Frac( b
);
5084 bExp
= extractFloat32Exp( b
);
5085 if ( aExp
== 0xFF ) {
5086 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5087 return propagateFloat32NaN(a
, b
, status
);
5089 float_raise(float_flag_invalid
, status
);
5090 return float32_default_nan(status
);
5092 if ( bExp
== 0xFF ) {
5094 return propagateFloat32NaN(a
, b
, status
);
5100 float_raise(float_flag_invalid
, status
);
5101 return float32_default_nan(status
);
5103 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5106 if ( aSig
== 0 ) return a
;
5107 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5109 expDiff
= aExp
- bExp
;
5112 if ( expDiff
< 32 ) {
5115 if ( expDiff
< 0 ) {
5116 if ( expDiff
< -1 ) return a
;
5119 q
= ( bSig
<= aSig
);
5120 if ( q
) aSig
-= bSig
;
5121 if ( 0 < expDiff
) {
5122 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5125 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5133 if ( bSig
<= aSig
) aSig
-= bSig
;
5134 aSig64
= ( (uint64_t) aSig
)<<40;
5135 bSig64
= ( (uint64_t) bSig
)<<40;
5137 while ( 0 < expDiff
) {
5138 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5139 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5140 aSig64
= - ( ( bSig
* q64
)<<38 );
5144 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5145 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5146 q
= q64
>>( 64 - expDiff
);
5148 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5151 alternateASig
= aSig
;
5154 } while ( 0 <= (int32_t) aSig
);
5155 sigMean
= aSig
+ alternateASig
;
5156 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5157 aSig
= alternateASig
;
5159 zSign
= ( (int32_t) aSig
< 0 );
5160 if ( zSign
) aSig
= - aSig
;
5161 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5166 /*----------------------------------------------------------------------------
5167 | Returns the binary exponential of the single-precision floating-point value
5168 | `a'. The operation is performed according to the IEC/IEEE Standard for
5169 | Binary Floating-Point Arithmetic.
5171 | Uses the following identities:
5173 | 1. -------------------------------------------------------------------------
5177 | 2. -------------------------------------------------------------------------
5180 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5182 *----------------------------------------------------------------------------*/
5184 static const float64 float32_exp2_coefficients
[15] =
5186 const_float64( 0x3ff0000000000000ll
), /* 1 */
5187 const_float64( 0x3fe0000000000000ll
), /* 2 */
5188 const_float64( 0x3fc5555555555555ll
), /* 3 */
5189 const_float64( 0x3fa5555555555555ll
), /* 4 */
5190 const_float64( 0x3f81111111111111ll
), /* 5 */
5191 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5192 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5193 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5194 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5195 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5196 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5197 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5198 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5199 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5200 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5203 float32
float32_exp2(float32 a
, float_status
*status
)
5210 a
= float32_squash_input_denormal(a
, status
);
5212 aSig
= extractFloat32Frac( a
);
5213 aExp
= extractFloat32Exp( a
);
5214 aSign
= extractFloat32Sign( a
);
5216 if ( aExp
== 0xFF) {
5218 return propagateFloat32NaN(a
, float32_zero
, status
);
5220 return (aSign
) ? float32_zero
: a
;
5223 if (aSig
== 0) return float32_one
;
5226 float_raise(float_flag_inexact
, status
);
5228 /* ******************************* */
5229 /* using float64 for approximation */
5230 /* ******************************* */
5231 x
= float32_to_float64(a
, status
);
5232 x
= float64_mul(x
, float64_ln2
, status
);
5236 for (i
= 0 ; i
< 15 ; i
++) {
5239 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5240 r
= float64_add(r
, f
, status
);
5242 xn
= float64_mul(xn
, x
, status
);
5245 return float64_to_float32(r
, status
);
5248 /*----------------------------------------------------------------------------
5249 | Returns the binary log of the single-precision floating-point value `a'.
5250 | The operation is performed according to the IEC/IEEE Standard for Binary
5251 | Floating-Point Arithmetic.
5252 *----------------------------------------------------------------------------*/
5253 float32
float32_log2(float32 a
, float_status
*status
)
5257 uint32_t aSig
, zSig
, i
;
5259 a
= float32_squash_input_denormal(a
, status
);
5260 aSig
= extractFloat32Frac( a
);
5261 aExp
= extractFloat32Exp( a
);
5262 aSign
= extractFloat32Sign( a
);
5265 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5266 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5269 float_raise(float_flag_invalid
, status
);
5270 return float32_default_nan(status
);
5272 if ( aExp
== 0xFF ) {
5274 return propagateFloat32NaN(a
, float32_zero
, status
);
5284 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5285 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5286 if ( aSig
& 0x01000000 ) {
5295 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of converting the double-precision floating-point value
5300 | `a' to the extended double-precision floating-point format. The conversion
5301 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5303 *----------------------------------------------------------------------------*/
5305 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5311 a
= float64_squash_input_denormal(a
, status
);
5312 aSig
= extractFloat64Frac( a
);
5313 aExp
= extractFloat64Exp( a
);
5314 aSign
= extractFloat64Sign( a
);
5315 if ( aExp
== 0x7FF ) {
5317 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5319 return floatx80_silence_nan(res
, status
);
5321 return packFloatx80(aSign
,
5322 floatx80_infinity_high
,
5323 floatx80_infinity_low
);
5326 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5327 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5331 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5335 /*----------------------------------------------------------------------------
5336 | Returns the result of converting the double-precision floating-point value
5337 | `a' to the quadruple-precision floating-point format. The conversion is
5338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5340 *----------------------------------------------------------------------------*/
5342 float128
float64_to_float128(float64 a
, float_status
*status
)
5346 uint64_t aSig
, zSig0
, zSig1
;
5348 a
= float64_squash_input_denormal(a
, status
);
5349 aSig
= extractFloat64Frac( a
);
5350 aExp
= extractFloat64Exp( a
);
5351 aSign
= extractFloat64Sign( a
);
5352 if ( aExp
== 0x7FF ) {
5354 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5356 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5359 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5360 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5363 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5364 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5369 /*----------------------------------------------------------------------------
5370 | Returns the remainder of the double-precision floating-point value `a'
5371 | with respect to the corresponding value `b'. The operation is performed
5372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5373 *----------------------------------------------------------------------------*/
5375 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5378 int aExp
, bExp
, expDiff
;
5379 uint64_t aSig
, bSig
;
5380 uint64_t q
, alternateASig
;
5383 a
= float64_squash_input_denormal(a
, status
);
5384 b
= float64_squash_input_denormal(b
, status
);
5385 aSig
= extractFloat64Frac( a
);
5386 aExp
= extractFloat64Exp( a
);
5387 aSign
= extractFloat64Sign( a
);
5388 bSig
= extractFloat64Frac( b
);
5389 bExp
= extractFloat64Exp( b
);
5390 if ( aExp
== 0x7FF ) {
5391 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5392 return propagateFloat64NaN(a
, b
, status
);
5394 float_raise(float_flag_invalid
, status
);
5395 return float64_default_nan(status
);
5397 if ( bExp
== 0x7FF ) {
5399 return propagateFloat64NaN(a
, b
, status
);
5405 float_raise(float_flag_invalid
, status
);
5406 return float64_default_nan(status
);
5408 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5411 if ( aSig
== 0 ) return a
;
5412 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5414 expDiff
= aExp
- bExp
;
5415 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5416 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5417 if ( expDiff
< 0 ) {
5418 if ( expDiff
< -1 ) return a
;
5421 q
= ( bSig
<= aSig
);
5422 if ( q
) aSig
-= bSig
;
5424 while ( 0 < expDiff
) {
5425 q
= estimateDiv128To64( aSig
, 0, bSig
);
5426 q
= ( 2 < q
) ? q
- 2 : 0;
5427 aSig
= - ( ( bSig
>>2 ) * q
);
5431 if ( 0 < expDiff
) {
5432 q
= estimateDiv128To64( aSig
, 0, bSig
);
5433 q
= ( 2 < q
) ? q
- 2 : 0;
5436 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5443 alternateASig
= aSig
;
5446 } while ( 0 <= (int64_t) aSig
);
5447 sigMean
= aSig
+ alternateASig
;
5448 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5449 aSig
= alternateASig
;
5451 zSign
= ( (int64_t) aSig
< 0 );
5452 if ( zSign
) aSig
= - aSig
;
5453 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5457 /*----------------------------------------------------------------------------
5458 | Returns the binary log of the double-precision floating-point value `a'.
5459 | The operation is performed according to the IEC/IEEE Standard for Binary
5460 | Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462 float64
float64_log2(float64 a
, float_status
*status
)
5466 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5467 a
= float64_squash_input_denormal(a
, status
);
5469 aSig
= extractFloat64Frac( a
);
5470 aExp
= extractFloat64Exp( a
);
5471 aSign
= extractFloat64Sign( a
);
5474 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5475 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5478 float_raise(float_flag_invalid
, status
);
5479 return float64_default_nan(status
);
5481 if ( aExp
== 0x7FF ) {
5483 return propagateFloat64NaN(a
, float64_zero
, status
);
5489 aSig
|= UINT64_C(0x0010000000000000);
5491 zSig
= (uint64_t)aExp
<< 52;
5492 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5493 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5494 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5495 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5503 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5506 /*----------------------------------------------------------------------------
5507 | Returns the result of converting the extended double-precision floating-
5508 | point value `a' to the 32-bit two's complement integer format. The
5509 | conversion is performed according to the IEC/IEEE Standard for Binary
5510 | Floating-Point Arithmetic---which means in particular that the conversion
5511 | is rounded according to the current rounding mode. If `a' is a NaN, the
5512 | largest positive integer is returned. Otherwise, if the conversion
5513 | overflows, the largest integer with the same sign as `a' is returned.
5514 *----------------------------------------------------------------------------*/
5516 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5519 int32_t aExp
, shiftCount
;
5522 if (floatx80_invalid_encoding(a
)) {
5523 float_raise(float_flag_invalid
, status
);
5526 aSig
= extractFloatx80Frac( a
);
5527 aExp
= extractFloatx80Exp( a
);
5528 aSign
= extractFloatx80Sign( a
);
5529 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5530 shiftCount
= 0x4037 - aExp
;
5531 if ( shiftCount
<= 0 ) shiftCount
= 1;
5532 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5533 return roundAndPackInt32(aSign
, aSig
, status
);
5537 /*----------------------------------------------------------------------------
5538 | Returns the result of converting the extended double-precision floating-
5539 | point value `a' to the 32-bit two's complement integer format. The
5540 | conversion is performed according to the IEC/IEEE Standard for Binary
5541 | Floating-Point Arithmetic, except that the conversion is always rounded
5542 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5543 | Otherwise, if the conversion overflows, the largest integer with the same
5544 | sign as `a' is returned.
5545 *----------------------------------------------------------------------------*/
5547 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5550 int32_t aExp
, shiftCount
;
5551 uint64_t aSig
, savedASig
;
5554 if (floatx80_invalid_encoding(a
)) {
5555 float_raise(float_flag_invalid
, status
);
5558 aSig
= extractFloatx80Frac( a
);
5559 aExp
= extractFloatx80Exp( a
);
5560 aSign
= extractFloatx80Sign( a
);
5561 if ( 0x401E < aExp
) {
5562 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5565 else if ( aExp
< 0x3FFF ) {
5567 float_raise(float_flag_inexact
, status
);
5571 shiftCount
= 0x403E - aExp
;
5573 aSig
>>= shiftCount
;
5575 if ( aSign
) z
= - z
;
5576 if ( ( z
< 0 ) ^ aSign
) {
5578 float_raise(float_flag_invalid
, status
);
5579 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5581 if ( ( aSig
<<shiftCount
) != savedASig
) {
5582 float_raise(float_flag_inexact
, status
);
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the 64-bit two's complement integer format. The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic---which means in particular that the conversion
5593 | is rounded according to the current rounding mode. If `a' is a NaN,
5594 | the largest positive integer is returned. Otherwise, if the conversion
5595 | overflows, the largest integer with the same sign as `a' is returned.
5596 *----------------------------------------------------------------------------*/
5598 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5601 int32_t aExp
, shiftCount
;
5602 uint64_t aSig
, aSigExtra
;
5604 if (floatx80_invalid_encoding(a
)) {
5605 float_raise(float_flag_invalid
, status
);
5608 aSig
= extractFloatx80Frac( a
);
5609 aExp
= extractFloatx80Exp( a
);
5610 aSign
= extractFloatx80Sign( a
);
5611 shiftCount
= 0x403E - aExp
;
5612 if ( shiftCount
<= 0 ) {
5614 float_raise(float_flag_invalid
, status
);
5615 if (!aSign
|| floatx80_is_any_nan(a
)) {
5623 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5625 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5629 /*----------------------------------------------------------------------------
5630 | Returns the result of converting the extended double-precision floating-
5631 | point value `a' to the 64-bit two's complement integer format. The
5632 | conversion is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic, except that the conversion is always rounded
5634 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5635 | Otherwise, if the conversion overflows, the largest integer with the same
5636 | sign as `a' is returned.
5637 *----------------------------------------------------------------------------*/
5639 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5642 int32_t aExp
, shiftCount
;
5646 if (floatx80_invalid_encoding(a
)) {
5647 float_raise(float_flag_invalid
, status
);
5650 aSig
= extractFloatx80Frac( a
);
5651 aExp
= extractFloatx80Exp( a
);
5652 aSign
= extractFloatx80Sign( a
);
5653 shiftCount
= aExp
- 0x403E;
5654 if ( 0 <= shiftCount
) {
5655 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5656 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5657 float_raise(float_flag_invalid
, status
);
5658 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5664 else if ( aExp
< 0x3FFF ) {
5666 float_raise(float_flag_inexact
, status
);
5670 z
= aSig
>>( - shiftCount
);
5671 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5672 float_raise(float_flag_inexact
, status
);
5674 if ( aSign
) z
= - z
;
5679 /*----------------------------------------------------------------------------
5680 | Returns the result of converting the extended double-precision floating-
5681 | point value `a' to the single-precision floating-point format. The
5682 | conversion is performed according to the IEC/IEEE Standard for Binary
5683 | Floating-Point Arithmetic.
5684 *----------------------------------------------------------------------------*/
5686 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5692 if (floatx80_invalid_encoding(a
)) {
5693 float_raise(float_flag_invalid
, status
);
5694 return float32_default_nan(status
);
5696 aSig
= extractFloatx80Frac( a
);
5697 aExp
= extractFloatx80Exp( a
);
5698 aSign
= extractFloatx80Sign( a
);
5699 if ( aExp
== 0x7FFF ) {
5700 if ( (uint64_t) ( aSig
<<1 ) ) {
5701 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5703 return float32_silence_nan(res
, status
);
5705 return packFloat32( aSign
, 0xFF, 0 );
5707 shift64RightJamming( aSig
, 33, &aSig
);
5708 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5709 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5713 /*----------------------------------------------------------------------------
5714 | Returns the result of converting the extended double-precision floating-
5715 | point value `a' to the double-precision floating-point format. The
5716 | conversion is performed according to the IEC/IEEE Standard for Binary
5717 | Floating-Point Arithmetic.
5718 *----------------------------------------------------------------------------*/
5720 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5724 uint64_t aSig
, zSig
;
5726 if (floatx80_invalid_encoding(a
)) {
5727 float_raise(float_flag_invalid
, status
);
5728 return float64_default_nan(status
);
5730 aSig
= extractFloatx80Frac( a
);
5731 aExp
= extractFloatx80Exp( a
);
5732 aSign
= extractFloatx80Sign( a
);
5733 if ( aExp
== 0x7FFF ) {
5734 if ( (uint64_t) ( aSig
<<1 ) ) {
5735 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5737 return float64_silence_nan(res
, status
);
5739 return packFloat64( aSign
, 0x7FF, 0 );
5741 shift64RightJamming( aSig
, 1, &zSig
);
5742 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5743 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5747 /*----------------------------------------------------------------------------
5748 | Returns the result of converting the extended double-precision floating-
5749 | point value `a' to the quadruple-precision floating-point format. The
5750 | conversion is performed according to the IEC/IEEE Standard for Binary
5751 | Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5754 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5758 uint64_t aSig
, zSig0
, zSig1
;
5760 if (floatx80_invalid_encoding(a
)) {
5761 float_raise(float_flag_invalid
, status
);
5762 return float128_default_nan(status
);
5764 aSig
= extractFloatx80Frac( a
);
5765 aExp
= extractFloatx80Exp( a
);
5766 aSign
= extractFloatx80Sign( a
);
5767 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5768 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5770 return float128_silence_nan(res
, status
);
5772 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5773 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5777 /*----------------------------------------------------------------------------
5778 | Rounds the extended double-precision floating-point value `a'
5779 | to the precision provided by floatx80_rounding_precision and returns the
5780 | result as an extended double-precision floating-point value.
5781 | The operation is performed according to the IEC/IEEE Standard for Binary
5782 | Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5785 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5787 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5788 extractFloatx80Sign(a
),
5789 extractFloatx80Exp(a
),
5790 extractFloatx80Frac(a
), 0, status
);
5793 /*----------------------------------------------------------------------------
5794 | Rounds the extended double-precision floating-point value `a' to an integer,
5795 | and returns the result as an extended quadruple-precision floating-point
5796 | value. The operation is performed according to the IEC/IEEE Standard for
5797 | Binary Floating-Point Arithmetic.
5798 *----------------------------------------------------------------------------*/
5800 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5804 uint64_t lastBitMask
, roundBitsMask
;
5807 if (floatx80_invalid_encoding(a
)) {
5808 float_raise(float_flag_invalid
, status
);
5809 return floatx80_default_nan(status
);
5811 aExp
= extractFloatx80Exp( a
);
5812 if ( 0x403E <= aExp
) {
5813 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5814 return propagateFloatx80NaN(a
, a
, status
);
5818 if ( aExp
< 0x3FFF ) {
5820 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5823 float_raise(float_flag_inexact
, status
);
5824 aSign
= extractFloatx80Sign( a
);
5825 switch (status
->float_rounding_mode
) {
5826 case float_round_nearest_even
:
5827 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5830 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5833 case float_round_ties_away
:
5834 if (aExp
== 0x3FFE) {
5835 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5838 case float_round_down
:
5841 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5842 : packFloatx80( 0, 0, 0 );
5843 case float_round_up
:
5845 aSign
? packFloatx80( 1, 0, 0 )
5846 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5848 case float_round_to_zero
:
5851 g_assert_not_reached();
5853 return packFloatx80( aSign
, 0, 0 );
5856 lastBitMask
<<= 0x403E - aExp
;
5857 roundBitsMask
= lastBitMask
- 1;
5859 switch (status
->float_rounding_mode
) {
5860 case float_round_nearest_even
:
5861 z
.low
+= lastBitMask
>>1;
5862 if ((z
.low
& roundBitsMask
) == 0) {
5863 z
.low
&= ~lastBitMask
;
5866 case float_round_ties_away
:
5867 z
.low
+= lastBitMask
>> 1;
5869 case float_round_to_zero
:
5871 case float_round_up
:
5872 if (!extractFloatx80Sign(z
)) {
5873 z
.low
+= roundBitsMask
;
5876 case float_round_down
:
5877 if (extractFloatx80Sign(z
)) {
5878 z
.low
+= roundBitsMask
;
5884 z
.low
&= ~ roundBitsMask
;
5887 z
.low
= UINT64_C(0x8000000000000000);
5889 if (z
.low
!= a
.low
) {
5890 float_raise(float_flag_inexact
, status
);
5896 /*----------------------------------------------------------------------------
5897 | Returns the result of adding the absolute values of the extended double-
5898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5899 | negated before being returned. `zSign' is ignored if the result is a NaN.
5900 | The addition is performed according to the IEC/IEEE Standard for Binary
5901 | Floating-Point Arithmetic.
5902 *----------------------------------------------------------------------------*/
5904 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5905 float_status
*status
)
5907 int32_t aExp
, bExp
, zExp
;
5908 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5911 aSig
= extractFloatx80Frac( a
);
5912 aExp
= extractFloatx80Exp( a
);
5913 bSig
= extractFloatx80Frac( b
);
5914 bExp
= extractFloatx80Exp( b
);
5915 expDiff
= aExp
- bExp
;
5916 if ( 0 < expDiff
) {
5917 if ( aExp
== 0x7FFF ) {
5918 if ((uint64_t)(aSig
<< 1)) {
5919 return propagateFloatx80NaN(a
, b
, status
);
5923 if ( bExp
== 0 ) --expDiff
;
5924 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5927 else if ( expDiff
< 0 ) {
5928 if ( bExp
== 0x7FFF ) {
5929 if ((uint64_t)(bSig
<< 1)) {
5930 return propagateFloatx80NaN(a
, b
, status
);
5932 return packFloatx80(zSign
,
5933 floatx80_infinity_high
,
5934 floatx80_infinity_low
);
5936 if ( aExp
== 0 ) ++expDiff
;
5937 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5941 if ( aExp
== 0x7FFF ) {
5942 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5943 return propagateFloatx80NaN(a
, b
, status
);
5948 zSig0
= aSig
+ bSig
;
5950 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5951 /* At least one of the values is a pseudo-denormal,
5952 * and there is a carry out of the result. */
5957 return packFloatx80(zSign
, 0, 0);
5959 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5965 zSig0
= aSig
+ bSig
;
5966 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5968 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5969 zSig0
|= UINT64_C(0x8000000000000000);
5972 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5973 zSign
, zExp
, zSig0
, zSig1
, status
);
5976 /*----------------------------------------------------------------------------
5977 | Returns the result of subtracting the absolute values of the extended
5978 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5979 | difference is negated before being returned. `zSign' is ignored if the
5980 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5981 | Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5984 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5985 float_status
*status
)
5987 int32_t aExp
, bExp
, zExp
;
5988 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5991 aSig
= extractFloatx80Frac( a
);
5992 aExp
= extractFloatx80Exp( a
);
5993 bSig
= extractFloatx80Frac( b
);
5994 bExp
= extractFloatx80Exp( b
);
5995 expDiff
= aExp
- bExp
;
5996 if ( 0 < expDiff
) goto aExpBigger
;
5997 if ( expDiff
< 0 ) goto bExpBigger
;
5998 if ( aExp
== 0x7FFF ) {
5999 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6000 return propagateFloatx80NaN(a
, b
, status
);
6002 float_raise(float_flag_invalid
, status
);
6003 return floatx80_default_nan(status
);
6010 if ( bSig
< aSig
) goto aBigger
;
6011 if ( aSig
< bSig
) goto bBigger
;
6012 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6014 if ( bExp
== 0x7FFF ) {
6015 if ((uint64_t)(bSig
<< 1)) {
6016 return propagateFloatx80NaN(a
, b
, status
);
6018 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6019 floatx80_infinity_low
);
6021 if ( aExp
== 0 ) ++expDiff
;
6022 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6024 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6027 goto normalizeRoundAndPack
;
6029 if ( aExp
== 0x7FFF ) {
6030 if ((uint64_t)(aSig
<< 1)) {
6031 return propagateFloatx80NaN(a
, b
, status
);
6035 if ( bExp
== 0 ) --expDiff
;
6036 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6038 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6040 normalizeRoundAndPack
:
6041 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6042 zSign
, zExp
, zSig0
, zSig1
, status
);
6045 /*----------------------------------------------------------------------------
6046 | Returns the result of adding the extended double-precision floating-point
6047 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6048 | Standard for Binary Floating-Point Arithmetic.
6049 *----------------------------------------------------------------------------*/
6051 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6055 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6056 float_raise(float_flag_invalid
, status
);
6057 return floatx80_default_nan(status
);
6059 aSign
= extractFloatx80Sign( a
);
6060 bSign
= extractFloatx80Sign( b
);
6061 if ( aSign
== bSign
) {
6062 return addFloatx80Sigs(a
, b
, aSign
, status
);
6065 return subFloatx80Sigs(a
, b
, aSign
, status
);
6070 /*----------------------------------------------------------------------------
6071 | Returns the result of subtracting the extended double-precision floating-
6072 | point values `a' and `b'. The operation is performed according to the
6073 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6074 *----------------------------------------------------------------------------*/
6076 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6080 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6081 float_raise(float_flag_invalid
, status
);
6082 return floatx80_default_nan(status
);
6084 aSign
= extractFloatx80Sign( a
);
6085 bSign
= extractFloatx80Sign( b
);
6086 if ( aSign
== bSign
) {
6087 return subFloatx80Sigs(a
, b
, aSign
, status
);
6090 return addFloatx80Sigs(a
, b
, aSign
, status
);
6095 /*----------------------------------------------------------------------------
6096 | Returns the result of multiplying the extended double-precision floating-
6097 | point values `a' and `b'. The operation is performed according to the
6098 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6099 *----------------------------------------------------------------------------*/
6101 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6103 bool aSign
, bSign
, zSign
;
6104 int32_t aExp
, bExp
, zExp
;
6105 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6107 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6108 float_raise(float_flag_invalid
, status
);
6109 return floatx80_default_nan(status
);
6111 aSig
= extractFloatx80Frac( a
);
6112 aExp
= extractFloatx80Exp( a
);
6113 aSign
= extractFloatx80Sign( a
);
6114 bSig
= extractFloatx80Frac( b
);
6115 bExp
= extractFloatx80Exp( b
);
6116 bSign
= extractFloatx80Sign( b
);
6117 zSign
= aSign
^ bSign
;
6118 if ( aExp
== 0x7FFF ) {
6119 if ( (uint64_t) ( aSig
<<1 )
6120 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6121 return propagateFloatx80NaN(a
, b
, status
);
6123 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6124 return packFloatx80(zSign
, floatx80_infinity_high
,
6125 floatx80_infinity_low
);
6127 if ( bExp
== 0x7FFF ) {
6128 if ((uint64_t)(bSig
<< 1)) {
6129 return propagateFloatx80NaN(a
, b
, status
);
6131 if ( ( aExp
| aSig
) == 0 ) {
6133 float_raise(float_flag_invalid
, status
);
6134 return floatx80_default_nan(status
);
6136 return packFloatx80(zSign
, floatx80_infinity_high
,
6137 floatx80_infinity_low
);
6140 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6141 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6144 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6145 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6147 zExp
= aExp
+ bExp
- 0x3FFE;
6148 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6149 if ( 0 < (int64_t) zSig0
) {
6150 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6153 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6154 zSign
, zExp
, zSig0
, zSig1
, status
);
6157 /*----------------------------------------------------------------------------
6158 | Returns the result of dividing the extended double-precision floating-point
6159 | value `a' by the corresponding value `b'. The operation is performed
6160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6161 *----------------------------------------------------------------------------*/
6163 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6165 bool aSign
, bSign
, zSign
;
6166 int32_t aExp
, bExp
, zExp
;
6167 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6168 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6170 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6171 float_raise(float_flag_invalid
, status
);
6172 return floatx80_default_nan(status
);
6174 aSig
= extractFloatx80Frac( a
);
6175 aExp
= extractFloatx80Exp( a
);
6176 aSign
= extractFloatx80Sign( a
);
6177 bSig
= extractFloatx80Frac( b
);
6178 bExp
= extractFloatx80Exp( b
);
6179 bSign
= extractFloatx80Sign( b
);
6180 zSign
= aSign
^ bSign
;
6181 if ( aExp
== 0x7FFF ) {
6182 if ((uint64_t)(aSig
<< 1)) {
6183 return propagateFloatx80NaN(a
, b
, status
);
6185 if ( bExp
== 0x7FFF ) {
6186 if ((uint64_t)(bSig
<< 1)) {
6187 return propagateFloatx80NaN(a
, b
, status
);
6191 return packFloatx80(zSign
, floatx80_infinity_high
,
6192 floatx80_infinity_low
);
6194 if ( bExp
== 0x7FFF ) {
6195 if ((uint64_t)(bSig
<< 1)) {
6196 return propagateFloatx80NaN(a
, b
, status
);
6198 return packFloatx80( zSign
, 0, 0 );
6202 if ( ( aExp
| aSig
) == 0 ) {
6204 float_raise(float_flag_invalid
, status
);
6205 return floatx80_default_nan(status
);
6207 float_raise(float_flag_divbyzero
, status
);
6208 return packFloatx80(zSign
, floatx80_infinity_high
,
6209 floatx80_infinity_low
);
6211 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6214 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6215 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6217 zExp
= aExp
- bExp
+ 0x3FFE;
6219 if ( bSig
<= aSig
) {
6220 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6223 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6224 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6225 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6226 while ( (int64_t) rem0
< 0 ) {
6228 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6230 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6231 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6232 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6233 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6234 while ( (int64_t) rem1
< 0 ) {
6236 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6238 zSig1
|= ( ( rem1
| rem2
) != 0 );
6240 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6241 zSign
, zExp
, zSig0
, zSig1
, status
);
6244 /*----------------------------------------------------------------------------
6245 | Returns the remainder of the extended double-precision floating-point value
6246 | `a' with respect to the corresponding value `b'. The operation is performed
6247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6248 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6249 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6250 | the absolute value of the integer quotient.
6251 *----------------------------------------------------------------------------*/
6253 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6254 float_status
*status
)
6257 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6258 uint64_t aSig0
, aSig1
, bSig
;
6259 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6262 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6263 float_raise(float_flag_invalid
, status
);
6264 return floatx80_default_nan(status
);
6266 aSig0
= extractFloatx80Frac( a
);
6267 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6268 aSign
= extractFloatx80Sign( a
);
6269 bSig
= extractFloatx80Frac( b
);
6270 bExp
= extractFloatx80Exp( b
);
6271 if ( aExp
== 0x7FFF ) {
6272 if ( (uint64_t) ( aSig0
<<1 )
6273 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6274 return propagateFloatx80NaN(a
, b
, status
);
6278 if ( bExp
== 0x7FFF ) {
6279 if ((uint64_t)(bSig
<< 1)) {
6280 return propagateFloatx80NaN(a
, b
, status
);
6282 if (aExp
== 0 && aSig0
>> 63) {
6284 * Pseudo-denormal argument must be returned in normalized
6287 return packFloatx80(aSign
, 1, aSig0
);
6294 float_raise(float_flag_invalid
, status
);
6295 return floatx80_default_nan(status
);
6297 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6300 if ( aSig0
== 0 ) return a
;
6301 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6304 expDiff
= aExp
- bExp
;
6306 if ( expDiff
< 0 ) {
6307 if ( mod
|| expDiff
< -1 ) {
6308 if (aExp
== 1 && aExpOrig
== 0) {
6310 * Pseudo-denormal argument must be returned in
6313 return packFloatx80(aSign
, aExp
, aSig0
);
6317 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6320 *quotient
= q
= ( bSig
<= aSig0
);
6321 if ( q
) aSig0
-= bSig
;
6323 while ( 0 < expDiff
) {
6324 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6325 q
= ( 2 < q
) ? q
- 2 : 0;
6326 mul64To128( bSig
, q
, &term0
, &term1
);
6327 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6328 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6334 if ( 0 < expDiff
) {
6335 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6336 q
= ( 2 < q
) ? q
- 2 : 0;
6338 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6339 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6340 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6341 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6343 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6346 *quotient
<<= expDiff
;
6357 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6358 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6359 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6362 aSig0
= alternateASig0
;
6363 aSig1
= alternateASig1
;
6369 normalizeRoundAndPackFloatx80(
6370 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6374 /*----------------------------------------------------------------------------
6375 | Returns the remainder of the extended double-precision floating-point value
6376 | `a' with respect to the corresponding value `b'. The operation is performed
6377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6378 *----------------------------------------------------------------------------*/
6380 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6383 return floatx80_modrem(a
, b
, false, "ient
, status
);
6386 /*----------------------------------------------------------------------------
6387 | Returns the remainder of the extended double-precision floating-point value
6388 | `a' with respect to the corresponding value `b', with the quotient truncated
6390 *----------------------------------------------------------------------------*/
6392 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6395 return floatx80_modrem(a
, b
, true, "ient
, status
);
6398 /*----------------------------------------------------------------------------
6399 | Returns the square root of the extended double-precision floating-point
6400 | value `a'. The operation is performed according to the IEC/IEEE Standard
6401 | for Binary Floating-Point Arithmetic.
6402 *----------------------------------------------------------------------------*/
6404 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6408 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6409 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6411 if (floatx80_invalid_encoding(a
)) {
6412 float_raise(float_flag_invalid
, status
);
6413 return floatx80_default_nan(status
);
6415 aSig0
= extractFloatx80Frac( a
);
6416 aExp
= extractFloatx80Exp( a
);
6417 aSign
= extractFloatx80Sign( a
);
6418 if ( aExp
== 0x7FFF ) {
6419 if ((uint64_t)(aSig0
<< 1)) {
6420 return propagateFloatx80NaN(a
, a
, status
);
6422 if ( ! aSign
) return a
;
6426 if ( ( aExp
| aSig0
) == 0 ) return a
;
6428 float_raise(float_flag_invalid
, status
);
6429 return floatx80_default_nan(status
);
6432 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6433 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6435 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6436 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6437 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6438 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6439 doubleZSig0
= zSig0
<<1;
6440 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6441 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6442 while ( (int64_t) rem0
< 0 ) {
6445 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6447 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6448 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6449 if ( zSig1
== 0 ) zSig1
= 1;
6450 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6451 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6452 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6453 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6454 while ( (int64_t) rem1
< 0 ) {
6456 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6458 term2
|= doubleZSig0
;
6459 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6461 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6463 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6464 zSig0
|= doubleZSig0
;
6465 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6466 0, zExp
, zSig0
, zSig1
, status
);
6469 /*----------------------------------------------------------------------------
6470 | Returns the result of converting the quadruple-precision floating-point
6471 | value `a' to the 32-bit two's complement integer format. The conversion
6472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6473 | Arithmetic---which means in particular that the conversion is rounded
6474 | according to the current rounding mode. If `a' is a NaN, the largest
6475 | positive integer is returned. Otherwise, if the conversion overflows, the
6476 | largest integer with the same sign as `a' is returned.
6477 *----------------------------------------------------------------------------*/
6479 int32_t float128_to_int32(float128 a
, float_status
*status
)
6482 int32_t aExp
, shiftCount
;
6483 uint64_t aSig0
, aSig1
;
6485 aSig1
= extractFloat128Frac1( a
);
6486 aSig0
= extractFloat128Frac0( a
);
6487 aExp
= extractFloat128Exp( a
);
6488 aSign
= extractFloat128Sign( a
);
6489 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6490 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6491 aSig0
|= ( aSig1
!= 0 );
6492 shiftCount
= 0x4028 - aExp
;
6493 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6494 return roundAndPackInt32(aSign
, aSig0
, status
);
6498 /*----------------------------------------------------------------------------
6499 | Returns the result of converting the quadruple-precision floating-point
6500 | value `a' to the 32-bit two's complement integer format. The conversion
6501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6502 | Arithmetic, except that the conversion is always rounded toward zero. If
6503 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6504 | conversion overflows, the largest integer with the same sign as `a' is
6506 *----------------------------------------------------------------------------*/
6508 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6511 int32_t aExp
, shiftCount
;
6512 uint64_t aSig0
, aSig1
, savedASig
;
6515 aSig1
= extractFloat128Frac1( a
);
6516 aSig0
= extractFloat128Frac0( a
);
6517 aExp
= extractFloat128Exp( a
);
6518 aSign
= extractFloat128Sign( a
);
6519 aSig0
|= ( aSig1
!= 0 );
6520 if ( 0x401E < aExp
) {
6521 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6524 else if ( aExp
< 0x3FFF ) {
6525 if (aExp
|| aSig0
) {
6526 float_raise(float_flag_inexact
, status
);
6530 aSig0
|= UINT64_C(0x0001000000000000);
6531 shiftCount
= 0x402F - aExp
;
6533 aSig0
>>= shiftCount
;
6535 if ( aSign
) z
= - z
;
6536 if ( ( z
< 0 ) ^ aSign
) {
6538 float_raise(float_flag_invalid
, status
);
6539 return aSign
? INT32_MIN
: INT32_MAX
;
6541 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6542 float_raise(float_flag_inexact
, status
);
6548 /*----------------------------------------------------------------------------
6549 | Returns the result of converting the quadruple-precision floating-point
6550 | value `a' to the 64-bit two's complement integer format. The conversion
6551 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6552 | Arithmetic---which means in particular that the conversion is rounded
6553 | according to the current rounding mode. If `a' is a NaN, the largest
6554 | positive integer is returned. Otherwise, if the conversion overflows, the
6555 | largest integer with the same sign as `a' is returned.
6556 *----------------------------------------------------------------------------*/
6558 int64_t float128_to_int64(float128 a
, float_status
*status
)
6561 int32_t aExp
, shiftCount
;
6562 uint64_t aSig0
, aSig1
;
6564 aSig1
= extractFloat128Frac1( a
);
6565 aSig0
= extractFloat128Frac0( a
);
6566 aExp
= extractFloat128Exp( a
);
6567 aSign
= extractFloat128Sign( a
);
6568 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6569 shiftCount
= 0x402F - aExp
;
6570 if ( shiftCount
<= 0 ) {
6571 if ( 0x403E < aExp
) {
6572 float_raise(float_flag_invalid
, status
);
6574 || ( ( aExp
== 0x7FFF )
6575 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6582 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6585 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6587 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6591 /*----------------------------------------------------------------------------
6592 | Returns the result of converting the quadruple-precision floating-point
6593 | value `a' to the 64-bit two's complement integer format. The conversion
6594 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6595 | Arithmetic, except that the conversion is always rounded toward zero.
6596 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6597 | the conversion overflows, the largest integer with the same sign as `a' is
6599 *----------------------------------------------------------------------------*/
6601 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6604 int32_t aExp
, shiftCount
;
6605 uint64_t aSig0
, aSig1
;
6608 aSig1
= extractFloat128Frac1( a
);
6609 aSig0
= extractFloat128Frac0( a
);
6610 aExp
= extractFloat128Exp( a
);
6611 aSign
= extractFloat128Sign( a
);
6612 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6613 shiftCount
= aExp
- 0x402F;
6614 if ( 0 < shiftCount
) {
6615 if ( 0x403E <= aExp
) {
6616 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6617 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6618 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6620 float_raise(float_flag_inexact
, status
);
6624 float_raise(float_flag_invalid
, status
);
6625 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6631 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6632 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6633 float_raise(float_flag_inexact
, status
);
6637 if ( aExp
< 0x3FFF ) {
6638 if ( aExp
| aSig0
| aSig1
) {
6639 float_raise(float_flag_inexact
, status
);
6643 z
= aSig0
>>( - shiftCount
);
6645 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6646 float_raise(float_flag_inexact
, status
);
6649 if ( aSign
) z
= - z
;
6654 /*----------------------------------------------------------------------------
6655 | Returns the result of converting the quadruple-precision floating-point value
6656 | `a' to the 64-bit unsigned integer format. The conversion is
6657 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6658 | Arithmetic---which means in particular that the conversion is rounded
6659 | according to the current rounding mode. If `a' is a NaN, the largest
6660 | positive integer is returned. If the conversion overflows, the
6661 | largest unsigned integer is returned. If 'a' is negative, the value is
6662 | rounded and zero is returned; negative values that do not round to zero
6663 | will raise the inexact exception.
6664 *----------------------------------------------------------------------------*/
6666 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6671 uint64_t aSig0
, aSig1
;
6673 aSig0
= extractFloat128Frac0(a
);
6674 aSig1
= extractFloat128Frac1(a
);
6675 aExp
= extractFloat128Exp(a
);
6676 aSign
= extractFloat128Sign(a
);
6677 if (aSign
&& (aExp
> 0x3FFE)) {
6678 float_raise(float_flag_invalid
, status
);
6679 if (float128_is_any_nan(a
)) {
6686 aSig0
|= UINT64_C(0x0001000000000000);
6688 shiftCount
= 0x402F - aExp
;
6689 if (shiftCount
<= 0) {
6690 if (0x403E < aExp
) {
6691 float_raise(float_flag_invalid
, status
);
6694 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6696 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6698 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6701 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6704 signed char current_rounding_mode
= status
->float_rounding_mode
;
6706 set_float_rounding_mode(float_round_to_zero
, status
);
6707 v
= float128_to_uint64(a
, status
);
6708 set_float_rounding_mode(current_rounding_mode
, status
);
6713 /*----------------------------------------------------------------------------
6714 | Returns the result of converting the quadruple-precision floating-point
6715 | value `a' to the 32-bit unsigned integer format. The conversion
6716 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6717 | Arithmetic except that the conversion is always rounded toward zero.
6718 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6719 | if the conversion overflows, the largest unsigned integer is returned.
6720 | If 'a' is negative, the value is rounded and zero is returned; negative
6721 | values that do not round to zero will raise the inexact exception.
6722 *----------------------------------------------------------------------------*/
6724 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6728 int old_exc_flags
= get_float_exception_flags(status
);
6730 v
= float128_to_uint64_round_to_zero(a
, status
);
6731 if (v
> 0xffffffff) {
6736 set_float_exception_flags(old_exc_flags
, status
);
6737 float_raise(float_flag_invalid
, status
);
6741 /*----------------------------------------------------------------------------
6742 | Returns the result of converting the quadruple-precision floating-point value
6743 | `a' to the 32-bit unsigned integer format. The conversion is
6744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6745 | Arithmetic---which means in particular that the conversion is rounded
6746 | according to the current rounding mode. If `a' is a NaN, the largest
6747 | positive integer is returned. If the conversion overflows, the
6748 | largest unsigned integer is returned. If 'a' is negative, the value is
6749 | rounded and zero is returned; negative values that do not round to zero
6750 | will raise the inexact exception.
6751 *----------------------------------------------------------------------------*/
6753 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6757 int old_exc_flags
= get_float_exception_flags(status
);
6759 v
= float128_to_uint64(a
, status
);
6760 if (v
> 0xffffffff) {
6765 set_float_exception_flags(old_exc_flags
, status
);
6766 float_raise(float_flag_invalid
, status
);
6770 /*----------------------------------------------------------------------------
6771 | Returns the result of converting the quadruple-precision floating-point
6772 | value `a' to the single-precision floating-point format. The conversion
6773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6775 *----------------------------------------------------------------------------*/
6777 float32
float128_to_float32(float128 a
, float_status
*status
)
6781 uint64_t aSig0
, aSig1
;
6784 aSig1
= extractFloat128Frac1( a
);
6785 aSig0
= extractFloat128Frac0( a
);
6786 aExp
= extractFloat128Exp( a
);
6787 aSign
= extractFloat128Sign( a
);
6788 if ( aExp
== 0x7FFF ) {
6789 if ( aSig0
| aSig1
) {
6790 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6792 return packFloat32( aSign
, 0xFF, 0 );
6794 aSig0
|= ( aSig1
!= 0 );
6795 shift64RightJamming( aSig0
, 18, &aSig0
);
6797 if ( aExp
|| zSig
) {
6801 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the double-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6810 *----------------------------------------------------------------------------*/
6812 float64
float128_to_float64(float128 a
, float_status
*status
)
6816 uint64_t aSig0
, aSig1
;
6818 aSig1
= extractFloat128Frac1( a
);
6819 aSig0
= extractFloat128Frac0( a
);
6820 aExp
= extractFloat128Exp( a
);
6821 aSign
= extractFloat128Sign( a
);
6822 if ( aExp
== 0x7FFF ) {
6823 if ( aSig0
| aSig1
) {
6824 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6826 return packFloat64( aSign
, 0x7FF, 0 );
6828 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6829 aSig0
|= ( aSig1
!= 0 );
6830 if ( aExp
|| aSig0
) {
6831 aSig0
|= UINT64_C(0x4000000000000000);
6834 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6838 /*----------------------------------------------------------------------------
6839 | Returns the result of converting the quadruple-precision floating-point
6840 | value `a' to the extended double-precision floating-point format. The
6841 | conversion is performed according to the IEC/IEEE Standard for Binary
6842 | Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6845 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6849 uint64_t aSig0
, aSig1
;
6851 aSig1
= extractFloat128Frac1( a
);
6852 aSig0
= extractFloat128Frac0( a
);
6853 aExp
= extractFloat128Exp( a
);
6854 aSign
= extractFloat128Sign( a
);
6855 if ( aExp
== 0x7FFF ) {
6856 if ( aSig0
| aSig1
) {
6857 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6859 return floatx80_silence_nan(res
, status
);
6861 return packFloatx80(aSign
, floatx80_infinity_high
,
6862 floatx80_infinity_low
);
6865 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6866 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6869 aSig0
|= UINT64_C(0x0001000000000000);
6871 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6872 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6876 /*----------------------------------------------------------------------------
6877 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6878 | returns the result as a quadruple-precision floating-point value. The
6879 | operation is performed according to the IEC/IEEE Standard for Binary
6880 | Floating-Point Arithmetic.
6881 *----------------------------------------------------------------------------*/
6883 float128
float128_round_to_int(float128 a
, float_status
*status
)
6887 uint64_t lastBitMask
, roundBitsMask
;
6890 aExp
= extractFloat128Exp( a
);
6891 if ( 0x402F <= aExp
) {
6892 if ( 0x406F <= aExp
) {
6893 if ( ( aExp
== 0x7FFF )
6894 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6896 return propagateFloat128NaN(a
, a
, status
);
6901 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6902 roundBitsMask
= lastBitMask
- 1;
6904 switch (status
->float_rounding_mode
) {
6905 case float_round_nearest_even
:
6906 if ( lastBitMask
) {
6907 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6908 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6911 if ( (int64_t) z
.low
< 0 ) {
6913 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6917 case float_round_ties_away
:
6919 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6921 if ((int64_t) z
.low
< 0) {
6926 case float_round_to_zero
:
6928 case float_round_up
:
6929 if (!extractFloat128Sign(z
)) {
6930 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6933 case float_round_down
:
6934 if (extractFloat128Sign(z
)) {
6935 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6938 case float_round_to_odd
:
6940 * Note that if lastBitMask == 0, the last bit is the lsb
6941 * of high, and roundBitsMask == -1.
6943 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6944 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6950 z
.low
&= ~ roundBitsMask
;
6953 if ( aExp
< 0x3FFF ) {
6954 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6955 float_raise(float_flag_inexact
, status
);
6956 aSign
= extractFloat128Sign( a
);
6957 switch (status
->float_rounding_mode
) {
6958 case float_round_nearest_even
:
6959 if ( ( aExp
== 0x3FFE )
6960 && ( extractFloat128Frac0( a
)
6961 | extractFloat128Frac1( a
) )
6963 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6966 case float_round_ties_away
:
6967 if (aExp
== 0x3FFE) {
6968 return packFloat128(aSign
, 0x3FFF, 0, 0);
6971 case float_round_down
:
6973 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6974 : packFloat128( 0, 0, 0, 0 );
6975 case float_round_up
:
6977 aSign
? packFloat128( 1, 0, 0, 0 )
6978 : packFloat128( 0, 0x3FFF, 0, 0 );
6980 case float_round_to_odd
:
6981 return packFloat128(aSign
, 0x3FFF, 0, 0);
6983 case float_round_to_zero
:
6986 return packFloat128( aSign
, 0, 0, 0 );
6989 lastBitMask
<<= 0x402F - aExp
;
6990 roundBitsMask
= lastBitMask
- 1;
6993 switch (status
->float_rounding_mode
) {
6994 case float_round_nearest_even
:
6995 z
.high
+= lastBitMask
>>1;
6996 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6997 z
.high
&= ~ lastBitMask
;
7000 case float_round_ties_away
:
7001 z
.high
+= lastBitMask
>>1;
7003 case float_round_to_zero
:
7005 case float_round_up
:
7006 if (!extractFloat128Sign(z
)) {
7007 z
.high
|= ( a
.low
!= 0 );
7008 z
.high
+= roundBitsMask
;
7011 case float_round_down
:
7012 if (extractFloat128Sign(z
)) {
7013 z
.high
|= (a
.low
!= 0);
7014 z
.high
+= roundBitsMask
;
7017 case float_round_to_odd
:
7018 if ((z
.high
& lastBitMask
) == 0) {
7019 z
.high
|= (a
.low
!= 0);
7020 z
.high
+= roundBitsMask
;
7026 z
.high
&= ~ roundBitsMask
;
7028 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7029 float_raise(float_flag_inexact
, status
);
7035 /*----------------------------------------------------------------------------
7036 | Returns the result of adding the absolute values of the quadruple-precision
7037 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7038 | before being returned. `zSign' is ignored if the result is a NaN.
7039 | The addition is performed according to the IEC/IEEE Standard for Binary
7040 | Floating-Point Arithmetic.
7041 *----------------------------------------------------------------------------*/
7043 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7044 float_status
*status
)
7046 int32_t aExp
, bExp
, zExp
;
7047 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7050 aSig1
= extractFloat128Frac1( a
);
7051 aSig0
= extractFloat128Frac0( a
);
7052 aExp
= extractFloat128Exp( a
);
7053 bSig1
= extractFloat128Frac1( b
);
7054 bSig0
= extractFloat128Frac0( b
);
7055 bExp
= extractFloat128Exp( b
);
7056 expDiff
= aExp
- bExp
;
7057 if ( 0 < expDiff
) {
7058 if ( aExp
== 0x7FFF ) {
7059 if (aSig0
| aSig1
) {
7060 return propagateFloat128NaN(a
, b
, status
);
7068 bSig0
|= UINT64_C(0x0001000000000000);
7070 shift128ExtraRightJamming(
7071 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7074 else if ( expDiff
< 0 ) {
7075 if ( bExp
== 0x7FFF ) {
7076 if (bSig0
| bSig1
) {
7077 return propagateFloat128NaN(a
, b
, status
);
7079 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7085 aSig0
|= UINT64_C(0x0001000000000000);
7087 shift128ExtraRightJamming(
7088 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7092 if ( aExp
== 0x7FFF ) {
7093 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7094 return propagateFloat128NaN(a
, b
, status
);
7098 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7100 if (status
->flush_to_zero
) {
7101 if (zSig0
| zSig1
) {
7102 float_raise(float_flag_output_denormal
, status
);
7104 return packFloat128(zSign
, 0, 0, 0);
7106 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7109 zSig0
|= UINT64_C(0x0002000000000000);
7113 aSig0
|= UINT64_C(0x0001000000000000);
7114 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7116 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
7119 shift128ExtraRightJamming(
7120 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7122 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7126 /*----------------------------------------------------------------------------
7127 | Returns the result of subtracting the absolute values of the quadruple-
7128 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7129 | difference is negated before being returned. `zSign' is ignored if the
7130 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7131 | Standard for Binary Floating-Point Arithmetic.
7132 *----------------------------------------------------------------------------*/
7134 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7135 float_status
*status
)
7137 int32_t aExp
, bExp
, zExp
;
7138 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7141 aSig1
= extractFloat128Frac1( a
);
7142 aSig0
= extractFloat128Frac0( a
);
7143 aExp
= extractFloat128Exp( a
);
7144 bSig1
= extractFloat128Frac1( b
);
7145 bSig0
= extractFloat128Frac0( b
);
7146 bExp
= extractFloat128Exp( b
);
7147 expDiff
= aExp
- bExp
;
7148 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7149 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7150 if ( 0 < expDiff
) goto aExpBigger
;
7151 if ( expDiff
< 0 ) goto bExpBigger
;
7152 if ( aExp
== 0x7FFF ) {
7153 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7154 return propagateFloat128NaN(a
, b
, status
);
7156 float_raise(float_flag_invalid
, status
);
7157 return float128_default_nan(status
);
7163 if ( bSig0
< aSig0
) goto aBigger
;
7164 if ( aSig0
< bSig0
) goto bBigger
;
7165 if ( bSig1
< aSig1
) goto aBigger
;
7166 if ( aSig1
< bSig1
) goto bBigger
;
7167 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7170 if ( bExp
== 0x7FFF ) {
7171 if (bSig0
| bSig1
) {
7172 return propagateFloat128NaN(a
, b
, status
);
7174 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7180 aSig0
|= UINT64_C(0x4000000000000000);
7182 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7183 bSig0
|= UINT64_C(0x4000000000000000);
7185 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7188 goto normalizeRoundAndPack
;
7190 if ( aExp
== 0x7FFF ) {
7191 if (aSig0
| aSig1
) {
7192 return propagateFloat128NaN(a
, b
, status
);
7200 bSig0
|= UINT64_C(0x4000000000000000);
7202 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7203 aSig0
|= UINT64_C(0x4000000000000000);
7205 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7207 normalizeRoundAndPack
:
7209 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7214 /*----------------------------------------------------------------------------
7215 | Returns the result of adding the quadruple-precision floating-point values
7216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7217 | for Binary Floating-Point Arithmetic.
7218 *----------------------------------------------------------------------------*/
7220 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7224 aSign
= extractFloat128Sign( a
);
7225 bSign
= extractFloat128Sign( b
);
7226 if ( aSign
== bSign
) {
7227 return addFloat128Sigs(a
, b
, aSign
, status
);
7230 return subFloat128Sigs(a
, b
, aSign
, status
);
7235 /*----------------------------------------------------------------------------
7236 | Returns the result of subtracting the quadruple-precision floating-point
7237 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7238 | Standard for Binary Floating-Point Arithmetic.
7239 *----------------------------------------------------------------------------*/
7241 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7245 aSign
= extractFloat128Sign( a
);
7246 bSign
= extractFloat128Sign( b
);
7247 if ( aSign
== bSign
) {
7248 return subFloat128Sigs(a
, b
, aSign
, status
);
7251 return addFloat128Sigs(a
, b
, aSign
, status
);
7256 /*----------------------------------------------------------------------------
7257 | Returns the result of multiplying the quadruple-precision floating-point
7258 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7259 | Standard for Binary Floating-Point Arithmetic.
7260 *----------------------------------------------------------------------------*/
7262 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7264 bool aSign
, bSign
, zSign
;
7265 int32_t aExp
, bExp
, zExp
;
7266 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7268 aSig1
= extractFloat128Frac1( a
);
7269 aSig0
= extractFloat128Frac0( a
);
7270 aExp
= extractFloat128Exp( a
);
7271 aSign
= extractFloat128Sign( a
);
7272 bSig1
= extractFloat128Frac1( b
);
7273 bSig0
= extractFloat128Frac0( b
);
7274 bExp
= extractFloat128Exp( b
);
7275 bSign
= extractFloat128Sign( b
);
7276 zSign
= aSign
^ bSign
;
7277 if ( aExp
== 0x7FFF ) {
7278 if ( ( aSig0
| aSig1
)
7279 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7280 return propagateFloat128NaN(a
, b
, status
);
7282 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7283 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7285 if ( bExp
== 0x7FFF ) {
7286 if (bSig0
| bSig1
) {
7287 return propagateFloat128NaN(a
, b
, status
);
7289 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7291 float_raise(float_flag_invalid
, status
);
7292 return float128_default_nan(status
);
7294 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7297 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7298 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7301 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7302 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7304 zExp
= aExp
+ bExp
- 0x4000;
7305 aSig0
|= UINT64_C(0x0001000000000000);
7306 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7307 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7308 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7309 zSig2
|= ( zSig3
!= 0 );
7310 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7311 shift128ExtraRightJamming(
7312 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7315 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7319 /*----------------------------------------------------------------------------
7320 | Returns the result of dividing the quadruple-precision floating-point value
7321 | `a' by the corresponding value `b'. The operation is performed according to
7322 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7323 *----------------------------------------------------------------------------*/
7325 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7327 bool aSign
, bSign
, zSign
;
7328 int32_t aExp
, bExp
, zExp
;
7329 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7330 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7332 aSig1
= extractFloat128Frac1( a
);
7333 aSig0
= extractFloat128Frac0( a
);
7334 aExp
= extractFloat128Exp( a
);
7335 aSign
= extractFloat128Sign( a
);
7336 bSig1
= extractFloat128Frac1( b
);
7337 bSig0
= extractFloat128Frac0( b
);
7338 bExp
= extractFloat128Exp( b
);
7339 bSign
= extractFloat128Sign( b
);
7340 zSign
= aSign
^ bSign
;
7341 if ( aExp
== 0x7FFF ) {
7342 if (aSig0
| aSig1
) {
7343 return propagateFloat128NaN(a
, b
, status
);
7345 if ( bExp
== 0x7FFF ) {
7346 if (bSig0
| bSig1
) {
7347 return propagateFloat128NaN(a
, b
, status
);
7351 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7353 if ( bExp
== 0x7FFF ) {
7354 if (bSig0
| bSig1
) {
7355 return propagateFloat128NaN(a
, b
, status
);
7357 return packFloat128( zSign
, 0, 0, 0 );
7360 if ( ( bSig0
| bSig1
) == 0 ) {
7361 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7363 float_raise(float_flag_invalid
, status
);
7364 return float128_default_nan(status
);
7366 float_raise(float_flag_divbyzero
, status
);
7367 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7369 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7372 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7373 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7375 zExp
= aExp
- bExp
+ 0x3FFD;
7377 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7379 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7380 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7381 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7384 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7385 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7386 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7387 while ( (int64_t) rem0
< 0 ) {
7389 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7391 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7392 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7393 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7394 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7395 while ( (int64_t) rem1
< 0 ) {
7397 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7399 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7401 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7402 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7406 /*----------------------------------------------------------------------------
7407 | Returns the remainder of the quadruple-precision floating-point value `a'
7408 | with respect to the corresponding value `b'. The operation is performed
7409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7410 *----------------------------------------------------------------------------*/
7412 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7415 int32_t aExp
, bExp
, expDiff
;
7416 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7417 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7420 aSig1
= extractFloat128Frac1( a
);
7421 aSig0
= extractFloat128Frac0( a
);
7422 aExp
= extractFloat128Exp( a
);
7423 aSign
= extractFloat128Sign( a
);
7424 bSig1
= extractFloat128Frac1( b
);
7425 bSig0
= extractFloat128Frac0( b
);
7426 bExp
= extractFloat128Exp( b
);
7427 if ( aExp
== 0x7FFF ) {
7428 if ( ( aSig0
| aSig1
)
7429 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7430 return propagateFloat128NaN(a
, b
, status
);
7434 if ( bExp
== 0x7FFF ) {
7435 if (bSig0
| bSig1
) {
7436 return propagateFloat128NaN(a
, b
, status
);
7441 if ( ( bSig0
| bSig1
) == 0 ) {
7443 float_raise(float_flag_invalid
, status
);
7444 return float128_default_nan(status
);
7446 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7449 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7450 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7452 expDiff
= aExp
- bExp
;
7453 if ( expDiff
< -1 ) return a
;
7455 aSig0
| UINT64_C(0x0001000000000000),
7457 15 - ( expDiff
< 0 ),
7462 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7463 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7464 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7466 while ( 0 < expDiff
) {
7467 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7468 q
= ( 4 < q
) ? q
- 4 : 0;
7469 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7470 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7471 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7472 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7475 if ( -64 < expDiff
) {
7476 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7477 q
= ( 4 < q
) ? q
- 4 : 0;
7479 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7481 if ( expDiff
< 0 ) {
7482 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7485 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7487 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7488 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7491 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7492 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7495 alternateASig0
= aSig0
;
7496 alternateASig1
= aSig1
;
7498 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7499 } while ( 0 <= (int64_t) aSig0
);
7501 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7502 if ( ( sigMean0
< 0 )
7503 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7504 aSig0
= alternateASig0
;
7505 aSig1
= alternateASig1
;
7507 zSign
= ( (int64_t) aSig0
< 0 );
7508 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7509 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7513 /*----------------------------------------------------------------------------
7514 | Returns the square root of the quadruple-precision floating-point value `a'.
7515 | The operation is performed according to the IEC/IEEE Standard for Binary
7516 | Floating-Point Arithmetic.
7517 *----------------------------------------------------------------------------*/
7519 float128
float128_sqrt(float128 a
, float_status
*status
)
7523 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7524 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7526 aSig1
= extractFloat128Frac1( a
);
7527 aSig0
= extractFloat128Frac0( a
);
7528 aExp
= extractFloat128Exp( a
);
7529 aSign
= extractFloat128Sign( a
);
7530 if ( aExp
== 0x7FFF ) {
7531 if (aSig0
| aSig1
) {
7532 return propagateFloat128NaN(a
, a
, status
);
7534 if ( ! aSign
) return a
;
7538 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7540 float_raise(float_flag_invalid
, status
);
7541 return float128_default_nan(status
);
7544 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7545 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7547 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7548 aSig0
|= UINT64_C(0x0001000000000000);
7549 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7550 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7551 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7552 doubleZSig0
= zSig0
<<1;
7553 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7554 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7555 while ( (int64_t) rem0
< 0 ) {
7558 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7560 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7561 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7562 if ( zSig1
== 0 ) zSig1
= 1;
7563 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7564 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7565 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7566 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7567 while ( (int64_t) rem1
< 0 ) {
7569 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7571 term2
|= doubleZSig0
;
7572 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7574 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7576 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7577 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7581 static inline FloatRelation
7582 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7583 float_status
*status
)
7587 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7588 float_raise(float_flag_invalid
, status
);
7589 return float_relation_unordered
;
7591 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7592 ( extractFloatx80Frac( a
)<<1 ) ) ||
7593 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7594 ( extractFloatx80Frac( b
)<<1 ) )) {
7596 floatx80_is_signaling_nan(a
, status
) ||
7597 floatx80_is_signaling_nan(b
, status
)) {
7598 float_raise(float_flag_invalid
, status
);
7600 return float_relation_unordered
;
7602 aSign
= extractFloatx80Sign( a
);
7603 bSign
= extractFloatx80Sign( b
);
7604 if ( aSign
!= bSign
) {
7606 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7607 ( ( a
.low
| b
.low
) == 0 ) ) {
7609 return float_relation_equal
;
7611 return 1 - (2 * aSign
);
7614 /* Normalize pseudo-denormals before comparison. */
7615 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7618 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7621 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7622 return float_relation_equal
;
7624 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7629 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7631 return floatx80_compare_internal(a
, b
, 0, status
);
7634 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7635 float_status
*status
)
7637 return floatx80_compare_internal(a
, b
, 1, status
);
7640 static inline FloatRelation
7641 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7642 float_status
*status
)
7646 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7647 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7648 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7649 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7651 float128_is_signaling_nan(a
, status
) ||
7652 float128_is_signaling_nan(b
, status
)) {
7653 float_raise(float_flag_invalid
, status
);
7655 return float_relation_unordered
;
7657 aSign
= extractFloat128Sign( a
);
7658 bSign
= extractFloat128Sign( b
);
7659 if ( aSign
!= bSign
) {
7660 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7662 return float_relation_equal
;
7664 return 1 - (2 * aSign
);
7667 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7668 return float_relation_equal
;
7670 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7675 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7677 return float128_compare_internal(a
, b
, 0, status
);
7680 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7681 float_status
*status
)
7683 return float128_compare_internal(a
, b
, 1, status
);
7686 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7692 if (floatx80_invalid_encoding(a
)) {
7693 float_raise(float_flag_invalid
, status
);
7694 return floatx80_default_nan(status
);
7696 aSig
= extractFloatx80Frac( a
);
7697 aExp
= extractFloatx80Exp( a
);
7698 aSign
= extractFloatx80Sign( a
);
7700 if ( aExp
== 0x7FFF ) {
7702 return propagateFloatx80NaN(a
, a
, status
);
7716 } else if (n
< -0x10000) {
7721 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7722 aSign
, aExp
, aSig
, 0, status
);
7725 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7729 uint64_t aSig0
, aSig1
;
7731 aSig1
= extractFloat128Frac1( a
);
7732 aSig0
= extractFloat128Frac0( a
);
7733 aExp
= extractFloat128Exp( a
);
7734 aSign
= extractFloat128Sign( a
);
7735 if ( aExp
== 0x7FFF ) {
7736 if ( aSig0
| aSig1
) {
7737 return propagateFloat128NaN(a
, a
, status
);
7742 aSig0
|= UINT64_C(0x0001000000000000);
7743 } else if (aSig0
== 0 && aSig1
== 0) {
7751 } else if (n
< -0x10000) {
7756 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7761 static void __attribute__((constructor
)) softfloat_init(void)
7763 union_float64 ua
, ub
, uc
, ur
;
7765 if (QEMU_NO_HARDFLOAT
) {
7769 * Test that the host's FMA is not obviously broken. For example,
7770 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7771 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7773 ua
.s
= 0x0020000000000001ULL
;
7774 ub
.s
= 0x3ca0000000000000ULL
;
7775 uc
.s
= 0x0020000000000000ULL
;
7776 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7777 if (ur
.s
!= 0x0020000000000001ULL
) {
7778 force_soft_fma
= true;