4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT 63
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
540 /* Structure holding all of the relevant parameters for a format.
541 * exp_size: the size of the exponent field
542 * exp_bias: the offset applied to the exponent field
543 * exp_max: the maximum normalised exponent
544 * frac_size: the size of the fraction field
545 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546 * The following are computed based the size of fraction
547 * frac_lsb: least significant bit of fraction
548 * frac_lsbm1: the bit below the least significant bit (for rounding)
549 * round_mask/roundeven_mask: masks used for rounding
550 * The following optional modifiers are available:
551 * arm_althp: handle ARM Alternative Half Precision
562 uint64_t roundeven_mask
;
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F) \
569 .exp_bias = ((1 << E) - 1) >> 1, \
570 .exp_max = (1 << E) - 1, \
572 .frac_shift = (-F - 1) & 63, \
573 .frac_lsb = 1ull << ((-F - 1) & 63), \
574 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \
575 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \
576 .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
578 static const FloatFmt float16_params
= {
582 static const FloatFmt float16_params_ahp
= {
587 static const FloatFmt bfloat16_params
= {
591 static const FloatFmt float32_params
= {
595 static const FloatFmt float64_params
= {
599 static const FloatFmt float128_params
= {
600 FLOAT_PARAMS(15, 112)
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
606 const int f_size
= fmt
->frac_size
;
607 const int e_size
= fmt
->exp_size
;
609 *r
= (FloatParts64
) {
610 .cls
= float_class_unclassified
,
611 .sign
= extract64(raw
, f_size
+ e_size
, 1),
612 .exp
= extract64(raw
, f_size
, e_size
),
613 .frac
= extract64(raw
, 0, f_size
)
617 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
619 unpack_raw64(p
, &float16_params
, f
);
622 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
624 unpack_raw64(p
, &bfloat16_params
, f
);
627 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
629 unpack_raw64(p
, &float32_params
, f
);
632 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
634 unpack_raw64(p
, &float64_params
, f
);
637 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
639 const int f_size
= float128_params
.frac_size
- 64;
640 const int e_size
= float128_params
.exp_size
;
642 *p
= (FloatParts128
) {
643 .cls
= float_class_unclassified
,
644 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
645 .exp
= extract64(f
.high
, f_size
, e_size
),
646 .frac_hi
= extract64(f
.high
, 0, f_size
),
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
654 const int f_size
= fmt
->frac_size
;
655 const int e_size
= fmt
->exp_size
;
658 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
659 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
660 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
664 static inline float16
float16_pack_raw(const FloatParts64
*p
)
666 return make_float16(pack_raw64(p
, &float16_params
));
669 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
671 return pack_raw64(p
, &bfloat16_params
);
674 static inline float32
float32_pack_raw(const FloatParts64
*p
)
676 return make_float32(pack_raw64(p
, &float32_params
));
679 static inline float64
float64_pack_raw(const FloatParts64
*p
)
681 return make_float64(pack_raw64(p
, &float64_params
));
684 static float128
float128_pack_raw(const FloatParts128
*p
)
686 const int f_size
= float128_params
.frac_size
- 64;
687 const int e_size
= float128_params
.exp_size
;
690 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
691 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
692 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
693 return make_float128(hi
, p
->frac_lo
);
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
706 #define PARTS_GENERIC_64_128(NAME, P) \
707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
712 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
713 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
717 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
719 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
724 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
725 FloatParts64
*c
, float_status
*s
,
726 int ab_mask
, int abc_mask
);
727 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
731 int ab_mask
, int abc_mask
);
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
736 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
737 const FloatFmt
*fmt
);
738 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
739 const FloatFmt
*fmt
);
741 #define parts_canonicalize(A, S, F) \
742 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
744 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
745 const FloatFmt
*fmt
);
746 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
747 const FloatFmt
*fmt
);
749 #define parts_uncanon(A, S, F) \
750 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
752 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
753 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
755 #define parts_add_normal(A, B) \
756 PARTS_GENERIC_64_128(add_normal, A)(A, B)
758 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
759 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
761 #define parts_sub_normal(A, B) \
762 PARTS_GENERIC_64_128(sub_normal, A)(A, B)
764 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
765 float_status
*s
, bool subtract
);
766 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
767 float_status
*s
, bool subtract
);
769 #define parts_addsub(A, B, S, Z) \
770 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
776 #define FRAC_GENERIC_64_128(NAME, P) \
777 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
779 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
781 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
784 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
787 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
788 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
794 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
796 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
799 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
801 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
802 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
805 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
807 static void frac64_allones(FloatParts64
*a
)
812 static void frac128_allones(FloatParts128
*a
)
814 a
->frac_hi
= a
->frac_lo
= -1;
817 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
819 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
821 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
824 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
826 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
828 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
833 return ta
< tb
? -1 : 1;
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
838 static void frac64_clear(FloatParts64
*a
)
843 static void frac128_clear(FloatParts128
*a
)
845 a
->frac_hi
= a
->frac_lo
= 0;
848 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
850 static bool frac64_eqz(FloatParts64
*a
)
855 static bool frac128_eqz(FloatParts128
*a
)
857 return (a
->frac_hi
| a
->frac_lo
) == 0;
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
862 static void frac64_neg(FloatParts64
*a
)
867 static void frac128_neg(FloatParts128
*a
)
870 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
871 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
874 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A)
876 static int frac64_normalize(FloatParts64
*a
)
879 int shift
= clz64(a
->frac
);
886 static int frac128_normalize(FloatParts128
*a
)
889 int shl
= clz64(a
->frac_hi
);
892 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
893 a
->frac_lo
= (a
->frac_lo
<< shl
);
896 } else if (a
->frac_lo
) {
897 int shl
= clz64(a
->frac_lo
);
898 a
->frac_hi
= (a
->frac_lo
<< shl
);
905 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
907 static void frac64_shl(FloatParts64
*a
, int c
)
912 static void frac128_shl(FloatParts128
*a
, int c
)
914 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
917 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
919 static void frac64_shr(FloatParts64
*a
, int c
)
924 static void frac128_shr(FloatParts128
*a
, int c
)
926 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
929 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
931 static void frac64_shrjam(FloatParts64
*a
, int c
)
933 shift64RightJamming(a
->frac
, c
, &a
->frac
);
936 static void frac128_shrjam(FloatParts128
*a
, int c
)
938 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
941 #define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C)
943 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
945 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
948 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
951 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
952 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
956 #define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B)
958 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
959 #define FloatPartsN glue(FloatParts,N)
963 #include "softfloat-parts-addsub.c.inc"
964 #include "softfloat-parts.c.inc"
969 #include "softfloat-parts-addsub.c.inc"
970 #include "softfloat-parts.c.inc"
977 * Pack/unpack routines with a specific FloatFmt.
980 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
981 float_status
*s
, const FloatFmt
*params
)
983 float16_unpack_raw(p
, f
);
984 parts_canonicalize(p
, s
, params
);
987 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
990 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
993 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
996 bfloat16_unpack_raw(p
, f
);
997 parts_canonicalize(p
, s
, &bfloat16_params
);
1000 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1002 const FloatFmt
*params
)
1004 parts_uncanon(p
, s
, params
);
1005 return float16_pack_raw(p
);
1008 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1011 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1014 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1017 parts_uncanon(p
, s
, &bfloat16_params
);
1018 return bfloat16_pack_raw(p
);
1021 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1024 float32_unpack_raw(p
, f
);
1025 parts_canonicalize(p
, s
, &float32_params
);
1028 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1031 parts_uncanon(p
, s
, &float32_params
);
1032 return float32_pack_raw(p
);
1035 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1038 float64_unpack_raw(p
, f
);
1039 parts_canonicalize(p
, s
, &float64_params
);
1042 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1045 parts_uncanon(p
, s
, &float64_params
);
1046 return float64_pack_raw(p
);
1049 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1052 float128_unpack_raw(p
, f
);
1053 parts_canonicalize(p
, s
, &float128_params
);
1056 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1059 parts_uncanon(p
, s
, &float128_params
);
1060 return float128_pack_raw(p
);
1064 * Addition and subtraction
1067 static float16 QEMU_FLATTEN
1068 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1070 FloatParts64 pa
, pb
, *pr
;
1072 float16_unpack_canonical(&pa
, a
, status
);
1073 float16_unpack_canonical(&pb
, b
, status
);
1074 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1076 return float16_round_pack_canonical(pr
, status
);
1079 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1081 return float16_addsub(a
, b
, status
, false);
1084 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1086 return float16_addsub(a
, b
, status
, true);
1089 static float32 QEMU_SOFTFLOAT_ATTR
1090 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1092 FloatParts64 pa
, pb
, *pr
;
1094 float32_unpack_canonical(&pa
, a
, status
);
1095 float32_unpack_canonical(&pb
, b
, status
);
1096 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1098 return float32_round_pack_canonical(pr
, status
);
1101 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1103 return soft_f32_addsub(a
, b
, status
, false);
1106 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1108 return soft_f32_addsub(a
, b
, status
, true);
1111 static float64 QEMU_SOFTFLOAT_ATTR
1112 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1114 FloatParts64 pa
, pb
, *pr
;
1116 float64_unpack_canonical(&pa
, a
, status
);
1117 float64_unpack_canonical(&pb
, b
, status
);
1118 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1120 return float64_round_pack_canonical(pr
, status
);
1123 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1125 return soft_f64_addsub(a
, b
, status
, false);
1128 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1130 return soft_f64_addsub(a
, b
, status
, true);
/* Host-FPU primitives used on the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1153 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1155 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1156 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1158 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1161 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1163 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1164 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1166 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1170 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1171 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1173 return float32_gen2(a
, b
, s
, hard
, soft
,
1174 f32_is_zon2
, f32_addsubmul_post
);
1177 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1178 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1180 return float64_gen2(a
, b
, s
, hard
, soft
,
1181 f64_is_zon2
, f64_addsubmul_post
);
1184 float32 QEMU_FLATTEN
1185 float32_add(float32 a
, float32 b
, float_status
*s
)
1187 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1190 float32 QEMU_FLATTEN
1191 float32_sub(float32 a
, float32 b
, float_status
*s
)
1193 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1196 float64 QEMU_FLATTEN
1197 float64_add(float64 a
, float64 b
, float_status
*s
)
1199 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1202 float64 QEMU_FLATTEN
1203 float64_sub(float64 a
, float64 b
, float_status
*s
)
1205 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1208 static bfloat16 QEMU_FLATTEN
1209 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1211 FloatParts64 pa
, pb
, *pr
;
1213 bfloat16_unpack_canonical(&pa
, a
, status
);
1214 bfloat16_unpack_canonical(&pb
, b
, status
);
1215 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1217 return bfloat16_round_pack_canonical(pr
, status
);
1220 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1222 return bfloat16_addsub(a
, b
, status
, false);
1225 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1227 return bfloat16_addsub(a
, b
, status
, true);
1230 static float128 QEMU_FLATTEN
1231 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1233 FloatParts128 pa
, pb
, *pr
;
1235 float128_unpack_canonical(&pa
, a
, status
);
1236 float128_unpack_canonical(&pb
, b
, status
);
1237 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1239 return float128_round_pack_canonical(pr
, status
);
1242 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
1244 return float128_addsub(a
, b
, status
, false);
1247 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
1249 return float128_addsub(a
, b
, status
, true);
1253 * Returns the result of multiplying the floating-point values `a' and
1254 * `b'. The operation is performed according to the IEC/IEEE Standard
1255 * for Binary Floating-Point Arithmetic.
1258 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1260 bool sign
= a
.sign
^ b
.sign
;
1262 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1264 int exp
= a
.exp
+ b
.exp
;
1266 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1267 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1280 /* handle all the NaN cases */
1281 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1282 return *parts_pick_nan(&a
, &b
, s
);
1284 /* Inf * Zero == NaN */
1285 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1286 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1287 float_raise(float_flag_invalid
, s
);
1288 parts_default_nan(&a
, s
);
1291 /* Multiply by 0 or Inf */
1292 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1296 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1300 g_assert_not_reached();
1303 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1305 FloatParts64 pa
, pb
, pr
;
1307 float16_unpack_canonical(&pa
, a
, status
);
1308 float16_unpack_canonical(&pb
, b
, status
);
1309 pr
= mul_floats(pa
, pb
, status
);
1311 return float16_round_pack_canonical(&pr
, status
);
1314 static float32 QEMU_SOFTFLOAT_ATTR
1315 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1317 FloatParts64 pa
, pb
, pr
;
1319 float32_unpack_canonical(&pa
, a
, status
);
1320 float32_unpack_canonical(&pb
, b
, status
);
1321 pr
= mul_floats(pa
, pb
, status
);
1323 return float32_round_pack_canonical(&pr
, status
);
1326 static float64 QEMU_SOFTFLOAT_ATTR
1327 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1329 FloatParts64 pa
, pb
, pr
;
1331 float64_unpack_canonical(&pa
, a
, status
);
1332 float64_unpack_canonical(&pb
, b
, status
);
1333 pr
= mul_floats(pa
, pb
, status
);
1335 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU multiply primitives for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1348 float32 QEMU_FLATTEN
1349 float32_mul(float32 a
, float32 b
, float_status
*s
)
1351 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1352 f32_is_zon2
, f32_addsubmul_post
);
1355 float64 QEMU_FLATTEN
1356 float64_mul(float64 a
, float64 b
, float_status
*s
)
1358 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1359 f64_is_zon2
, f64_addsubmul_post
);
1363 * Returns the result of multiplying the bfloat16
1364 * values `a' and `b'.
1367 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1369 FloatParts64 pa
, pb
, pr
;
1371 bfloat16_unpack_canonical(&pa
, a
, status
);
1372 bfloat16_unpack_canonical(&pb
, b
, status
);
1373 pr
= mul_floats(pa
, pb
, status
);
1375 return bfloat16_round_pack_canonical(&pr
, status
);
1379 * Returns the result of multiplying the floating-point values `a' and
1380 * `b' then adding 'c', with no intermediate rounding step after the
1381 * multiplication. The operation is performed according to the
1382 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1383 * The flags argument allows the caller to select negation of the
1384 * addend, the intermediate product, or the final result. (The
1385 * difference between this and having the caller do a separate
1386 * negation is that negating externally will flip the sign bit on
1390 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1391 int flags
, float_status
*s
)
1393 bool inf_zero
, p_sign
;
1394 bool sign_flip
= flags
& float_muladd_negate_result
;
1398 int ab_mask
, abc_mask
;
1400 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1401 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1402 inf_zero
= ab_mask
== float_cmask_infzero
;
1404 /* It is implementation-defined whether the cases of (0,inf,qnan)
1405 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1406 * they return if they do), so we have to hand this information
1407 * off to the target-specific pick-a-NaN routine.
1409 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1410 return *parts_pick_nan_muladd(&a
, &b
, &c
, s
, ab_mask
, abc_mask
);
1414 float_raise(float_flag_invalid
, s
);
1415 parts_default_nan(&a
, s
);
1419 if (flags
& float_muladd_negate_c
) {
1423 p_sign
= a
.sign
^ b
.sign
;
1425 if (flags
& float_muladd_negate_product
) {
1429 if (ab_mask
& float_cmask_inf
) {
1430 p_class
= float_class_inf
;
1431 } else if (ab_mask
& float_cmask_zero
) {
1432 p_class
= float_class_zero
;
1434 p_class
= float_class_normal
;
1437 if (c
.cls
== float_class_inf
) {
1438 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1439 float_raise(float_flag_invalid
, s
);
1440 parts_default_nan(&c
, s
);
1442 c
.sign
^= sign_flip
;
1447 if (p_class
== float_class_inf
) {
1448 a
.cls
= float_class_inf
;
1449 a
.sign
= p_sign
^ sign_flip
;
1453 if (p_class
== float_class_zero
) {
1454 if (c
.cls
== float_class_zero
) {
1455 if (p_sign
!= c
.sign
) {
1456 p_sign
= s
->float_rounding_mode
== float_round_down
;
1459 } else if (flags
& float_muladd_halve_result
) {
1462 c
.sign
^= sign_flip
;
1466 /* a & b should be normals now... */
1467 assert(a
.cls
== float_class_normal
&&
1468 b
.cls
== float_class_normal
);
1470 p_exp
= a
.exp
+ b
.exp
;
1472 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1474 /* Renormalize to the msb. */
1475 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1478 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1482 if (c
.cls
!= float_class_zero
) {
1483 int exp_diff
= p_exp
- c
.exp
;
1484 if (p_sign
== c
.sign
) {
1486 if (exp_diff
<= 0) {
1487 shift64RightJamming(hi
, -exp_diff
, &hi
);
1489 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1490 shift64RightJamming(hi
, 1, &hi
);
1491 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1495 uint64_t c_hi
, c_lo
, over
;
1496 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1497 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1499 shift64RightJamming(hi
, 1, &hi
);
1500 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1506 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1508 if (exp_diff
<= 0) {
1509 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1512 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1513 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1515 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1520 shift128RightJamming(c_hi
, c_lo
,
1523 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1526 if (hi
== 0 && lo
== 0) {
1527 a
.cls
= float_class_zero
;
1528 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1529 a
.sign
^= sign_flip
;
1536 shift
= clz64(lo
) + 64;
1538 /* Normalizing to a binary point of 124 is the
1539 correct adjust for the exponent. However since we're
1540 shifting, we might as well put the binary point back
1541 at 63 where we really want it. Therefore shift as
1542 if we're leaving 1 bit at the top of the word, but
1543 adjust the exponent as if we're leaving 3 bits. */
1544 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1551 if (flags
& float_muladd_halve_result
) {
1555 /* finally prepare our result */
1556 a
.cls
= float_class_normal
;
1557 a
.sign
= p_sign
^ sign_flip
;
1564 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1565 int flags
, float_status
*status
)
1567 FloatParts64 pa
, pb
, pc
, pr
;
1569 float16_unpack_canonical(&pa
, a
, status
);
1570 float16_unpack_canonical(&pb
, b
, status
);
1571 float16_unpack_canonical(&pc
, c
, status
);
1572 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1574 return float16_round_pack_canonical(&pr
, status
);
1577 static float32 QEMU_SOFTFLOAT_ATTR
1578 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1579 float_status
*status
)
1581 FloatParts64 pa
, pb
, pc
, pr
;
1583 float32_unpack_canonical(&pa
, a
, status
);
1584 float32_unpack_canonical(&pb
, b
, status
);
1585 float32_unpack_canonical(&pc
, c
, status
);
1586 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1588 return float32_round_pack_canonical(&pr
, status
);
1591 static float64 QEMU_SOFTFLOAT_ATTR
1592 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1593 float_status
*status
)
1595 FloatParts64 pa
, pb
, pc
, pr
;
1597 float64_unpack_canonical(&pa
, a
, status
);
1598 float64_unpack_canonical(&pb
, b
, status
);
1599 float64_unpack_canonical(&pc
, c
, status
);
1600 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1602 return float64_round_pack_canonical(&pr
, status
);
/* When set, the muladd fast paths below skip the host-FPU route and
 * always fall back to the softfloat implementation. */
static bool force_soft_fma;
1607 float32 QEMU_FLATTEN
1608 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1610 union_float32 ua
, ub
, uc
, ur
;
1616 if (unlikely(!can_use_fpu(s
))) {
1619 if (unlikely(flags
& float_muladd_halve_result
)) {
1623 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1624 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1628 if (unlikely(force_soft_fma
)) {
1633 * When (a || b) == 0, there's no need to check for under/over flow,
1634 * since we know the addend is (normal || 0) and the product is 0.
1636 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1640 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1641 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1642 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1644 if (flags
& float_muladd_negate_c
) {
1649 union_float32 ua_orig
= ua
;
1650 union_float32 uc_orig
= uc
;
1652 if (flags
& float_muladd_negate_product
) {
1655 if (flags
& float_muladd_negate_c
) {
1659 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1661 if (unlikely(f32_is_inf(ur
))) {
1662 float_raise(float_flag_overflow
, s
);
1663 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1669 if (flags
& float_muladd_negate_result
) {
1670 return float32_chs(ur
.s
);
1675 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1678 float64 QEMU_FLATTEN
1679 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1681 union_float64 ua
, ub
, uc
, ur
;
1687 if (unlikely(!can_use_fpu(s
))) {
1690 if (unlikely(flags
& float_muladd_halve_result
)) {
1694 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1695 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1699 if (unlikely(force_soft_fma
)) {
1704 * When (a || b) == 0, there's no need to check for under/over flow,
1705 * since we know the addend is (normal || 0) and the product is 0.
1707 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1711 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1712 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1713 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1715 if (flags
& float_muladd_negate_c
) {
1720 union_float64 ua_orig
= ua
;
1721 union_float64 uc_orig
= uc
;
1723 if (flags
& float_muladd_negate_product
) {
1726 if (flags
& float_muladd_negate_c
) {
1730 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1732 if (unlikely(f64_is_inf(ur
))) {
1733 float_raise(float_flag_overflow
, s
);
1734 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1740 if (flags
& float_muladd_negate_result
) {
1741 return float64_chs(ur
.s
);
1746 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1750 * Returns the result of multiplying the bfloat16 values `a'
1751 * and `b' then adding 'c', with no intermediate rounding step after the
1755 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1756 int flags
, float_status
*status
)
1758 FloatParts64 pa
, pb
, pc
, pr
;
1760 bfloat16_unpack_canonical(&pa
, a
, status
);
1761 bfloat16_unpack_canonical(&pb
, b
, status
);
1762 bfloat16_unpack_canonical(&pc
, c
, status
);
1763 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1765 return bfloat16_round_pack_canonical(&pr
, status
);
1769 * Returns the result of dividing the floating-point value `a' by the
1770 * corresponding value `b'. The operation is performed according to
1771 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1774 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1776 bool sign
= a
.sign
^ b
.sign
;
1778 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1779 uint64_t n0
, n1
, q
, r
;
1780 int exp
= a
.exp
- b
.exp
;
1783 * We want a 2*N / N-bit division to produce exactly an N-bit
1784 * result, so that we do not lose any precision and so that we
1785 * do not have to renormalize afterward. If A.frac < B.frac,
1786 * then division would produce an (N-1)-bit result; shift A left
1787 * by one to produce the an N-bit result, and decrement the
1788 * exponent to match.
1790 * The udiv_qrnnd algorithm that we're using requires normalization,
1791 * i.e. the msb of the denominator must be set, which is already true.
1793 if (a
.frac
< b
.frac
) {
1795 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1797 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1799 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1801 /* Set lsb if there is a remainder, to set inexact. */
1802 a
.frac
= q
| (r
!= 0);
1807 /* handle all the NaN cases */
1808 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1809 return *parts_pick_nan(&a
, &b
, s
);
1811 /* 0/0 or Inf/Inf */
1814 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1815 float_raise(float_flag_invalid
, s
);
1816 parts_default_nan(&a
, s
);
1819 /* Inf / x or 0 / x */
1820 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1825 if (b
.cls
== float_class_zero
) {
1826 float_raise(float_flag_divbyzero
, s
);
1827 a
.cls
= float_class_inf
;
1832 if (b
.cls
== float_class_inf
) {
1833 a
.cls
= float_class_zero
;
1837 g_assert_not_reached();
1840 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1842 FloatParts64 pa
, pb
, pr
;
1844 float16_unpack_canonical(&pa
, a
, status
);
1845 float16_unpack_canonical(&pb
, b
, status
);
1846 pr
= div_floats(pa
, pb
, status
);
1848 return float16_round_pack_canonical(&pr
, status
);
1851 static float32 QEMU_SOFTFLOAT_ATTR
1852 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1854 FloatParts64 pa
, pb
, pr
;
1856 float32_unpack_canonical(&pa
, a
, status
);
1857 float32_unpack_canonical(&pb
, b
, status
);
1858 pr
= div_floats(pa
, pb
, status
);
1860 return float32_round_pack_canonical(&pr
, status
);
1863 static float64 QEMU_SOFTFLOAT_ATTR
1864 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1866 FloatParts64 pa
, pb
, pr
;
1868 float64_unpack_canonical(&pa
, a
, status
);
1869 float64_unpack_canonical(&pb
, b
, status
);
1870 pr
= div_floats(pa
, pb
, status
);
1872 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU divide primitives for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1885 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1887 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1888 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1889 fpclassify(b
.h
) == FP_NORMAL
;
1891 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1894 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1896 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1897 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1898 fpclassify(b
.h
) == FP_NORMAL
;
1900 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1903 static bool f32_div_post(union_float32 a
, union_float32 b
)
1905 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1906 return fpclassify(a
.h
) != FP_ZERO
;
1908 return !float32_is_zero(a
.s
);
1911 static bool f64_div_post(union_float64 a
, union_float64 b
)
1913 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1914 return fpclassify(a
.h
) != FP_ZERO
;
1916 return !float64_is_zero(a
.s
);
1919 float32 QEMU_FLATTEN
1920 float32_div(float32 a
, float32 b
, float_status
*s
)
1922 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1923 f32_div_pre
, f32_div_post
);
1926 float64 QEMU_FLATTEN
1927 float64_div(float64 a
, float64 b
, float_status
*s
)
1929 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1930 f64_div_pre
, f64_div_post
);
1934 * Returns the result of dividing the bfloat16
1935 * value `a' by the corresponding value `b'.
1938 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1940 FloatParts64 pa
, pb
, pr
;
1942 bfloat16_unpack_canonical(&pa
, a
, status
);
1943 bfloat16_unpack_canonical(&pb
, b
, status
);
1944 pr
= div_floats(pa
, pb
, status
);
1946 return bfloat16_round_pack_canonical(&pr
, status
);
1950 * Float to Float conversions
1952 * Returns the result of converting one float format to another. The
1953 * conversion is performed according to the IEC/IEEE Standard for
1954 * Binary Floating-Point Arithmetic.
1956 * The float_to_float helper only needs to take care of raising
1957 * invalid exceptions and handling the conversion on NaNs.
1960 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1963 if (dstf
->arm_althp
) {
1965 case float_class_qnan
:
1966 case float_class_snan
:
1967 /* There is no NaN in the destination format. Raise Invalid
1968 * and return a zero with the sign of the input NaN.
1970 float_raise(float_flag_invalid
, s
);
1971 a
.cls
= float_class_zero
;
1976 case float_class_inf
:
1977 /* There is no Inf in the destination format. Raise Invalid
1978 * and return the maximum normal with the correct sign.
1980 float_raise(float_flag_invalid
, s
);
1981 a
.cls
= float_class_normal
;
1982 a
.exp
= dstf
->exp_max
;
1983 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1989 } else if (is_nan(a
.cls
)) {
1990 parts_return_nan(&a
, s
);
1995 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1997 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1998 FloatParts64 pa
, pr
;
2000 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2001 pr
= float_to_float(pa
, &float32_params
, s
);
2002 return float32_round_pack_canonical(&pr
, s
);
2005 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2007 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2008 FloatParts64 pa
, pr
;
2010 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2011 pr
= float_to_float(pa
, &float64_params
, s
);
2012 return float64_round_pack_canonical(&pr
, s
);
2015 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2017 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2018 FloatParts64 pa
, pr
;
2020 float32_unpack_canonical(&pa
, a
, s
);
2021 pr
= float_to_float(pa
, fmt16
, s
);
2022 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2025 static float64 QEMU_SOFTFLOAT_ATTR
2026 soft_float32_to_float64(float32 a
, float_status
*s
)
2028 FloatParts64 pa
, pr
;
2030 float32_unpack_canonical(&pa
, a
, s
);
2031 pr
= float_to_float(pa
, &float64_params
, s
);
2032 return float64_round_pack_canonical(&pr
, s
);
2035 float64
float32_to_float64(float32 a
, float_status
*s
)
2037 if (likely(float32_is_normal(a
))) {
2038 /* Widening conversion can never produce inexact results. */
2044 } else if (float32_is_zero(a
)) {
2045 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2047 return soft_float32_to_float64(a
, s
);
2051 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2053 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2054 FloatParts64 pa
, pr
;
2056 float64_unpack_canonical(&pa
, a
, s
);
2057 pr
= float_to_float(pa
, fmt16
, s
);
2058 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2061 float32
float64_to_float32(float64 a
, float_status
*s
)
2063 FloatParts64 pa
, pr
;
2065 float64_unpack_canonical(&pa
, a
, s
);
2066 pr
= float_to_float(pa
, &float32_params
, s
);
2067 return float32_round_pack_canonical(&pr
, s
);
2070 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2072 FloatParts64 pa
, pr
;
2074 bfloat16_unpack_canonical(&pa
, a
, s
);
2075 pr
= float_to_float(pa
, &float32_params
, s
);
2076 return float32_round_pack_canonical(&pr
, s
);
2079 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2081 FloatParts64 pa
, pr
;
2083 bfloat16_unpack_canonical(&pa
, a
, s
);
2084 pr
= float_to_float(pa
, &float64_params
, s
);
2085 return float64_round_pack_canonical(&pr
, s
);
2088 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2090 FloatParts64 pa
, pr
;
2092 float32_unpack_canonical(&pa
, a
, s
);
2093 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2094 return bfloat16_round_pack_canonical(&pr
, s
);
2097 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2099 FloatParts64 pa
, pr
;
2101 float64_unpack_canonical(&pa
, a
, s
);
2102 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2103 return bfloat16_round_pack_canonical(&pr
, s
);
2107 * Rounds the floating-point value `a' to an integer, and returns the
2108 * result as a floating-point value. The operation is performed
2109 * according to the IEC/IEEE Standard for Binary Floating-Point
2113 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2114 int scale
, float_status
*s
)
2117 case float_class_qnan
:
2118 case float_class_snan
:
2119 parts_return_nan(&a
, s
);
2122 case float_class_zero
:
2123 case float_class_inf
:
2124 /* already "integral" */
2127 case float_class_normal
:
2128 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2131 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2132 /* already integral */
2137 /* all fractional */
2138 float_raise(float_flag_inexact
, s
);
2140 case float_round_nearest_even
:
2141 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2143 case float_round_ties_away
:
2144 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2146 case float_round_to_zero
:
2149 case float_round_up
:
2152 case float_round_down
:
2155 case float_round_to_odd
:
2159 g_assert_not_reached();
2163 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2166 a
.cls
= float_class_zero
;
2169 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2170 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2171 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2172 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2176 case float_round_nearest_even
:
2177 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2179 case float_round_ties_away
:
2182 case float_round_to_zero
:
2185 case float_round_up
:
2186 inc
= a
.sign
? 0 : rnd_mask
;
2188 case float_round_down
:
2189 inc
= a
.sign
? rnd_mask
: 0;
2191 case float_round_to_odd
:
2192 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2195 g_assert_not_reached();
2198 if (a
.frac
& rnd_mask
) {
2199 float_raise(float_flag_inexact
, s
);
2200 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2202 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2205 a
.frac
&= ~rnd_mask
;
2210 g_assert_not_reached();
2215 float16
float16_round_to_int(float16 a
, float_status
*s
)
2217 FloatParts64 pa
, pr
;
2219 float16_unpack_canonical(&pa
, a
, s
);
2220 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2221 return float16_round_pack_canonical(&pr
, s
);
2224 float32
float32_round_to_int(float32 a
, float_status
*s
)
2226 FloatParts64 pa
, pr
;
2228 float32_unpack_canonical(&pa
, a
, s
);
2229 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2230 return float32_round_pack_canonical(&pr
, s
);
2233 float64
float64_round_to_int(float64 a
, float_status
*s
)
2235 FloatParts64 pa
, pr
;
2237 float64_unpack_canonical(&pa
, a
, s
);
2238 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2239 return float64_round_pack_canonical(&pr
, s
);
2243 * Rounds the bfloat16 value `a' to an integer, and returns the
2244 * result as a bfloat16 value.
2247 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2249 FloatParts64 pa
, pr
;
2251 bfloat16_unpack_canonical(&pa
, a
, s
);
2252 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2253 return bfloat16_round_pack_canonical(&pr
, s
);
2257 * Returns the result of converting the floating-point value `a' to
2258 * the two's complement integer format. The conversion is performed
2259 * according to the IEC/IEEE Standard for Binary Floating-Point
2260 * Arithmetic---which means in particular that the conversion is
2261 * rounded according to the current rounding mode. If `a' is a NaN,
2262 * the largest positive integer is returned. Otherwise, if the
2263 * conversion overflows, the largest integer with the same sign as `a'
2267 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2268 int scale
, int64_t min
, int64_t max
,
2272 int orig_flags
= get_float_exception_flags(s
);
2273 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2276 case float_class_snan
:
2277 case float_class_qnan
:
2278 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2280 case float_class_inf
:
2281 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2282 return p
.sign
? min
: max
;
2283 case float_class_zero
:
2285 case float_class_normal
:
2286 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2287 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2292 if (r
<= -(uint64_t) min
) {
2295 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2302 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2307 g_assert_not_reached();
2311 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2316 float16_unpack_canonical(&p
, a
, s
);
2317 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2320 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2325 float16_unpack_canonical(&p
, a
, s
);
2326 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2329 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2334 float16_unpack_canonical(&p
, a
, s
);
2335 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2338 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2343 float16_unpack_canonical(&p
, a
, s
);
2344 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2347 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2352 float32_unpack_canonical(&p
, a
, s
);
2353 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2356 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2361 float32_unpack_canonical(&p
, a
, s
);
2362 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2365 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2370 float32_unpack_canonical(&p
, a
, s
);
2371 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2374 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2379 float64_unpack_canonical(&p
, a
, s
);
2380 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2383 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2388 float64_unpack_canonical(&p
, a
, s
);
2389 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2392 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2397 float64_unpack_canonical(&p
, a
, s
);
2398 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2401 int8_t float16_to_int8(float16 a
, float_status
*s
)
2403 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2406 int16_t float16_to_int16(float16 a
, float_status
*s
)
2408 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2411 int32_t float16_to_int32(float16 a
, float_status
*s
)
2413 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2416 int64_t float16_to_int64(float16 a
, float_status
*s
)
2418 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2421 int16_t float32_to_int16(float32 a
, float_status
*s
)
2423 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2426 int32_t float32_to_int32(float32 a
, float_status
*s
)
2428 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2431 int64_t float32_to_int64(float32 a
, float_status
*s
)
2433 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2436 int16_t float64_to_int16(float64 a
, float_status
*s
)
2438 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2441 int32_t float64_to_int32(float64 a
, float_status
*s
)
2443 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2446 int64_t float64_to_int64(float64 a
, float_status
*s
)
2448 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2451 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2453 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2456 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2458 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2461 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2463 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2466 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2468 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2471 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2473 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2476 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2478 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2481 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2483 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2486 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2488 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2491 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2493 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2497 * Returns the result of converting the floating-point value `a' to
2498 * the two's complement integer format.
2501 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2506 bfloat16_unpack_canonical(&p
, a
, s
);
2507 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2510 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2515 bfloat16_unpack_canonical(&p
, a
, s
);
2516 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2519 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2524 bfloat16_unpack_canonical(&p
, a
, s
);
2525 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2528 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2530 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2533 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2535 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2538 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2540 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2543 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2545 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2548 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2550 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2553 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2555 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /*
     * Snapshot the flags so round_to_int()'s inexact can be combined
     * with (rather than clobbered by) an invalid raised below.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN: invalid, saturate to the type maximum. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        /* -inf -> 0, +inf -> max; either way the conversion is invalid. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /*
         * A negative value that survived rounding cannot be represented
         * unsigned: invalid, result 0. (Negatives that rounded to zero
         * were handled by the float_class_zero case above.)
         */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude too large for even a 64-bit fraction shift. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2616 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2621 float16_unpack_canonical(&p
, a
, s
);
2622 return round_to_uint_and_pack(p
, rmode
, scale
, UINT8_MAX
, s
);
2625 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2630 float16_unpack_canonical(&p
, a
, s
);
2631 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2634 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2639 float16_unpack_canonical(&p
, a
, s
);
2640 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2643 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2648 float16_unpack_canonical(&p
, a
, s
);
2649 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2652 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2657 float32_unpack_canonical(&p
, a
, s
);
2658 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2661 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2666 float32_unpack_canonical(&p
, a
, s
);
2667 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2670 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2675 float32_unpack_canonical(&p
, a
, s
);
2676 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2679 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2684 float64_unpack_canonical(&p
, a
, s
);
2685 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2688 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2693 float64_unpack_canonical(&p
, a
, s
);
2694 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2697 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2702 float64_unpack_canonical(&p
, a
, s
);
2703 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2706 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2708 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2711 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2713 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2716 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2718 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2721 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2723 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2726 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2728 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2731 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2733 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2736 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2738 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2741 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2743 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2746 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2748 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2751 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2753 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2756 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2758 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2761 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2763 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2766 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2768 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2771 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2773 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2776 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2778 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2781 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2783 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2786 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2788 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2791 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2793 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2796 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2798 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2802 * Returns the result of converting the bfloat16 value `a' to
2803 * the unsigned integer format.
2806 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2807 int scale
, float_status
*s
)
2811 bfloat16_unpack_canonical(&p
, a
, s
);
2812 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2815 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2816 int scale
, float_status
*s
)
2820 bfloat16_unpack_canonical(&p
, a
, s
);
2821 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2824 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2825 int scale
, float_status
*s
)
2829 bfloat16_unpack_canonical(&p
, a
, s
);
2830 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2833 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2835 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2838 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2840 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2843 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2845 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2848 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2850 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2853 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2855 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2858 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2860 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Work on the magnitude; unsigned negation is well defined. */
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        /* Normalize so the MSB becomes the implicit integer bit. */
        shift = clz64(f);
        /*
         * The largest float type (even though not supported by
         * FloatParts64) is float128 with a 15-bit exponent; clamping
         * the scale to 17 bits still allows rounding to infinity
         * without overflowing the int32_t exponent field.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
2896 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2898 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2899 return float16_round_pack_canonical(&pa
, status
);
2902 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2904 return int64_to_float16_scalbn(a
, scale
, status
);
2907 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2909 return int64_to_float16_scalbn(a
, scale
, status
);
2912 float16
int64_to_float16(int64_t a
, float_status
*status
)
2914 return int64_to_float16_scalbn(a
, 0, status
);
2917 float16
int32_to_float16(int32_t a
, float_status
*status
)
2919 return int64_to_float16_scalbn(a
, 0, status
);
2922 float16
int16_to_float16(int16_t a
, float_status
*status
)
2924 return int64_to_float16_scalbn(a
, 0, status
);
2927 float16
int8_to_float16(int8_t a
, float_status
*status
)
2929 return int64_to_float16_scalbn(a
, 0, status
);
2932 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2934 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2935 return float32_round_pack_canonical(&pa
, status
);
2938 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2940 return int64_to_float32_scalbn(a
, scale
, status
);
2943 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2945 return int64_to_float32_scalbn(a
, scale
, status
);
2948 float32
int64_to_float32(int64_t a
, float_status
*status
)
2950 return int64_to_float32_scalbn(a
, 0, status
);
2953 float32
int32_to_float32(int32_t a
, float_status
*status
)
2955 return int64_to_float32_scalbn(a
, 0, status
);
2958 float32
int16_to_float32(int16_t a
, float_status
*status
)
2960 return int64_to_float32_scalbn(a
, 0, status
);
2963 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2965 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2966 return float64_round_pack_canonical(&pa
, status
);
2969 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2971 return int64_to_float64_scalbn(a
, scale
, status
);
2974 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2976 return int64_to_float64_scalbn(a
, scale
, status
);
2979 float64
int64_to_float64(int64_t a
, float_status
*status
)
2981 return int64_to_float64_scalbn(a
, 0, status
);
2984 float64
int32_to_float64(int32_t a
, float_status
*status
)
2986 return int64_to_float64_scalbn(a
, 0, status
);
2989 float64
int16_to_float64(int16_t a
, float_status
*status
)
2991 return int64_to_float64_scalbn(a
, 0, status
);
2995 * Returns the result of converting the two's complement integer `a'
2996 * to the bfloat16 format.
2999 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3001 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3002 return bfloat16_round_pack_canonical(&pa
, status
);
3005 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3007 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3010 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3012 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3015 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3017 return int64_to_bfloat16_scalbn(a
, 0, status
);
3020 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3022 return int64_to_bfloat16_scalbn(a
, 0, status
);
3025 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3027 return int64_to_bfloat16_scalbn(a
, 0, status
);
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale; see the matching comment in int_to_float(). */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        /* Normalize so the MSB becomes the implicit integer bit. */
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
3056 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3058 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3059 return float16_round_pack_canonical(&pa
, status
);
3062 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3064 return uint64_to_float16_scalbn(a
, scale
, status
);
3067 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3069 return uint64_to_float16_scalbn(a
, scale
, status
);
3072 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3074 return uint64_to_float16_scalbn(a
, 0, status
);
3077 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3079 return uint64_to_float16_scalbn(a
, 0, status
);
3082 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3084 return uint64_to_float16_scalbn(a
, 0, status
);
3087 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3089 return uint64_to_float16_scalbn(a
, 0, status
);
3092 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3094 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3095 return float32_round_pack_canonical(&pa
, status
);
3098 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3100 return uint64_to_float32_scalbn(a
, scale
, status
);
3103 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3105 return uint64_to_float32_scalbn(a
, scale
, status
);
3108 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3110 return uint64_to_float32_scalbn(a
, 0, status
);
3113 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3115 return uint64_to_float32_scalbn(a
, 0, status
);
3118 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3120 return uint64_to_float32_scalbn(a
, 0, status
);
3123 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3125 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3126 return float64_round_pack_canonical(&pa
, status
);
3129 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3131 return uint64_to_float64_scalbn(a
, scale
, status
);
3134 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3136 return uint64_to_float64_scalbn(a
, scale
, status
);
3139 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3141 return uint64_to_float64_scalbn(a
, 0, status
);
3144 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3146 return uint64_to_float64_scalbn(a
, 0, status
);
3149 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3151 return uint64_to_float64_scalbn(a
, 0, status
);
3155 * Returns the result of converting the unsigned integer `a' to the
3159 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3161 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3162 return bfloat16_round_pack_canonical(&pa
, status
);
3165 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3167 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3170 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3172 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3175 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3177 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3180 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3182 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3185 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3187 return uint64_to_bfloat16_scalbn(a
, 0, status
);
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and minNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* minnum/maxnum: a quiet NaN loses to a number. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        /*
         * Collapse the class into a comparable exponent so that
         * zero < any normal < infinity, then compare (exp, frac).
         */
        int a_exp, b_exp;

        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison ignores the signs unless equal. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* For negatives the magnitude order is reversed. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: this also orders -0 below +0. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/*
 * Instantiate the public min/max entry points for float16/32/64.
 * Each generated function unpacks both operands, applies
 * minmax_floats() with the given flavor flags, and repacks.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,     \
                                     float_status *s)                  \
{                                                                      \
    FloatParts64 pa, pb, pr;                                           \
    float ## sz ## _unpack_canonical(&pa, a, s);                       \
    float ## sz ## _unpack_canonical(&pb, b, s);                       \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                \
    return float ## sz ## _round_pack_canonical(&pr, s);               \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
/* Same expansion as MINMAX above, specialized for bfloat16. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)    \
{                                                                      \
    FloatParts64 pa, pb, pr;                                           \
    bfloat16_unpack_canonical(&pa, a, s);                              \
    bfloat16_unpack_canonical(&pb, b, s);                              \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                \
    return bfloat16_round_pack_canonical(&pr, s);                      \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
/* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /*
         * A quiet compare raises invalid only for signaling NaNs;
         * a signaling compare raises it for any NaN operand.
         */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal: differing signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare magnitudes, reversing the order if negative. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ?
                float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}
/*
 * Instantiate the softfloat comparison helpers. `attr' selects the
 * function attribute (flatten vs the soft-float noinline attribute).
 */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                              \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)     \
{                                                                      \
    FloatParts64 pa, pb;                                               \
    float ## sz ## _unpack_canonical(&pa, a, s);                       \
    float ## sz ## _unpack_canonical(&pb, b, s);                       \
    return compare_floats(pa, pb, is_quiet, s);                        \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3404 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3406 return soft_f16_compare(a
, b
, false, s
);
3409 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3411 return soft_f16_compare(a
, b
, true, s
);
/*
 * float32 compare with a host-FPU fast path: the ISO C comparison
 * macros (isgreater/isless/...) never raise exceptions on quiet NaNs,
 * so ordered results can be returned directly; only the unordered
 * case falls back to softfloat to get the flag handling right.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3443 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3445 return f32_compare(a
, b
, false, s
);
3448 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3450 return f32_compare(a
, b
, true, s
);
/*
 * float64 compare with a host-FPU fast path; mirrors f32_compare().
 * Only the unordered case drops into softfloat for flag handling.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3482 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3484 return f64_compare(a
, b
, false, s
);
3487 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3489 return f64_compare(a
, b
, true, s
);
3492 static FloatRelation QEMU_FLATTEN
3493 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3495 FloatParts64 pa
, pb
;
3497 bfloat16_unpack_canonical(&pa
, a
, s
);
3498 bfloat16_unpack_canonical(&pb
, b
, s
);
3499 return compare_floats(pa
, pb
, is_quiet
, s
);
3502 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3504 return soft_bf16_compare(a
, b
, false, s
);
3507 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3509 return soft_bf16_compare(a
, b
, true, s
);
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        /* Quieten/propagate NaN input; may raise invalid for SNaN. */
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
3530 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3532 FloatParts64 pa
, pr
;
3534 float16_unpack_canonical(&pa
, a
, status
);
3535 pr
= scalbn_decomposed(pa
, n
, status
);
3536 return float16_round_pack_canonical(&pr
, status
);
3539 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3541 FloatParts64 pa
, pr
;
3543 float32_unpack_canonical(&pa
, a
, status
);
3544 pr
= scalbn_decomposed(pa
, n
, status
);
3545 return float32_round_pack_canonical(&pr
, status
);
3548 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3550 FloatParts64 pa
, pr
;
3552 float64_unpack_canonical(&pa
, a
, status
);
3553 pr
= scalbn_decomposed(pa
, n
, status
);
3554 return float64_round_pack_canonical(&pr
, status
);
3557 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3559 FloatParts64 pa
, pr
;
3561 bfloat16_unpack_canonical(&pa
, a
, status
);
3562 pr
= scalbn_decomposed(pa
, n
, status
);
3563 return bfloat16_round_pack_canonical(&pr
, status
);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative non-zero: invalid, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3638 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3640 FloatParts64 pa
, pr
;
3642 float16_unpack_canonical(&pa
, a
, status
);
3643 pr
= sqrt_float(pa
, status
, &float16_params
);
3644 return float16_round_pack_canonical(&pr
, status
);
3647 static float32 QEMU_SOFTFLOAT_ATTR
3648 soft_f32_sqrt(float32 a
, float_status
*status
)
3650 FloatParts64 pa
, pr
;
3652 float32_unpack_canonical(&pa
, a
, status
);
3653 pr
= sqrt_float(pa
, status
, &float32_params
);
3654 return float32_round_pack_canonical(&pr
, status
);
3657 static float64 QEMU_SOFTFLOAT_ATTR
3658 soft_f64_sqrt(float64 a
, float_status
*status
)
3660 FloatParts64 pa
, pr
;
3662 float64_unpack_canonical(&pa
, a
, status
);
3663 pr
= sqrt_float(pa
, status
, &float64_params
);
3664 return float64_round_pack_canonical(&pr
, status
);
3667 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3669 union_float32 ua
, ur
;
3672 if (unlikely(!can_use_fpu(s
))) {
3676 float32_input_flush1(&ua
.s
, s
);
3677 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3678 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3679 fpclassify(ua
.h
) == FP_ZERO
) ||
3683 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3684 float32_is_neg(ua
.s
))) {
3691 return soft_f32_sqrt(ua
.s
, s
);
3694 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3696 union_float64 ua
, ur
;
3699 if (unlikely(!can_use_fpu(s
))) {
3703 float64_input_flush1(&ua
.s
, s
);
3704 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3705 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3706 fpclassify(ua
.h
) == FP_ZERO
) ||
3710 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3711 float64_is_neg(ua
.s
))) {
3718 return soft_f64_sqrt(ua
.s
, s
);
3721 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3723 FloatParts64 pa
, pr
;
3725 bfloat16_unpack_canonical(&pa
, a
, status
);
3726 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3727 return bfloat16_round_pack_canonical(&pr
, status
);
3730 /*----------------------------------------------------------------------------
3731 | The pattern for a default generated NaN.
3732 *----------------------------------------------------------------------------*/
3734 float16
float16_default_nan(float_status
*status
)
3738 parts_default_nan(&p
, status
);
3739 p
.frac
>>= float16_params
.frac_shift
;
3740 return float16_pack_raw(&p
);
3743 float32
float32_default_nan(float_status
*status
)
3747 parts_default_nan(&p
, status
);
3748 p
.frac
>>= float32_params
.frac_shift
;
3749 return float32_pack_raw(&p
);
3752 float64
float64_default_nan(float_status
*status
)
3756 parts_default_nan(&p
, status
);
3757 p
.frac
>>= float64_params
.frac_shift
;
3758 return float64_pack_raw(&p
);
3761 float128
float128_default_nan(float_status
*status
)
3765 parts_default_nan(&p
, status
);
3766 frac_shr(&p
, float128_params
.frac_shift
);
3767 return float128_pack_raw(&p
);
3770 bfloat16
bfloat16_default_nan(float_status
*status
)
3774 parts_default_nan(&p
, status
);
3775 p
.frac
>>= bfloat16_params
.frac_shift
;
3776 return bfloat16_pack_raw(&p
);
3779 /*----------------------------------------------------------------------------
3780 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3781 *----------------------------------------------------------------------------*/
3783 float16
float16_silence_nan(float16 a
, float_status
*status
)
3787 float16_unpack_raw(&p
, a
);
3788 p
.frac
<<= float16_params
.frac_shift
;
3789 parts_silence_nan(&p
, status
);
3790 p
.frac
>>= float16_params
.frac_shift
;
3791 return float16_pack_raw(&p
);
3794 float32
float32_silence_nan(float32 a
, float_status
*status
)
3798 float32_unpack_raw(&p
, a
);
3799 p
.frac
<<= float32_params
.frac_shift
;
3800 parts_silence_nan(&p
, status
);
3801 p
.frac
>>= float32_params
.frac_shift
;
3802 return float32_pack_raw(&p
);
3805 float64
float64_silence_nan(float64 a
, float_status
*status
)
3809 float64_unpack_raw(&p
, a
);
3810 p
.frac
<<= float64_params
.frac_shift
;
3811 parts_silence_nan(&p
, status
);
3812 p
.frac
>>= float64_params
.frac_shift
;
3813 return float64_pack_raw(&p
);
3816 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3820 bfloat16_unpack_raw(&p
, a
);
3821 p
.frac
<<= bfloat16_params
.frac_shift
;
3822 parts_silence_nan(&p
, status
);
3823 p
.frac
>>= bfloat16_params
.frac_shift
;
3824 return bfloat16_pack_raw(&p
);
3827 float128
float128_silence_nan(float128 a
, float_status
*status
)
3831 float128_unpack_raw(&p
, a
);
3832 frac_shl(&p
, float128_params
.frac_shift
);
3833 parts_silence_nan(&p
, status
);
3834 frac_shr(&p
, float128_params
.frac_shift
);
3835 return float128_pack_raw(&p
);
3838 /*----------------------------------------------------------------------------
3839 | If `a' is denormal and we are in flush-to-zero mode then set the
3840 | input-denormal exception and return zero. Otherwise just return the value.
3841 *----------------------------------------------------------------------------*/
3843 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3845 if (p
.exp
== 0 && p
.frac
!= 0) {
3846 float_raise(float_flag_input_denormal
, status
);
3853 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3855 if (status
->flush_inputs_to_zero
) {
3858 float16_unpack_raw(&p
, a
);
3859 if (parts_squash_denormal(p
, status
)) {
3860 return float16_set_sign(float16_zero
, p
.sign
);
3866 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3868 if (status
->flush_inputs_to_zero
) {
3871 float32_unpack_raw(&p
, a
);
3872 if (parts_squash_denormal(p
, status
)) {
3873 return float32_set_sign(float32_zero
, p
.sign
);
3879 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3881 if (status
->flush_inputs_to_zero
) {
3884 float64_unpack_raw(&p
, a
);
3885 if (parts_squash_denormal(p
, status
)) {
3886 return float64_set_sign(float64_zero
, p
.sign
);
3892 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3894 if (status
->flush_inputs_to_zero
) {
3897 bfloat16_unpack_raw(&p
, a
);
3898 if (parts_squash_denormal(p
, status
)) {
3899 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3905 /*----------------------------------------------------------------------------
3906 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3907 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3908 | input. If `zSign' is 1, the input is negated before being converted to an
3909 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3910 | is simply rounded to an integer, with the inexact exception raised if the
3911 | input cannot be represented exactly as an integer. However, if the fixed-
3912 | point input is too large, the invalid exception is raised and the largest
3913 | positive or negative integer is returned.
3914 *----------------------------------------------------------------------------*/
3916 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3917 float_status
*status
)
3919 int8_t roundingMode
;
3920 bool roundNearestEven
;
3921 int8_t roundIncrement
, roundBits
;
3924 roundingMode
= status
->float_rounding_mode
;
3925 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3926 switch (roundingMode
) {
3927 case float_round_nearest_even
:
3928 case float_round_ties_away
:
3929 roundIncrement
= 0x40;
3931 case float_round_to_zero
:
3934 case float_round_up
:
3935 roundIncrement
= zSign
? 0 : 0x7f;
3937 case float_round_down
:
3938 roundIncrement
= zSign
? 0x7f : 0;
3940 case float_round_to_odd
:
3941 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3946 roundBits
= absZ
& 0x7F;
3947 absZ
= ( absZ
+ roundIncrement
)>>7;
3948 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3952 if ( zSign
) z
= - z
;
3953 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3954 float_raise(float_flag_invalid
, status
);
3955 return zSign
? INT32_MIN
: INT32_MAX
;
3958 float_raise(float_flag_inexact
, status
);
3964 /*----------------------------------------------------------------------------
3965 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3966 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3967 | and returns the properly rounded 64-bit integer corresponding to the input.
3968 | If `zSign' is 1, the input is negated before being converted to an integer.
3969 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3970 | the inexact exception raised if the input cannot be represented exactly as
3971 | an integer. However, if the fixed-point input is too large, the invalid
3972 | exception is raised and the largest positive or negative integer is
3974 *----------------------------------------------------------------------------*/
3976 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3977 float_status
*status
)
3979 int8_t roundingMode
;
3980 bool roundNearestEven
, increment
;
3983 roundingMode
= status
->float_rounding_mode
;
3984 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3985 switch (roundingMode
) {
3986 case float_round_nearest_even
:
3987 case float_round_ties_away
:
3988 increment
= ((int64_t) absZ1
< 0);
3990 case float_round_to_zero
:
3993 case float_round_up
:
3994 increment
= !zSign
&& absZ1
;
3996 case float_round_down
:
3997 increment
= zSign
&& absZ1
;
3999 case float_round_to_odd
:
4000 increment
= !(absZ0
& 1) && absZ1
;
4007 if ( absZ0
== 0 ) goto overflow
;
4008 if (!(absZ1
<< 1) && roundNearestEven
) {
4013 if ( zSign
) z
= - z
;
4014 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
4016 float_raise(float_flag_invalid
, status
);
4017 return zSign
? INT64_MIN
: INT64_MAX
;
4020 float_raise(float_flag_inexact
, status
);
4026 /*----------------------------------------------------------------------------
4027 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4028 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4029 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4030 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4031 | with the inexact exception raised if the input cannot be represented exactly
4032 | as an integer. However, if the fixed-point input is too large, the invalid
4033 | exception is raised and the largest unsigned integer is returned.
4034 *----------------------------------------------------------------------------*/
4036 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
4037 uint64_t absZ1
, float_status
*status
)
4039 int8_t roundingMode
;
4040 bool roundNearestEven
, increment
;
4042 roundingMode
= status
->float_rounding_mode
;
4043 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
4044 switch (roundingMode
) {
4045 case float_round_nearest_even
:
4046 case float_round_ties_away
:
4047 increment
= ((int64_t)absZ1
< 0);
4049 case float_round_to_zero
:
4052 case float_round_up
:
4053 increment
= !zSign
&& absZ1
;
4055 case float_round_down
:
4056 increment
= zSign
&& absZ1
;
4058 case float_round_to_odd
:
4059 increment
= !(absZ0
& 1) && absZ1
;
4067 float_raise(float_flag_invalid
, status
);
4070 if (!(absZ1
<< 1) && roundNearestEven
) {
4075 if (zSign
&& absZ0
) {
4076 float_raise(float_flag_invalid
, status
);
4081 float_raise(float_flag_inexact
, status
);
4086 /*----------------------------------------------------------------------------
4087 | Normalizes the subnormal single-precision floating-point value represented
4088 | by the denormalized significand `aSig'. The normalized exponent and
4089 | significand are stored at the locations pointed to by `zExpPtr' and
4090 | `zSigPtr', respectively.
4091 *----------------------------------------------------------------------------*/
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 23 (clz32 - 8 skips the 8 bits
     * above the float32 significand field). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4104 /*----------------------------------------------------------------------------
4105 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4106 | and significand `zSig', and returns the proper single-precision floating-
4107 | point value corresponding to the abstract input. Ordinarily, the abstract
4108 | value is simply rounded and packed into the single-precision format, with
4109 | the inexact exception raised if the abstract input cannot be represented
4110 | exactly. However, if the abstract value is too large, the overflow and
4111 | inexact exceptions are raised and an infinity or maximal finite value is
4112 | returned. If the abstract value is too small, the input value is rounded to
4113 | a subnormal number, and the underflow and inexact exceptions are raised if
4114 | the abstract input cannot be represented exactly as a subnormal single-
4115 | precision floating-point number.
4116 | The input significand `zSig' has its binary point between bits 30
4117 | and 29, which is 7 bits to the left of the usual location. This shifted
4118 | significand must be normalized or smaller. If `zSig' is not normalized,
4119 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4120 | and it must not require rounding. In the usual case that `zSig' is
4121 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4122 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4123 | Binary Floating-Point Arithmetic.
4124 *----------------------------------------------------------------------------*/
4126 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4127 float_status
*status
)
4129 int8_t roundingMode
;
4130 bool roundNearestEven
;
4131 int8_t roundIncrement
, roundBits
;
4134 roundingMode
= status
->float_rounding_mode
;
4135 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4136 switch (roundingMode
) {
4137 case float_round_nearest_even
:
4138 case float_round_ties_away
:
4139 roundIncrement
= 0x40;
4141 case float_round_to_zero
:
4144 case float_round_up
:
4145 roundIncrement
= zSign
? 0 : 0x7f;
4147 case float_round_down
:
4148 roundIncrement
= zSign
? 0x7f : 0;
4150 case float_round_to_odd
:
4151 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4157 roundBits
= zSig
& 0x7F;
4158 if ( 0xFD <= (uint16_t) zExp
) {
4159 if ( ( 0xFD < zExp
)
4160 || ( ( zExp
== 0xFD )
4161 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4163 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4164 roundIncrement
!= 0;
4165 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4166 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4169 if (status
->flush_to_zero
) {
4170 float_raise(float_flag_output_denormal
, status
);
4171 return packFloat32(zSign
, 0, 0);
4173 isTiny
= status
->tininess_before_rounding
4175 || (zSig
+ roundIncrement
< 0x80000000);
4176 shift32RightJamming( zSig
, - zExp
, &zSig
);
4178 roundBits
= zSig
& 0x7F;
4179 if (isTiny
&& roundBits
) {
4180 float_raise(float_flag_underflow
, status
);
4182 if (roundingMode
== float_round_to_odd
) {
4184 * For round-to-odd case, the roundIncrement depends on
4185 * zSig which just changed.
4187 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4192 float_raise(float_flag_inexact
, status
);
4194 zSig
= ( zSig
+ roundIncrement
)>>7;
4195 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4198 if ( zSig
== 0 ) zExp
= 0;
4199 return packFloat32( zSign
, zExp
, zSig
);
4203 /*----------------------------------------------------------------------------
4204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4205 | and significand `zSig', and returns the proper single-precision floating-
4206 | point value corresponding to the abstract input. This routine is just like
4207 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4208 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4209 | floating-point exponent.
4210 *----------------------------------------------------------------------------*/
4213 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4214 float_status
*status
)
4218 shiftCount
= clz32(zSig
) - 1;
4219 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4224 /*----------------------------------------------------------------------------
4225 | Normalizes the subnormal double-precision floating-point value represented
4226 | by the denormalized significand `aSig'. The normalized exponent and
4227 | significand are stored at the locations pointed to by `zExpPtr' and
4228 | `zSigPtr', respectively.
4229 *----------------------------------------------------------------------------*/
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 52 (clz64 - 11 skips the 11 bits
     * above the float64 significand field). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4242 /*----------------------------------------------------------------------------
4243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4244 | double-precision floating-point value, returning the result. After being
4245 | shifted into the proper positions, the three fields are simply added
4246 | together to form the result. This means that any integer portion of `zSig'
4247 | will be added into the exponent. Since a properly normalized significand
4248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4249 | than the desired result exponent whenever `zSig' is a complete, normalized
4251 *----------------------------------------------------------------------------*/
4253 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4256 return make_float64(
4257 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and significand `zSig', and returns the proper double-precision floating-
4264 | point value corresponding to the abstract input. Ordinarily, the abstract
4265 | value is simply rounded and packed into the double-precision format, with
4266 | the inexact exception raised if the abstract input cannot be represented
4267 | exactly. However, if the abstract value is too large, the overflow and
4268 | inexact exceptions are raised and an infinity or maximal finite value is
4269 | returned. If the abstract value is too small, the input value is rounded to
4270 | a subnormal number, and the underflow and inexact exceptions are raised if
4271 | the abstract input cannot be represented exactly as a subnormal double-
4272 | precision floating-point number.
4273 | The input significand `zSig' has its binary point between bits 62
4274 | and 61, which is 10 bits to the left of the usual location. This shifted
4275 | significand must be normalized or smaller. If `zSig' is not normalized,
4276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4277 | and it must not require rounding. In the usual case that `zSig' is
4278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4279 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4280 | Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4283 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4284 float_status
*status
)
4286 int8_t roundingMode
;
4287 bool roundNearestEven
;
4288 int roundIncrement
, roundBits
;
4291 roundingMode
= status
->float_rounding_mode
;
4292 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4293 switch (roundingMode
) {
4294 case float_round_nearest_even
:
4295 case float_round_ties_away
:
4296 roundIncrement
= 0x200;
4298 case float_round_to_zero
:
4301 case float_round_up
:
4302 roundIncrement
= zSign
? 0 : 0x3ff;
4304 case float_round_down
:
4305 roundIncrement
= zSign
? 0x3ff : 0;
4307 case float_round_to_odd
:
4308 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4313 roundBits
= zSig
& 0x3FF;
4314 if ( 0x7FD <= (uint16_t) zExp
) {
4315 if ( ( 0x7FD < zExp
)
4316 || ( ( zExp
== 0x7FD )
4317 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4319 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4320 roundIncrement
!= 0;
4321 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4322 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4325 if (status
->flush_to_zero
) {
4326 float_raise(float_flag_output_denormal
, status
);
4327 return packFloat64(zSign
, 0, 0);
4329 isTiny
= status
->tininess_before_rounding
4331 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4332 shift64RightJamming( zSig
, - zExp
, &zSig
);
4334 roundBits
= zSig
& 0x3FF;
4335 if (isTiny
&& roundBits
) {
4336 float_raise(float_flag_underflow
, status
);
4338 if (roundingMode
== float_round_to_odd
) {
4340 * For round-to-odd case, the roundIncrement depends on
4341 * zSig which just changed.
4343 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4348 float_raise(float_flag_inexact
, status
);
4350 zSig
= ( zSig
+ roundIncrement
)>>10;
4351 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4354 if ( zSig
== 0 ) zExp
= 0;
4355 return packFloat64( zSign
, zExp
, zSig
);
4359 /*----------------------------------------------------------------------------
4360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4361 | and significand `zSig', and returns the proper double-precision floating-
4362 | point value corresponding to the abstract input. This routine is just like
4363 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4364 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4365 | floating-point exponent.
4366 *----------------------------------------------------------------------------*/
4369 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4370 float_status
*status
)
4374 shiftCount
= clz64(zSig
) - 1;
4375 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4380 /*----------------------------------------------------------------------------
4381 | Normalizes the subnormal extended double-precision floating-point value
4382 | represented by the denormalized significand `aSig'. The normalized exponent
4383 | and significand are stored at the locations pointed to by `zExpPtr' and
4384 | `zSigPtr', respectively.
4385 *----------------------------------------------------------------------------*/
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* floatx80 has an explicit integer bit, so the leading 1 goes all
     * the way up to bit 63. */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4397 /*----------------------------------------------------------------------------
4398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4399 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4400 | and returns the proper extended double-precision floating-point value
4401 | corresponding to the abstract input. Ordinarily, the abstract value is
4402 | rounded and packed into the extended double-precision format, with the
4403 | inexact exception raised if the abstract input cannot be represented
4404 | exactly. However, if the abstract value is too large, the overflow and
4405 | inexact exceptions are raised and an infinity or maximal finite value is
4406 | returned. If the abstract value is too small, the input value is rounded to
4407 | a subnormal number, and the underflow and inexact exceptions are raised if
4408 | the abstract input cannot be represented exactly as a subnormal extended
4409 | double-precision floating-point number.
4410 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4411 | number of bits as single or double precision, respectively. Otherwise, the
4412 | result is rounded to the full precision of the extended double-precision
4414 | The input significand must be normalized or smaller. If the input
4415 | significand is not normalized, `zExp' must be 0; in that case, the result
4416 | returned is a subnormal number, and it must not require rounding. The
4417 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4418 | Floating-Point Arithmetic.
4419 *----------------------------------------------------------------------------*/
4421 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4422 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4423 float_status
*status
)
4425 int8_t roundingMode
;
4426 bool roundNearestEven
, increment
, isTiny
;
4427 int64_t roundIncrement
, roundMask
, roundBits
;
4429 roundingMode
= status
->float_rounding_mode
;
4430 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4431 if ( roundingPrecision
== 80 ) goto precision80
;
4432 if ( roundingPrecision
== 64 ) {
4433 roundIncrement
= UINT64_C(0x0000000000000400);
4434 roundMask
= UINT64_C(0x00000000000007FF);
4436 else if ( roundingPrecision
== 32 ) {
4437 roundIncrement
= UINT64_C(0x0000008000000000);
4438 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4443 zSig0
|= ( zSig1
!= 0 );
4444 switch (roundingMode
) {
4445 case float_round_nearest_even
:
4446 case float_round_ties_away
:
4448 case float_round_to_zero
:
4451 case float_round_up
:
4452 roundIncrement
= zSign
? 0 : roundMask
;
4454 case float_round_down
:
4455 roundIncrement
= zSign
? roundMask
: 0;
4460 roundBits
= zSig0
& roundMask
;
4461 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4462 if ( ( 0x7FFE < zExp
)
4463 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4468 if (status
->flush_to_zero
) {
4469 float_raise(float_flag_output_denormal
, status
);
4470 return packFloatx80(zSign
, 0, 0);
4472 isTiny
= status
->tininess_before_rounding
4474 || (zSig0
<= zSig0
+ roundIncrement
);
4475 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4477 roundBits
= zSig0
& roundMask
;
4478 if (isTiny
&& roundBits
) {
4479 float_raise(float_flag_underflow
, status
);
4482 float_raise(float_flag_inexact
, status
);
4484 zSig0
+= roundIncrement
;
4485 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4486 roundIncrement
= roundMask
+ 1;
4487 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4488 roundMask
|= roundIncrement
;
4490 zSig0
&= ~ roundMask
;
4491 return packFloatx80( zSign
, zExp
, zSig0
);
4495 float_raise(float_flag_inexact
, status
);
4497 zSig0
+= roundIncrement
;
4498 if ( zSig0
< roundIncrement
) {
4500 zSig0
= UINT64_C(0x8000000000000000);
4502 roundIncrement
= roundMask
+ 1;
4503 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4504 roundMask
|= roundIncrement
;
4506 zSig0
&= ~ roundMask
;
4507 if ( zSig0
== 0 ) zExp
= 0;
4508 return packFloatx80( zSign
, zExp
, zSig0
);
4510 switch (roundingMode
) {
4511 case float_round_nearest_even
:
4512 case float_round_ties_away
:
4513 increment
= ((int64_t)zSig1
< 0);
4515 case float_round_to_zero
:
4518 case float_round_up
:
4519 increment
= !zSign
&& zSig1
;
4521 case float_round_down
:
4522 increment
= zSign
&& zSig1
;
4527 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4528 if ( ( 0x7FFE < zExp
)
4529 || ( ( zExp
== 0x7FFE )
4530 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4536 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4537 if ( ( roundingMode
== float_round_to_zero
)
4538 || ( zSign
&& ( roundingMode
== float_round_up
) )
4539 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4541 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4543 return packFloatx80(zSign
,
4544 floatx80_infinity_high
,
4545 floatx80_infinity_low
);
4548 isTiny
= status
->tininess_before_rounding
4551 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4552 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4554 if (isTiny
&& zSig1
) {
4555 float_raise(float_flag_underflow
, status
);
4558 float_raise(float_flag_inexact
, status
);
4560 switch (roundingMode
) {
4561 case float_round_nearest_even
:
4562 case float_round_ties_away
:
4563 increment
= ((int64_t)zSig1
< 0);
4565 case float_round_to_zero
:
4568 case float_round_up
:
4569 increment
= !zSign
&& zSig1
;
4571 case float_round_down
:
4572 increment
= zSign
&& zSig1
;
4579 if (!(zSig1
<< 1) && roundNearestEven
) {
4582 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4584 return packFloatx80( zSign
, zExp
, zSig0
);
4588 float_raise(float_flag_inexact
, status
);
4594 zSig0
= UINT64_C(0x8000000000000000);
4597 if (!(zSig1
<< 1) && roundNearestEven
) {
4603 if ( zSig0
== 0 ) zExp
= 0;
4605 return packFloatx80( zSign
, zExp
, zSig0
);
4609 /*----------------------------------------------------------------------------
4610 | Takes an abstract floating-point value having sign `zSign', exponent
4611 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4612 | and returns the proper extended double-precision floating-point value
4613 | corresponding to the abstract input. This routine is just like
4614 | `roundAndPackFloatx80' except that the input significand does not have to be
4616 *----------------------------------------------------------------------------*/
4618 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4619 bool zSign
, int32_t zExp
,
4620 uint64_t zSig0
, uint64_t zSig1
,
4621 float_status
*status
)
4630 shiftCount
= clz64(zSig0
);
4631 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4633 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4634 zSig0
, zSig1
, status
);
4638 /*----------------------------------------------------------------------------
4639 | Returns the least-significant 64 fraction bits of the quadruple-precision
4640 | floating-point value `a'.
4641 *----------------------------------------------------------------------------*/
4643 static inline uint64_t extractFloat128Frac1( float128 a
)
4650 /*----------------------------------------------------------------------------
4651 | Returns the most-significant 48 fraction bits of the quadruple-precision
4652 | floating-point value `a'.
4653 *----------------------------------------------------------------------------*/
4655 static inline uint64_t extractFloat128Frac0( float128 a
)
4658 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4662 /*----------------------------------------------------------------------------
4663 | Returns the exponent bits of the quadruple-precision floating-point value
4665 *----------------------------------------------------------------------------*/
4667 static inline int32_t extractFloat128Exp( float128 a
)
4670 return ( a
.high
>>48 ) & 0x7FFF;
4674 /*----------------------------------------------------------------------------
4675 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4676 *----------------------------------------------------------------------------*/
4678 static inline bool extractFloat128Sign(float128 a
)
4680 return a
.high
>> 63;
4683 /*----------------------------------------------------------------------------
4684 | Normalizes the subnormal quadruple-precision floating-point value
4685 | represented by the denormalized significand formed by the concatenation of
4686 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4687 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4688 | significand are stored at the location pointed to by `zSig0Ptr', and the
4689 | least significant 64 bits of the normalized significand are stored at the
4690 | location pointed to by `zSig1Ptr'.
4691 *----------------------------------------------------------------------------*/
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Leading 1 is somewhere in the low word.  clz64 - 15 places
         * it at bit 48 of the (shifted-up) high word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
4724 /*----------------------------------------------------------------------------
4725 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4726 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4727 | floating-point value, returning the result. After being shifted into the
4728 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4729 | added together to form the most significant 32 bits of the result. This
4730 | means that any integer portion of `zSig0' will be added into the exponent.
4731 | Since a properly normalized significand will have an integer portion equal
4732 | to 1, the `zExp' input should be 1 less than the desired result exponent
4733 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4735 *----------------------------------------------------------------------------*/
4737 static inline float128
4738 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4743 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4747 /*----------------------------------------------------------------------------
4748 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4749 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4750 | and `zSig2', and returns the proper quadruple-precision floating-point value
4751 | corresponding to the abstract input. Ordinarily, the abstract value is
4752 | simply rounded and packed into the quadruple-precision format, with the
4753 | inexact exception raised if the abstract input cannot be represented
4754 | exactly. However, if the abstract value is too large, the overflow and
4755 | inexact exceptions are raised and an infinity or maximal finite value is
4756 | returned. If the abstract value is too small, the input value is rounded to
4757 | a subnormal number, and the underflow and inexact exceptions are raised if
4758 | the abstract input cannot be represented exactly as a subnormal quadruple-
4759 | precision floating-point number.
4760 | The input significand must be normalized or smaller. If the input
4761 | significand is not normalized, `zExp' must be 0; in that case, the result
4762 | returned is a subnormal number, and it must not require rounding. In the
4763 | usual case that the input significand is normalized, `zExp' must be 1 less
4764 | than the ``true'' floating-point exponent. The handling of underflow and
4765 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4766 *----------------------------------------------------------------------------*/
4768 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4769 uint64_t zSig0
, uint64_t zSig1
,
4770 uint64_t zSig2
, float_status
*status
)
4772 int8_t roundingMode
;
4773 bool roundNearestEven
, increment
, isTiny
;
4775 roundingMode
= status
->float_rounding_mode
;
4776 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4777 switch (roundingMode
) {
4778 case float_round_nearest_even
:
4779 case float_round_ties_away
:
4780 increment
= ((int64_t)zSig2
< 0);
4782 case float_round_to_zero
:
4785 case float_round_up
:
4786 increment
= !zSign
&& zSig2
;
4788 case float_round_down
:
4789 increment
= zSign
&& zSig2
;
4791 case float_round_to_odd
:
4792 increment
= !(zSig1
& 0x1) && zSig2
;
4797 if ( 0x7FFD <= (uint32_t) zExp
) {
4798 if ( ( 0x7FFD < zExp
)
4799 || ( ( zExp
== 0x7FFD )
4801 UINT64_C(0x0001FFFFFFFFFFFF),
4802 UINT64_C(0xFFFFFFFFFFFFFFFF),
4809 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4810 if ( ( roundingMode
== float_round_to_zero
)
4811 || ( zSign
&& ( roundingMode
== float_round_up
) )
4812 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4813 || (roundingMode
== float_round_to_odd
)
4819 UINT64_C(0x0000FFFFFFFFFFFF),
4820 UINT64_C(0xFFFFFFFFFFFFFFFF)
4823 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4826 if (status
->flush_to_zero
) {
4827 float_raise(float_flag_output_denormal
, status
);
4828 return packFloat128(zSign
, 0, 0, 0);
4830 isTiny
= status
->tininess_before_rounding
4833 || lt128(zSig0
, zSig1
,
4834 UINT64_C(0x0001FFFFFFFFFFFF),
4835 UINT64_C(0xFFFFFFFFFFFFFFFF));
4836 shift128ExtraRightJamming(
4837 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4839 if (isTiny
&& zSig2
) {
4840 float_raise(float_flag_underflow
, status
);
4842 switch (roundingMode
) {
4843 case float_round_nearest_even
:
4844 case float_round_ties_away
:
4845 increment
= ((int64_t)zSig2
< 0);
4847 case float_round_to_zero
:
4850 case float_round_up
:
4851 increment
= !zSign
&& zSig2
;
4853 case float_round_down
:
4854 increment
= zSign
&& zSig2
;
4856 case float_round_to_odd
:
4857 increment
= !(zSig1
& 0x1) && zSig2
;
4865 float_raise(float_flag_inexact
, status
);
4868 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4869 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4874 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4876 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4880 /*----------------------------------------------------------------------------
4881 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4882 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4883 | returns the proper quadruple-precision floating-point value corresponding
4884 | to the abstract input. This routine is just like `roundAndPackFloat128'
4885 | except that the input significand has fewer bits and does not have to be
4886 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4888 *----------------------------------------------------------------------------*/
4890 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4891 uint64_t zSig0
, uint64_t zSig1
,
4892 float_status
*status
)
4902 shiftCount
= clz64(zSig0
) - 15;
4903 if ( 0 <= shiftCount
) {
4905 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4908 shift128ExtraRightJamming(
4909 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4912 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of converting the 32-bit two's complement integer `a'
4919 | to the extended double-precision floating-point format. The conversion
4920 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922 *----------------------------------------------------------------------------*/
4924 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4931 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4933 absA
= zSign
? - a
: a
;
4934 shiftCount
= clz32(absA
) + 32;
4936 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the 32-bit two's complement integer `a' to
4942 | the quadruple-precision floating-point format. The conversion is performed
4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4946 float128
int32_to_float128(int32_t a
, float_status
*status
)
4953 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4955 absA
= zSign
? - a
: a
;
4956 shiftCount
= clz32(absA
) + 17;
4958 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4962 /*----------------------------------------------------------------------------
4963 | Returns the result of converting the 64-bit two's complement integer `a'
4964 | to the extended double-precision floating-point format. The conversion
4965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4967 *----------------------------------------------------------------------------*/
4969 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4975 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4977 absA
= zSign
? - a
: a
;
4978 shiftCount
= clz64(absA
);
4979 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the 64-bit two's complement integer `a' to
4985 | the quadruple-precision floating-point format. The conversion is performed
4986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4987 *----------------------------------------------------------------------------*/
4989 float128
int64_to_float128(int64_t a
, float_status
*status
)
4995 uint64_t zSig0
, zSig1
;
4997 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4999 absA
= zSign
? - a
: a
;
5000 shiftCount
= clz64(absA
) + 49;
5001 zExp
= 0x406E - shiftCount
;
5002 if ( 64 <= shiftCount
) {
5011 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
5012 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
5016 /*----------------------------------------------------------------------------
5017 | Returns the result of converting the 64-bit unsigned integer `a'
5018 | to the quadruple-precision floating-point format. The conversion is performed
5019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 *----------------------------------------------------------------------------*/
5022 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
5025 return float128_zero
;
5027 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the single-precision floating-point value
5032 | `a' to the extended double-precision floating-point format. The conversion
5033 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5035 *----------------------------------------------------------------------------*/
5037 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5043 a
= float32_squash_input_denormal(a
, status
);
5044 aSig
= extractFloat32Frac( a
);
5045 aExp
= extractFloat32Exp( a
);
5046 aSign
= extractFloat32Sign( a
);
5047 if ( aExp
== 0xFF ) {
5049 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5051 return floatx80_silence_nan(res
, status
);
5053 return packFloatx80(aSign
,
5054 floatx80_infinity_high
,
5055 floatx80_infinity_low
);
5058 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5059 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5062 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5066 /*----------------------------------------------------------------------------
5067 | Returns the result of converting the single-precision floating-point value
5068 | `a' to the quadruple-precision floating-point format.  The conversion is
5069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5071 *----------------------------------------------------------------------------*/
5073 float128
float32_to_float128(float32 a
, float_status
*status
)
5079 a
= float32_squash_input_denormal(a
, status
);
5080 aSig
= extractFloat32Frac( a
);
5081 aExp
= extractFloat32Exp( a
);
5082 aSign
= extractFloat32Sign( a
);
5083 if ( aExp
== 0xFF ) {
5085 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5087 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5090 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5091 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5094 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5098 /*----------------------------------------------------------------------------
5099 | Returns the remainder of the single-precision floating-point value `a'
5100 | with respect to the corresponding value `b'. The operation is performed
5101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 *----------------------------------------------------------------------------*/
5104 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5107 int aExp
, bExp
, expDiff
;
5108 uint32_t aSig
, bSig
;
5110 uint64_t aSig64
, bSig64
, q64
;
5111 uint32_t alternateASig
;
5113 a
= float32_squash_input_denormal(a
, status
);
5114 b
= float32_squash_input_denormal(b
, status
);
5116 aSig
= extractFloat32Frac( a
);
5117 aExp
= extractFloat32Exp( a
);
5118 aSign
= extractFloat32Sign( a
);
5119 bSig
= extractFloat32Frac( b
);
5120 bExp
= extractFloat32Exp( b
);
5121 if ( aExp
== 0xFF ) {
5122 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5123 return propagateFloat32NaN(a
, b
, status
);
5125 float_raise(float_flag_invalid
, status
);
5126 return float32_default_nan(status
);
5128 if ( bExp
== 0xFF ) {
5130 return propagateFloat32NaN(a
, b
, status
);
5136 float_raise(float_flag_invalid
, status
);
5137 return float32_default_nan(status
);
5139 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5142 if ( aSig
== 0 ) return a
;
5143 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5145 expDiff
= aExp
- bExp
;
5148 if ( expDiff
< 32 ) {
5151 if ( expDiff
< 0 ) {
5152 if ( expDiff
< -1 ) return a
;
5155 q
= ( bSig
<= aSig
);
5156 if ( q
) aSig
-= bSig
;
5157 if ( 0 < expDiff
) {
5158 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5161 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5169 if ( bSig
<= aSig
) aSig
-= bSig
;
5170 aSig64
= ( (uint64_t) aSig
)<<40;
5171 bSig64
= ( (uint64_t) bSig
)<<40;
5173 while ( 0 < expDiff
) {
5174 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5175 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5176 aSig64
= - ( ( bSig
* q64
)<<38 );
5180 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5181 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5182 q
= q64
>>( 64 - expDiff
);
5184 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5187 alternateASig
= aSig
;
5190 } while ( 0 <= (int32_t) aSig
);
5191 sigMean
= aSig
+ alternateASig
;
5192 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5193 aSig
= alternateASig
;
5195 zSign
= ( (int32_t) aSig
< 0 );
5196 if ( zSign
) aSig
= - aSig
;
5197 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5202 /*----------------------------------------------------------------------------
5203 | Returns the binary exponential of the single-precision floating-point value
5204 | `a'. The operation is performed according to the IEC/IEEE Standard for
5205 | Binary Floating-Point Arithmetic.
5207 | Uses the following identities:
5209 | 1. -------------------------------------------------------------------------
5213 | 2. -------------------------------------------------------------------------
5216 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5218 *----------------------------------------------------------------------------*/
5220 static const float64 float32_exp2_coefficients
[15] =
5222 const_float64( 0x3ff0000000000000ll
), /* 1 */
5223 const_float64( 0x3fe0000000000000ll
), /* 2 */
5224 const_float64( 0x3fc5555555555555ll
), /* 3 */
5225 const_float64( 0x3fa5555555555555ll
), /* 4 */
5226 const_float64( 0x3f81111111111111ll
), /* 5 */
5227 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5228 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5229 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5230 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5231 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5232 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5233 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5234 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5235 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5236 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5239 float32
float32_exp2(float32 a
, float_status
*status
)
5246 a
= float32_squash_input_denormal(a
, status
);
5248 aSig
= extractFloat32Frac( a
);
5249 aExp
= extractFloat32Exp( a
);
5250 aSign
= extractFloat32Sign( a
);
5252 if ( aExp
== 0xFF) {
5254 return propagateFloat32NaN(a
, float32_zero
, status
);
5256 return (aSign
) ? float32_zero
: a
;
5259 if (aSig
== 0) return float32_one
;
5262 float_raise(float_flag_inexact
, status
);
5264 /* ******************************* */
5265 /* using float64 for approximation */
5266 /* ******************************* */
5267 x
= float32_to_float64(a
, status
);
5268 x
= float64_mul(x
, float64_ln2
, status
);
5272 for (i
= 0 ; i
< 15 ; i
++) {
5275 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5276 r
= float64_add(r
, f
, status
);
5278 xn
= float64_mul(xn
, x
, status
);
5281 return float64_to_float32(r
, status
);
5284 /*----------------------------------------------------------------------------
5285 | Returns the binary log of the single-precision floating-point value `a'.
5286 | The operation is performed according to the IEC/IEEE Standard for Binary
5287 | Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
5289 float32
float32_log2(float32 a
, float_status
*status
)
5293 uint32_t aSig
, zSig
, i
;
5295 a
= float32_squash_input_denormal(a
, status
);
5296 aSig
= extractFloat32Frac( a
);
5297 aExp
= extractFloat32Exp( a
);
5298 aSign
= extractFloat32Sign( a
);
5301 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5302 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5305 float_raise(float_flag_invalid
, status
);
5306 return float32_default_nan(status
);
5308 if ( aExp
== 0xFF ) {
5310 return propagateFloat32NaN(a
, float32_zero
, status
);
5320 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5321 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5322 if ( aSig
& 0x01000000 ) {
5331 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5334 /*----------------------------------------------------------------------------
5335 | Returns the result of converting the double-precision floating-point value
5336 | `a' to the extended double-precision floating-point format. The conversion
5337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5339 *----------------------------------------------------------------------------*/
5341 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5347 a
= float64_squash_input_denormal(a
, status
);
5348 aSig
= extractFloat64Frac( a
);
5349 aExp
= extractFloat64Exp( a
);
5350 aSign
= extractFloat64Sign( a
);
5351 if ( aExp
== 0x7FF ) {
5353 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5355 return floatx80_silence_nan(res
, status
);
5357 return packFloatx80(aSign
,
5358 floatx80_infinity_high
,
5359 floatx80_infinity_low
);
5362 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5363 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5367 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5371 /*----------------------------------------------------------------------------
5372 | Returns the result of converting the double-precision floating-point value
5373 | `a' to the quadruple-precision floating-point format. The conversion is
5374 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5376 *----------------------------------------------------------------------------*/
5378 float128
float64_to_float128(float64 a
, float_status
*status
)
5382 uint64_t aSig
, zSig0
, zSig1
;
5384 a
= float64_squash_input_denormal(a
, status
);
5385 aSig
= extractFloat64Frac( a
);
5386 aExp
= extractFloat64Exp( a
);
5387 aSign
= extractFloat64Sign( a
);
5388 if ( aExp
== 0x7FF ) {
5390 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5392 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5395 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5396 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5399 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5400 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5405 /*----------------------------------------------------------------------------
5406 | Returns the remainder of the double-precision floating-point value `a'
5407 | with respect to the corresponding value `b'. The operation is performed
5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5411 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5414 int aExp
, bExp
, expDiff
;
5415 uint64_t aSig
, bSig
;
5416 uint64_t q
, alternateASig
;
5419 a
= float64_squash_input_denormal(a
, status
);
5420 b
= float64_squash_input_denormal(b
, status
);
5421 aSig
= extractFloat64Frac( a
);
5422 aExp
= extractFloat64Exp( a
);
5423 aSign
= extractFloat64Sign( a
);
5424 bSig
= extractFloat64Frac( b
);
5425 bExp
= extractFloat64Exp( b
);
5426 if ( aExp
== 0x7FF ) {
5427 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5428 return propagateFloat64NaN(a
, b
, status
);
5430 float_raise(float_flag_invalid
, status
);
5431 return float64_default_nan(status
);
5433 if ( bExp
== 0x7FF ) {
5435 return propagateFloat64NaN(a
, b
, status
);
5441 float_raise(float_flag_invalid
, status
);
5442 return float64_default_nan(status
);
5444 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5447 if ( aSig
== 0 ) return a
;
5448 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5450 expDiff
= aExp
- bExp
;
5451 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5452 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5453 if ( expDiff
< 0 ) {
5454 if ( expDiff
< -1 ) return a
;
5457 q
= ( bSig
<= aSig
);
5458 if ( q
) aSig
-= bSig
;
5460 while ( 0 < expDiff
) {
5461 q
= estimateDiv128To64( aSig
, 0, bSig
);
5462 q
= ( 2 < q
) ? q
- 2 : 0;
5463 aSig
= - ( ( bSig
>>2 ) * q
);
5467 if ( 0 < expDiff
) {
5468 q
= estimateDiv128To64( aSig
, 0, bSig
);
5469 q
= ( 2 < q
) ? q
- 2 : 0;
5472 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5479 alternateASig
= aSig
;
5482 } while ( 0 <= (int64_t) aSig
);
5483 sigMean
= aSig
+ alternateASig
;
5484 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5485 aSig
= alternateASig
;
5487 zSign
= ( (int64_t) aSig
< 0 );
5488 if ( zSign
) aSig
= - aSig
;
5489 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5493 /*----------------------------------------------------------------------------
5494 | Returns the binary log of the double-precision floating-point value `a'.
5495 | The operation is performed according to the IEC/IEEE Standard for Binary
5496 | Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498 float64
float64_log2(float64 a
, float_status
*status
)
5502 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5503 a
= float64_squash_input_denormal(a
, status
);
5505 aSig
= extractFloat64Frac( a
);
5506 aExp
= extractFloat64Exp( a
);
5507 aSign
= extractFloat64Sign( a
);
5510 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5511 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5514 float_raise(float_flag_invalid
, status
);
5515 return float64_default_nan(status
);
5517 if ( aExp
== 0x7FF ) {
5519 return propagateFloat64NaN(a
, float64_zero
, status
);
5525 aSig
|= UINT64_C(0x0010000000000000);
5527 zSig
= (uint64_t)aExp
<< 52;
5528 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5529 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5530 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5531 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5539 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5542 /*----------------------------------------------------------------------------
5543 | Returns the result of converting the extended double-precision floating-
5544 | point value `a' to the 32-bit two's complement integer format. The
5545 | conversion is performed according to the IEC/IEEE Standard for Binary
5546 | Floating-Point Arithmetic---which means in particular that the conversion
5547 | is rounded according to the current rounding mode. If `a' is a NaN, the
5548 | largest positive integer is returned. Otherwise, if the conversion
5549 | overflows, the largest integer with the same sign as `a' is returned.
5550 *----------------------------------------------------------------------------*/
5552 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5555 int32_t aExp
, shiftCount
;
5558 if (floatx80_invalid_encoding(a
)) {
5559 float_raise(float_flag_invalid
, status
);
5562 aSig
= extractFloatx80Frac( a
);
5563 aExp
= extractFloatx80Exp( a
);
5564 aSign
= extractFloatx80Sign( a
);
5565 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5566 shiftCount
= 0x4037 - aExp
;
5567 if ( shiftCount
<= 0 ) shiftCount
= 1;
5568 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5569 return roundAndPackInt32(aSign
, aSig
, status
);
5573 /*----------------------------------------------------------------------------
5574 | Returns the result of converting the extended double-precision floating-
5575 | point value `a' to the 32-bit two's complement integer format. The
5576 | conversion is performed according to the IEC/IEEE Standard for Binary
5577 | Floating-Point Arithmetic, except that the conversion is always rounded
5578 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5579 | Otherwise, if the conversion overflows, the largest integer with the same
5580 | sign as `a' is returned.
5581 *----------------------------------------------------------------------------*/
5583 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5586 int32_t aExp
, shiftCount
;
5587 uint64_t aSig
, savedASig
;
5590 if (floatx80_invalid_encoding(a
)) {
5591 float_raise(float_flag_invalid
, status
);
5594 aSig
= extractFloatx80Frac( a
);
5595 aExp
= extractFloatx80Exp( a
);
5596 aSign
= extractFloatx80Sign( a
);
5597 if ( 0x401E < aExp
) {
5598 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5601 else if ( aExp
< 0x3FFF ) {
5603 float_raise(float_flag_inexact
, status
);
5607 shiftCount
= 0x403E - aExp
;
5609 aSig
>>= shiftCount
;
5611 if ( aSign
) z
= - z
;
5612 if ( ( z
< 0 ) ^ aSign
) {
5614 float_raise(float_flag_invalid
, status
);
5615 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5617 if ( ( aSig
<<shiftCount
) != savedASig
) {
5618 float_raise(float_flag_inexact
, status
);
5624 /*----------------------------------------------------------------------------
5625 | Returns the result of converting the extended double-precision floating-
5626 | point value `a' to the 64-bit two's complement integer format. The
5627 | conversion is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic---which means in particular that the conversion
5629 | is rounded according to the current rounding mode. If `a' is a NaN,
5630 | the largest positive integer is returned. Otherwise, if the conversion
5631 | overflows, the largest integer with the same sign as `a' is returned.
5632 *----------------------------------------------------------------------------*/
5634 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5637 int32_t aExp
, shiftCount
;
5638 uint64_t aSig
, aSigExtra
;
5640 if (floatx80_invalid_encoding(a
)) {
5641 float_raise(float_flag_invalid
, status
);
5644 aSig
= extractFloatx80Frac( a
);
5645 aExp
= extractFloatx80Exp( a
);
5646 aSign
= extractFloatx80Sign( a
);
5647 shiftCount
= 0x403E - aExp
;
5648 if ( shiftCount
<= 0 ) {
5650 float_raise(float_flag_invalid
, status
);
5651 if (!aSign
|| floatx80_is_any_nan(a
)) {
5659 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5661 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5665 /*----------------------------------------------------------------------------
5666 | Returns the result of converting the extended double-precision floating-
5667 | point value `a' to the 64-bit two's complement integer format. The
5668 | conversion is performed according to the IEC/IEEE Standard for Binary
5669 | Floating-Point Arithmetic, except that the conversion is always rounded
5670 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5671 | Otherwise, if the conversion overflows, the largest integer with the same
5672 | sign as `a' is returned.
5673 *----------------------------------------------------------------------------*/
5675 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5678 int32_t aExp
, shiftCount
;
5682 if (floatx80_invalid_encoding(a
)) {
5683 float_raise(float_flag_invalid
, status
);
5686 aSig
= extractFloatx80Frac( a
);
5687 aExp
= extractFloatx80Exp( a
);
5688 aSign
= extractFloatx80Sign( a
);
5689 shiftCount
= aExp
- 0x403E;
5690 if ( 0 <= shiftCount
) {
5691 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5692 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5693 float_raise(float_flag_invalid
, status
);
5694 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5700 else if ( aExp
< 0x3FFF ) {
5702 float_raise(float_flag_inexact
, status
);
5706 z
= aSig
>>( - shiftCount
);
5707 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5708 float_raise(float_flag_inexact
, status
);
5710 if ( aSign
) z
= - z
;
5715 /*----------------------------------------------------------------------------
5716 | Returns the result of converting the extended double-precision floating-
5717 | point value `a' to the single-precision floating-point format. The
5718 | conversion is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5728 if (floatx80_invalid_encoding(a
)) {
5729 float_raise(float_flag_invalid
, status
);
5730 return float32_default_nan(status
);
5732 aSig
= extractFloatx80Frac( a
);
5733 aExp
= extractFloatx80Exp( a
);
5734 aSign
= extractFloatx80Sign( a
);
5735 if ( aExp
== 0x7FFF ) {
5736 if ( (uint64_t) ( aSig
<<1 ) ) {
5737 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5739 return float32_silence_nan(res
, status
);
5741 return packFloat32( aSign
, 0xFF, 0 );
5743 shift64RightJamming( aSig
, 33, &aSig
);
5744 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5745 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5749 /*----------------------------------------------------------------------------
5750 | Returns the result of converting the extended double-precision floating-
5751 | point value `a' to the double-precision floating-point format. The
5752 | conversion is performed according to the IEC/IEEE Standard for Binary
5753 | Floating-Point Arithmetic.
5754 *----------------------------------------------------------------------------*/
5756 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5760 uint64_t aSig
, zSig
;
5762 if (floatx80_invalid_encoding(a
)) {
5763 float_raise(float_flag_invalid
, status
);
5764 return float64_default_nan(status
);
5766 aSig
= extractFloatx80Frac( a
);
5767 aExp
= extractFloatx80Exp( a
);
5768 aSign
= extractFloatx80Sign( a
);
5769 if ( aExp
== 0x7FFF ) {
5770 if ( (uint64_t) ( aSig
<<1 ) ) {
5771 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5773 return float64_silence_nan(res
, status
);
5775 return packFloat64( aSign
, 0x7FF, 0 );
5777 shift64RightJamming( aSig
, 1, &zSig
);
5778 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5779 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5783 /*----------------------------------------------------------------------------
5784 | Returns the result of converting the extended double-precision floating-
5785 | point value `a' to the quadruple-precision floating-point format. The
5786 | conversion is performed according to the IEC/IEEE Standard for Binary
5787 | Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5790 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5794 uint64_t aSig
, zSig0
, zSig1
;
5796 if (floatx80_invalid_encoding(a
)) {
5797 float_raise(float_flag_invalid
, status
);
5798 return float128_default_nan(status
);
5800 aSig
= extractFloatx80Frac( a
);
5801 aExp
= extractFloatx80Exp( a
);
5802 aSign
= extractFloatx80Sign( a
);
5803 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5804 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5806 return float128_silence_nan(res
, status
);
5808 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5809 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5813 /*----------------------------------------------------------------------------
5814 | Rounds the extended double-precision floating-point value `a'
5815 | to the precision provided by floatx80_rounding_precision and returns the
5816 | result as an extended double-precision floating-point value.
5817 | The operation is performed according to the IEC/IEEE Standard for Binary
5818 | Floating-Point Arithmetic.
5819 *----------------------------------------------------------------------------*/
5821 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5823 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5824 extractFloatx80Sign(a
),
5825 extractFloatx80Exp(a
),
5826 extractFloatx80Frac(a
), 0, status
);
5829 /*----------------------------------------------------------------------------
5830 | Rounds the extended double-precision floating-point value `a' to an integer,
5831 | and returns the result as an extended quadruple-precision floating-point
5832 | value. The operation is performed according to the IEC/IEEE Standard for
5833 | Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5836 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5840 uint64_t lastBitMask
, roundBitsMask
;
5843 if (floatx80_invalid_encoding(a
)) {
5844 float_raise(float_flag_invalid
, status
);
5845 return floatx80_default_nan(status
);
5847 aExp
= extractFloatx80Exp( a
);
5848 if ( 0x403E <= aExp
) {
5849 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5850 return propagateFloatx80NaN(a
, a
, status
);
5854 if ( aExp
< 0x3FFF ) {
5856 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5859 float_raise(float_flag_inexact
, status
);
5860 aSign
= extractFloatx80Sign( a
);
5861 switch (status
->float_rounding_mode
) {
5862 case float_round_nearest_even
:
5863 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5866 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5869 case float_round_ties_away
:
5870 if (aExp
== 0x3FFE) {
5871 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5874 case float_round_down
:
5877 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5878 : packFloatx80( 0, 0, 0 );
5879 case float_round_up
:
5881 aSign
? packFloatx80( 1, 0, 0 )
5882 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5884 case float_round_to_zero
:
5887 g_assert_not_reached();
5889 return packFloatx80( aSign
, 0, 0 );
5892 lastBitMask
<<= 0x403E - aExp
;
5893 roundBitsMask
= lastBitMask
- 1;
5895 switch (status
->float_rounding_mode
) {
5896 case float_round_nearest_even
:
5897 z
.low
+= lastBitMask
>>1;
5898 if ((z
.low
& roundBitsMask
) == 0) {
5899 z
.low
&= ~lastBitMask
;
5902 case float_round_ties_away
:
5903 z
.low
+= lastBitMask
>> 1;
5905 case float_round_to_zero
:
5907 case float_round_up
:
5908 if (!extractFloatx80Sign(z
)) {
5909 z
.low
+= roundBitsMask
;
5912 case float_round_down
:
5913 if (extractFloatx80Sign(z
)) {
5914 z
.low
+= roundBitsMask
;
5920 z
.low
&= ~ roundBitsMask
;
5923 z
.low
= UINT64_C(0x8000000000000000);
5925 if (z
.low
!= a
.low
) {
5926 float_raise(float_flag_inexact
, status
);
5932 /*----------------------------------------------------------------------------
5933 | Returns the result of adding the absolute values of the extended double-
5934 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5935 | negated before being returned. `zSign' is ignored if the result is a NaN.
5936 | The addition is performed according to the IEC/IEEE Standard for Binary
5937 | Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5940 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5941 float_status
*status
)
5943 int32_t aExp
, bExp
, zExp
;
5944 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5947 aSig
= extractFloatx80Frac( a
);
5948 aExp
= extractFloatx80Exp( a
);
5949 bSig
= extractFloatx80Frac( b
);
5950 bExp
= extractFloatx80Exp( b
);
5951 expDiff
= aExp
- bExp
;
5952 if ( 0 < expDiff
) {
5953 if ( aExp
== 0x7FFF ) {
5954 if ((uint64_t)(aSig
<< 1)) {
5955 return propagateFloatx80NaN(a
, b
, status
);
5959 if ( bExp
== 0 ) --expDiff
;
5960 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5963 else if ( expDiff
< 0 ) {
5964 if ( bExp
== 0x7FFF ) {
5965 if ((uint64_t)(bSig
<< 1)) {
5966 return propagateFloatx80NaN(a
, b
, status
);
5968 return packFloatx80(zSign
,
5969 floatx80_infinity_high
,
5970 floatx80_infinity_low
);
5972 if ( aExp
== 0 ) ++expDiff
;
5973 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5977 if ( aExp
== 0x7FFF ) {
5978 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5979 return propagateFloatx80NaN(a
, b
, status
);
5984 zSig0
= aSig
+ bSig
;
5986 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5987 /* At least one of the values is a pseudo-denormal,
5988 * and there is a carry out of the result. */
5993 return packFloatx80(zSign
, 0, 0);
5995 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
6001 zSig0
= aSig
+ bSig
;
6002 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
6004 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6005 zSig0
|= UINT64_C(0x8000000000000000);
6008 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6009 zSign
, zExp
, zSig0
, zSig1
, status
);
6012 /*----------------------------------------------------------------------------
6013 | Returns the result of subtracting the absolute values of the extended
6014 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6015 | difference is negated before being returned. `zSign' is ignored if the
6016 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6017 | Standard for Binary Floating-Point Arithmetic.
6018 *----------------------------------------------------------------------------*/
6020 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
6021 float_status
*status
)
6023 int32_t aExp
, bExp
, zExp
;
6024 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6027 aSig
= extractFloatx80Frac( a
);
6028 aExp
= extractFloatx80Exp( a
);
6029 bSig
= extractFloatx80Frac( b
);
6030 bExp
= extractFloatx80Exp( b
);
6031 expDiff
= aExp
- bExp
;
6032 if ( 0 < expDiff
) goto aExpBigger
;
6033 if ( expDiff
< 0 ) goto bExpBigger
;
6034 if ( aExp
== 0x7FFF ) {
6035 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6036 return propagateFloatx80NaN(a
, b
, status
);
6038 float_raise(float_flag_invalid
, status
);
6039 return floatx80_default_nan(status
);
6046 if ( bSig
< aSig
) goto aBigger
;
6047 if ( aSig
< bSig
) goto bBigger
;
6048 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6050 if ( bExp
== 0x7FFF ) {
6051 if ((uint64_t)(bSig
<< 1)) {
6052 return propagateFloatx80NaN(a
, b
, status
);
6054 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6055 floatx80_infinity_low
);
6057 if ( aExp
== 0 ) ++expDiff
;
6058 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6060 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6063 goto normalizeRoundAndPack
;
6065 if ( aExp
== 0x7FFF ) {
6066 if ((uint64_t)(aSig
<< 1)) {
6067 return propagateFloatx80NaN(a
, b
, status
);
6071 if ( bExp
== 0 ) --expDiff
;
6072 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6074 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6076 normalizeRoundAndPack
:
6077 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6078 zSign
, zExp
, zSig0
, zSig1
, status
);
6081 /*----------------------------------------------------------------------------
6082 | Returns the result of adding the extended double-precision floating-point
6083 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6084 | Standard for Binary Floating-Point Arithmetic.
6085 *----------------------------------------------------------------------------*/
6087 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6091 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6092 float_raise(float_flag_invalid
, status
);
6093 return floatx80_default_nan(status
);
6095 aSign
= extractFloatx80Sign( a
);
6096 bSign
= extractFloatx80Sign( b
);
6097 if ( aSign
== bSign
) {
6098 return addFloatx80Sigs(a
, b
, aSign
, status
);
6101 return subFloatx80Sigs(a
, b
, aSign
, status
);
6106 /*----------------------------------------------------------------------------
6107 | Returns the result of subtracting the extended double-precision floating-
6108 | point values `a' and `b'. The operation is performed according to the
6109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6110 *----------------------------------------------------------------------------*/
6112 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6116 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6117 float_raise(float_flag_invalid
, status
);
6118 return floatx80_default_nan(status
);
6120 aSign
= extractFloatx80Sign( a
);
6121 bSign
= extractFloatx80Sign( b
);
6122 if ( aSign
== bSign
) {
6123 return subFloatx80Sigs(a
, b
, aSign
, status
);
6126 return addFloatx80Sigs(a
, b
, aSign
, status
);
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of multiplying the extended double-precision floating-
6133 | point values `a' and `b'. The operation is performed according to the
6134 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6135 *----------------------------------------------------------------------------*/
6137 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6139 bool aSign
, bSign
, zSign
;
6140 int32_t aExp
, bExp
, zExp
;
6141 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6143 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6144 float_raise(float_flag_invalid
, status
);
6145 return floatx80_default_nan(status
);
6147 aSig
= extractFloatx80Frac( a
);
6148 aExp
= extractFloatx80Exp( a
);
6149 aSign
= extractFloatx80Sign( a
);
6150 bSig
= extractFloatx80Frac( b
);
6151 bExp
= extractFloatx80Exp( b
);
6152 bSign
= extractFloatx80Sign( b
);
6153 zSign
= aSign
^ bSign
;
6154 if ( aExp
== 0x7FFF ) {
6155 if ( (uint64_t) ( aSig
<<1 )
6156 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6157 return propagateFloatx80NaN(a
, b
, status
);
6159 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6160 return packFloatx80(zSign
, floatx80_infinity_high
,
6161 floatx80_infinity_low
);
6163 if ( bExp
== 0x7FFF ) {
6164 if ((uint64_t)(bSig
<< 1)) {
6165 return propagateFloatx80NaN(a
, b
, status
);
6167 if ( ( aExp
| aSig
) == 0 ) {
6169 float_raise(float_flag_invalid
, status
);
6170 return floatx80_default_nan(status
);
6172 return packFloatx80(zSign
, floatx80_infinity_high
,
6173 floatx80_infinity_low
);
6176 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6177 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6180 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6181 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6183 zExp
= aExp
+ bExp
- 0x3FFE;
6184 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6185 if ( 0 < (int64_t) zSig0
) {
6186 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6189 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6190 zSign
, zExp
, zSig0
, zSig1
, status
);
6193 /*----------------------------------------------------------------------------
6194 | Returns the result of dividing the extended double-precision floating-point
6195 | value `a' by the corresponding value `b'. The operation is performed
6196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6197 *----------------------------------------------------------------------------*/
6199 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6201 bool aSign
, bSign
, zSign
;
6202 int32_t aExp
, bExp
, zExp
;
6203 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6204 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6206 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6207 float_raise(float_flag_invalid
, status
);
6208 return floatx80_default_nan(status
);
6210 aSig
= extractFloatx80Frac( a
);
6211 aExp
= extractFloatx80Exp( a
);
6212 aSign
= extractFloatx80Sign( a
);
6213 bSig
= extractFloatx80Frac( b
);
6214 bExp
= extractFloatx80Exp( b
);
6215 bSign
= extractFloatx80Sign( b
);
6216 zSign
= aSign
^ bSign
;
6217 if ( aExp
== 0x7FFF ) {
6218 if ((uint64_t)(aSig
<< 1)) {
6219 return propagateFloatx80NaN(a
, b
, status
);
6221 if ( bExp
== 0x7FFF ) {
6222 if ((uint64_t)(bSig
<< 1)) {
6223 return propagateFloatx80NaN(a
, b
, status
);
6227 return packFloatx80(zSign
, floatx80_infinity_high
,
6228 floatx80_infinity_low
);
6230 if ( bExp
== 0x7FFF ) {
6231 if ((uint64_t)(bSig
<< 1)) {
6232 return propagateFloatx80NaN(a
, b
, status
);
6234 return packFloatx80( zSign
, 0, 0 );
6238 if ( ( aExp
| aSig
) == 0 ) {
6240 float_raise(float_flag_invalid
, status
);
6241 return floatx80_default_nan(status
);
6243 float_raise(float_flag_divbyzero
, status
);
6244 return packFloatx80(zSign
, floatx80_infinity_high
,
6245 floatx80_infinity_low
);
6247 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6250 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6251 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6253 zExp
= aExp
- bExp
+ 0x3FFE;
6255 if ( bSig
<= aSig
) {
6256 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6259 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6260 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6261 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6262 while ( (int64_t) rem0
< 0 ) {
6264 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6266 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6267 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6268 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6269 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6270 while ( (int64_t) rem1
< 0 ) {
6272 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6274 zSig1
|= ( ( rem1
| rem2
) != 0 );
6276 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6277 zSign
, zExp
, zSig0
, zSig1
, status
);
6280 /*----------------------------------------------------------------------------
6281 | Returns the remainder of the extended double-precision floating-point value
6282 | `a' with respect to the corresponding value `b'. The operation is performed
6283 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6284 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6285 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6286 | the absolute value of the integer quotient.
6287 *----------------------------------------------------------------------------*/
6289 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6290 float_status
*status
)
6293 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6294 uint64_t aSig0
, aSig1
, bSig
;
6295 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6298 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6299 float_raise(float_flag_invalid
, status
);
6300 return floatx80_default_nan(status
);
6302 aSig0
= extractFloatx80Frac( a
);
6303 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6304 aSign
= extractFloatx80Sign( a
);
6305 bSig
= extractFloatx80Frac( b
);
6306 bExp
= extractFloatx80Exp( b
);
6307 if ( aExp
== 0x7FFF ) {
6308 if ( (uint64_t) ( aSig0
<<1 )
6309 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6310 return propagateFloatx80NaN(a
, b
, status
);
6314 if ( bExp
== 0x7FFF ) {
6315 if ((uint64_t)(bSig
<< 1)) {
6316 return propagateFloatx80NaN(a
, b
, status
);
6318 if (aExp
== 0 && aSig0
>> 63) {
6320 * Pseudo-denormal argument must be returned in normalized
6323 return packFloatx80(aSign
, 1, aSig0
);
6330 float_raise(float_flag_invalid
, status
);
6331 return floatx80_default_nan(status
);
6333 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6336 if ( aSig0
== 0 ) return a
;
6337 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6340 expDiff
= aExp
- bExp
;
6342 if ( expDiff
< 0 ) {
6343 if ( mod
|| expDiff
< -1 ) {
6344 if (aExp
== 1 && aExpOrig
== 0) {
6346 * Pseudo-denormal argument must be returned in
6349 return packFloatx80(aSign
, aExp
, aSig0
);
6353 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6356 *quotient
= q
= ( bSig
<= aSig0
);
6357 if ( q
) aSig0
-= bSig
;
6359 while ( 0 < expDiff
) {
6360 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6361 q
= ( 2 < q
) ? q
- 2 : 0;
6362 mul64To128( bSig
, q
, &term0
, &term1
);
6363 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6364 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6370 if ( 0 < expDiff
) {
6371 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6372 q
= ( 2 < q
) ? q
- 2 : 0;
6374 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6375 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6376 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6377 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6379 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6382 *quotient
<<= expDiff
;
6393 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6394 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6395 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6398 aSig0
= alternateASig0
;
6399 aSig1
= alternateASig1
;
6405 normalizeRoundAndPackFloatx80(
6406 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6410 /*----------------------------------------------------------------------------
6411 | Returns the remainder of the extended double-precision floating-point value
6412 | `a' with respect to the corresponding value `b'. The operation is performed
6413 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6414 *----------------------------------------------------------------------------*/
6416 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6419 return floatx80_modrem(a
, b
, false, "ient
, status
);
6422 /*----------------------------------------------------------------------------
6423 | Returns the remainder of the extended double-precision floating-point value
6424 | `a' with respect to the corresponding value `b', with the quotient truncated
6426 *----------------------------------------------------------------------------*/
6428 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6431 return floatx80_modrem(a
, b
, true, "ient
, status
);
6434 /*----------------------------------------------------------------------------
6435 | Returns the square root of the extended double-precision floating-point
6436 | value `a'. The operation is performed according to the IEC/IEEE Standard
6437 | for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6440 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6444 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6445 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6447 if (floatx80_invalid_encoding(a
)) {
6448 float_raise(float_flag_invalid
, status
);
6449 return floatx80_default_nan(status
);
6451 aSig0
= extractFloatx80Frac( a
);
6452 aExp
= extractFloatx80Exp( a
);
6453 aSign
= extractFloatx80Sign( a
);
6454 if ( aExp
== 0x7FFF ) {
6455 if ((uint64_t)(aSig0
<< 1)) {
6456 return propagateFloatx80NaN(a
, a
, status
);
6458 if ( ! aSign
) return a
;
6462 if ( ( aExp
| aSig0
) == 0 ) return a
;
6464 float_raise(float_flag_invalid
, status
);
6465 return floatx80_default_nan(status
);
6468 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6469 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6471 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6472 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6473 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6474 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6475 doubleZSig0
= zSig0
<<1;
6476 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6477 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6478 while ( (int64_t) rem0
< 0 ) {
6481 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6483 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6484 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6485 if ( zSig1
== 0 ) zSig1
= 1;
6486 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6487 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6488 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6489 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6490 while ( (int64_t) rem1
< 0 ) {
6492 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6494 term2
|= doubleZSig0
;
6495 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6497 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6499 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6500 zSig0
|= doubleZSig0
;
6501 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6502 0, zExp
, zSig0
, zSig1
, status
);
6505 /*----------------------------------------------------------------------------
6506 | Returns the result of converting the quadruple-precision floating-point
6507 | value `a' to the 32-bit two's complement integer format. The conversion
6508 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6509 | Arithmetic---which means in particular that the conversion is rounded
6510 | according to the current rounding mode. If `a' is a NaN, the largest
6511 | positive integer is returned. Otherwise, if the conversion overflows, the
6512 | largest integer with the same sign as `a' is returned.
6513 *----------------------------------------------------------------------------*/
6515 int32_t float128_to_int32(float128 a
, float_status
*status
)
6518 int32_t aExp
, shiftCount
;
6519 uint64_t aSig0
, aSig1
;
6521 aSig1
= extractFloat128Frac1( a
);
6522 aSig0
= extractFloat128Frac0( a
);
6523 aExp
= extractFloat128Exp( a
);
6524 aSign
= extractFloat128Sign( a
);
6525 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6526 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6527 aSig0
|= ( aSig1
!= 0 );
6528 shiftCount
= 0x4028 - aExp
;
6529 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6530 return roundAndPackInt32(aSign
, aSig0
, status
);
6534 /*----------------------------------------------------------------------------
6535 | Returns the result of converting the quadruple-precision floating-point
6536 | value `a' to the 32-bit two's complement integer format. The conversion
6537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6538 | Arithmetic, except that the conversion is always rounded toward zero. If
6539 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6540 | conversion overflows, the largest integer with the same sign as `a' is
6542 *----------------------------------------------------------------------------*/
6544 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6547 int32_t aExp
, shiftCount
;
6548 uint64_t aSig0
, aSig1
, savedASig
;
6551 aSig1
= extractFloat128Frac1( a
);
6552 aSig0
= extractFloat128Frac0( a
);
6553 aExp
= extractFloat128Exp( a
);
6554 aSign
= extractFloat128Sign( a
);
6555 aSig0
|= ( aSig1
!= 0 );
6556 if ( 0x401E < aExp
) {
6557 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6560 else if ( aExp
< 0x3FFF ) {
6561 if (aExp
|| aSig0
) {
6562 float_raise(float_flag_inexact
, status
);
6566 aSig0
|= UINT64_C(0x0001000000000000);
6567 shiftCount
= 0x402F - aExp
;
6569 aSig0
>>= shiftCount
;
6571 if ( aSign
) z
= - z
;
6572 if ( ( z
< 0 ) ^ aSign
) {
6574 float_raise(float_flag_invalid
, status
);
6575 return aSign
? INT32_MIN
: INT32_MAX
;
6577 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6578 float_raise(float_flag_inexact
, status
);
6584 /*----------------------------------------------------------------------------
6585 | Returns the result of converting the quadruple-precision floating-point
6586 | value `a' to the 64-bit two's complement integer format. The conversion
6587 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6588 | Arithmetic---which means in particular that the conversion is rounded
6589 | according to the current rounding mode. If `a' is a NaN, the largest
6590 | positive integer is returned. Otherwise, if the conversion overflows, the
6591 | largest integer with the same sign as `a' is returned.
6592 *----------------------------------------------------------------------------*/
6594 int64_t float128_to_int64(float128 a
, float_status
*status
)
6597 int32_t aExp
, shiftCount
;
6598 uint64_t aSig0
, aSig1
;
6600 aSig1
= extractFloat128Frac1( a
);
6601 aSig0
= extractFloat128Frac0( a
);
6602 aExp
= extractFloat128Exp( a
);
6603 aSign
= extractFloat128Sign( a
);
6604 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6605 shiftCount
= 0x402F - aExp
;
6606 if ( shiftCount
<= 0 ) {
6607 if ( 0x403E < aExp
) {
6608 float_raise(float_flag_invalid
, status
);
6610 || ( ( aExp
== 0x7FFF )
6611 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6618 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6621 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6623 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6627 /*----------------------------------------------------------------------------
6628 | Returns the result of converting the quadruple-precision floating-point
6629 | value `a' to the 64-bit two's complement integer format. The conversion
6630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6631 | Arithmetic, except that the conversion is always rounded toward zero.
6632 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6633 | the conversion overflows, the largest integer with the same sign as `a' is
6635 *----------------------------------------------------------------------------*/
6637 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6640 int32_t aExp
, shiftCount
;
6641 uint64_t aSig0
, aSig1
;
6644 aSig1
= extractFloat128Frac1( a
);
6645 aSig0
= extractFloat128Frac0( a
);
6646 aExp
= extractFloat128Exp( a
);
6647 aSign
= extractFloat128Sign( a
);
6648 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6649 shiftCount
= aExp
- 0x402F;
6650 if ( 0 < shiftCount
) {
6651 if ( 0x403E <= aExp
) {
6652 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6653 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6654 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6656 float_raise(float_flag_inexact
, status
);
6660 float_raise(float_flag_invalid
, status
);
6661 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6667 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6668 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6669 float_raise(float_flag_inexact
, status
);
6673 if ( aExp
< 0x3FFF ) {
6674 if ( aExp
| aSig0
| aSig1
) {
6675 float_raise(float_flag_inexact
, status
);
6679 z
= aSig0
>>( - shiftCount
);
6681 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6682 float_raise(float_flag_inexact
, status
);
6685 if ( aSign
) z
= - z
;
6690 /*----------------------------------------------------------------------------
6691 | Returns the result of converting the quadruple-precision floating-point value
6692 | `a' to the 64-bit unsigned integer format. The conversion is
6693 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6694 | Arithmetic---which means in particular that the conversion is rounded
6695 | according to the current rounding mode. If `a' is a NaN, the largest
6696 | positive integer is returned. If the conversion overflows, the
6697 | largest unsigned integer is returned. If 'a' is negative, the value is
6698 | rounded and zero is returned; negative values that do not round to zero
6699 | will raise the inexact exception.
6700 *----------------------------------------------------------------------------*/
6702 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6707 uint64_t aSig0
, aSig1
;
6709 aSig0
= extractFloat128Frac0(a
);
6710 aSig1
= extractFloat128Frac1(a
);
6711 aExp
= extractFloat128Exp(a
);
6712 aSign
= extractFloat128Sign(a
);
6713 if (aSign
&& (aExp
> 0x3FFE)) {
6714 float_raise(float_flag_invalid
, status
);
6715 if (float128_is_any_nan(a
)) {
6722 aSig0
|= UINT64_C(0x0001000000000000);
6724 shiftCount
= 0x402F - aExp
;
6725 if (shiftCount
<= 0) {
6726 if (0x403E < aExp
) {
6727 float_raise(float_flag_invalid
, status
);
6730 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6732 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6734 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6737 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6740 signed char current_rounding_mode
= status
->float_rounding_mode
;
6742 set_float_rounding_mode(float_round_to_zero
, status
);
6743 v
= float128_to_uint64(a
, status
);
6744 set_float_rounding_mode(current_rounding_mode
, status
);
6749 /*----------------------------------------------------------------------------
6750 | Returns the result of converting the quadruple-precision floating-point
6751 | value `a' to the 32-bit unsigned integer format. The conversion
6752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6753 | Arithmetic except that the conversion is always rounded toward zero.
6754 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6755 | if the conversion overflows, the largest unsigned integer is returned.
6756 | If 'a' is negative, the value is rounded and zero is returned; negative
6757 | values that do not round to zero will raise the inexact exception.
6758 *----------------------------------------------------------------------------*/
6760 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6764 int old_exc_flags
= get_float_exception_flags(status
);
6766 v
= float128_to_uint64_round_to_zero(a
, status
);
6767 if (v
> 0xffffffff) {
6772 set_float_exception_flags(old_exc_flags
, status
);
6773 float_raise(float_flag_invalid
, status
);
6777 /*----------------------------------------------------------------------------
6778 | Returns the result of converting the quadruple-precision floating-point value
6779 | `a' to the 32-bit unsigned integer format. The conversion is
6780 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6781 | Arithmetic---which means in particular that the conversion is rounded
6782 | according to the current rounding mode. If `a' is a NaN, the largest
6783 | positive integer is returned. If the conversion overflows, the
6784 | largest unsigned integer is returned. If 'a' is negative, the value is
6785 | rounded and zero is returned; negative values that do not round to zero
6786 | will raise the inexact exception.
6787 *----------------------------------------------------------------------------*/
6789 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6793 int old_exc_flags
= get_float_exception_flags(status
);
6795 v
= float128_to_uint64(a
, status
);
6796 if (v
> 0xffffffff) {
6801 set_float_exception_flags(old_exc_flags
, status
);
6802 float_raise(float_flag_invalid
, status
);
6806 /*----------------------------------------------------------------------------
6807 | Returns the result of converting the quadruple-precision floating-point
6808 | value `a' to the single-precision floating-point format. The conversion
6809 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6811 *----------------------------------------------------------------------------*/
6813 float32
float128_to_float32(float128 a
, float_status
*status
)
6817 uint64_t aSig0
, aSig1
;
6820 aSig1
= extractFloat128Frac1( a
);
6821 aSig0
= extractFloat128Frac0( a
);
6822 aExp
= extractFloat128Exp( a
);
6823 aSign
= extractFloat128Sign( a
);
6824 if ( aExp
== 0x7FFF ) {
6825 if ( aSig0
| aSig1
) {
6826 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6828 return packFloat32( aSign
, 0xFF, 0 );
6830 aSig0
|= ( aSig1
!= 0 );
6831 shift64RightJamming( aSig0
, 18, &aSig0
);
6833 if ( aExp
|| zSig
) {
6837 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
6848 float64
float128_to_float64(float128 a
, float_status
*status
)
6852 uint64_t aSig0
, aSig1
;
6854 aSig1
= extractFloat128Frac1( a
);
6855 aSig0
= extractFloat128Frac0( a
);
6856 aExp
= extractFloat128Exp( a
);
6857 aSign
= extractFloat128Sign( a
);
6858 if ( aExp
== 0x7FFF ) {
6859 if ( aSig0
| aSig1
) {
6860 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6862 return packFloat64( aSign
, 0x7FF, 0 );
6864 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6865 aSig0
|= ( aSig1
!= 0 );
6866 if ( aExp
|| aSig0
) {
6867 aSig0
|= UINT64_C(0x4000000000000000);
6870 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6874 /*----------------------------------------------------------------------------
6875 | Returns the result of converting the quadruple-precision floating-point
6876 | value `a' to the extended double-precision floating-point format. The
6877 | conversion is performed according to the IEC/IEEE Standard for Binary
6878 | Floating-Point Arithmetic.
6879 *----------------------------------------------------------------------------*/
6881 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6885 uint64_t aSig0
, aSig1
;
6887 aSig1
= extractFloat128Frac1( a
);
6888 aSig0
= extractFloat128Frac0( a
);
6889 aExp
= extractFloat128Exp( a
);
6890 aSign
= extractFloat128Sign( a
);
6891 if ( aExp
== 0x7FFF ) {
6892 if ( aSig0
| aSig1
) {
6893 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6895 return floatx80_silence_nan(res
, status
);
6897 return packFloatx80(aSign
, floatx80_infinity_high
,
6898 floatx80_infinity_low
);
6901 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6902 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6905 aSig0
|= UINT64_C(0x0001000000000000);
6907 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6908 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6912 /*----------------------------------------------------------------------------
6913 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6914 | returns the result as a quadruple-precision floating-point value. The
6915 | operation is performed according to the IEC/IEEE Standard for Binary
6916 | Floating-Point Arithmetic.
6917 *----------------------------------------------------------------------------*/
6919 float128
float128_round_to_int(float128 a
, float_status
*status
)
6923 uint64_t lastBitMask
, roundBitsMask
;
6926 aExp
= extractFloat128Exp( a
);
6927 if ( 0x402F <= aExp
) {
6928 if ( 0x406F <= aExp
) {
6929 if ( ( aExp
== 0x7FFF )
6930 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6932 return propagateFloat128NaN(a
, a
, status
);
6937 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6938 roundBitsMask
= lastBitMask
- 1;
6940 switch (status
->float_rounding_mode
) {
6941 case float_round_nearest_even
:
6942 if ( lastBitMask
) {
6943 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6944 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6947 if ( (int64_t) z
.low
< 0 ) {
6949 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6953 case float_round_ties_away
:
6955 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6957 if ((int64_t) z
.low
< 0) {
6962 case float_round_to_zero
:
6964 case float_round_up
:
6965 if (!extractFloat128Sign(z
)) {
6966 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6969 case float_round_down
:
6970 if (extractFloat128Sign(z
)) {
6971 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6974 case float_round_to_odd
:
6976 * Note that if lastBitMask == 0, the last bit is the lsb
6977 * of high, and roundBitsMask == -1.
6979 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6980 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6986 z
.low
&= ~ roundBitsMask
;
6989 if ( aExp
< 0x3FFF ) {
6990 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6991 float_raise(float_flag_inexact
, status
);
6992 aSign
= extractFloat128Sign( a
);
6993 switch (status
->float_rounding_mode
) {
6994 case float_round_nearest_even
:
6995 if ( ( aExp
== 0x3FFE )
6996 && ( extractFloat128Frac0( a
)
6997 | extractFloat128Frac1( a
) )
6999 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7002 case float_round_ties_away
:
7003 if (aExp
== 0x3FFE) {
7004 return packFloat128(aSign
, 0x3FFF, 0, 0);
7007 case float_round_down
:
7009 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7010 : packFloat128( 0, 0, 0, 0 );
7011 case float_round_up
:
7013 aSign
? packFloat128( 1, 0, 0, 0 )
7014 : packFloat128( 0, 0x3FFF, 0, 0 );
7016 case float_round_to_odd
:
7017 return packFloat128(aSign
, 0x3FFF, 0, 0);
7019 case float_round_to_zero
:
7022 return packFloat128( aSign
, 0, 0, 0 );
7025 lastBitMask
<<= 0x402F - aExp
;
7026 roundBitsMask
= lastBitMask
- 1;
7029 switch (status
->float_rounding_mode
) {
7030 case float_round_nearest_even
:
7031 z
.high
+= lastBitMask
>>1;
7032 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7033 z
.high
&= ~ lastBitMask
;
7036 case float_round_ties_away
:
7037 z
.high
+= lastBitMask
>>1;
7039 case float_round_to_zero
:
7041 case float_round_up
:
7042 if (!extractFloat128Sign(z
)) {
7043 z
.high
|= ( a
.low
!= 0 );
7044 z
.high
+= roundBitsMask
;
7047 case float_round_down
:
7048 if (extractFloat128Sign(z
)) {
7049 z
.high
|= (a
.low
!= 0);
7050 z
.high
+= roundBitsMask
;
7053 case float_round_to_odd
:
7054 if ((z
.high
& lastBitMask
) == 0) {
7055 z
.high
|= (a
.low
!= 0);
7056 z
.high
+= roundBitsMask
;
7062 z
.high
&= ~ roundBitsMask
;
7064 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7065 float_raise(float_flag_inexact
, status
);
7071 /*----------------------------------------------------------------------------
7072 | Returns the result of multiplying the quadruple-precision floating-point
7073 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7074 | Standard for Binary Floating-Point Arithmetic.
7075 *----------------------------------------------------------------------------*/
7077 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7079 bool aSign
, bSign
, zSign
;
7080 int32_t aExp
, bExp
, zExp
;
7081 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7083 aSig1
= extractFloat128Frac1( a
);
7084 aSig0
= extractFloat128Frac0( a
);
7085 aExp
= extractFloat128Exp( a
);
7086 aSign
= extractFloat128Sign( a
);
7087 bSig1
= extractFloat128Frac1( b
);
7088 bSig0
= extractFloat128Frac0( b
);
7089 bExp
= extractFloat128Exp( b
);
7090 bSign
= extractFloat128Sign( b
);
7091 zSign
= aSign
^ bSign
;
7092 if ( aExp
== 0x7FFF ) {
7093 if ( ( aSig0
| aSig1
)
7094 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7095 return propagateFloat128NaN(a
, b
, status
);
7097 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7098 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7100 if ( bExp
== 0x7FFF ) {
7101 if (bSig0
| bSig1
) {
7102 return propagateFloat128NaN(a
, b
, status
);
7104 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7106 float_raise(float_flag_invalid
, status
);
7107 return float128_default_nan(status
);
7109 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7112 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7113 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7116 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7117 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7119 zExp
= aExp
+ bExp
- 0x4000;
7120 aSig0
|= UINT64_C(0x0001000000000000);
7121 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7122 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7123 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7124 zSig2
|= ( zSig3
!= 0 );
7125 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7126 shift128ExtraRightJamming(
7127 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7130 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of dividing the quadruple-precision floating-point value
7136 | `a' by the corresponding value `b'. The operation is performed according to
7137 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7140 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7142 bool aSign
, bSign
, zSign
;
7143 int32_t aExp
, bExp
, zExp
;
7144 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7145 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7147 aSig1
= extractFloat128Frac1( a
);
7148 aSig0
= extractFloat128Frac0( a
);
7149 aExp
= extractFloat128Exp( a
);
7150 aSign
= extractFloat128Sign( a
);
7151 bSig1
= extractFloat128Frac1( b
);
7152 bSig0
= extractFloat128Frac0( b
);
7153 bExp
= extractFloat128Exp( b
);
7154 bSign
= extractFloat128Sign( b
);
7155 zSign
= aSign
^ bSign
;
7156 if ( aExp
== 0x7FFF ) {
7157 if (aSig0
| aSig1
) {
7158 return propagateFloat128NaN(a
, b
, status
);
7160 if ( bExp
== 0x7FFF ) {
7161 if (bSig0
| bSig1
) {
7162 return propagateFloat128NaN(a
, b
, status
);
7166 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7168 if ( bExp
== 0x7FFF ) {
7169 if (bSig0
| bSig1
) {
7170 return propagateFloat128NaN(a
, b
, status
);
7172 return packFloat128( zSign
, 0, 0, 0 );
7175 if ( ( bSig0
| bSig1
) == 0 ) {
7176 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7178 float_raise(float_flag_invalid
, status
);
7179 return float128_default_nan(status
);
7181 float_raise(float_flag_divbyzero
, status
);
7182 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7184 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7187 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7188 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7190 zExp
= aExp
- bExp
+ 0x3FFD;
7192 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7194 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7195 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7196 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7199 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7200 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7201 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7202 while ( (int64_t) rem0
< 0 ) {
7204 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7206 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7207 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7208 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7209 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7210 while ( (int64_t) rem1
< 0 ) {
7212 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7214 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7216 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7217 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7221 /*----------------------------------------------------------------------------
7222 | Returns the remainder of the quadruple-precision floating-point value `a'
7223 | with respect to the corresponding value `b'. The operation is performed
7224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7225 *----------------------------------------------------------------------------*/
7227 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7230 int32_t aExp
, bExp
, expDiff
;
7231 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7232 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7235 aSig1
= extractFloat128Frac1( a
);
7236 aSig0
= extractFloat128Frac0( a
);
7237 aExp
= extractFloat128Exp( a
);
7238 aSign
= extractFloat128Sign( a
);
7239 bSig1
= extractFloat128Frac1( b
);
7240 bSig0
= extractFloat128Frac0( b
);
7241 bExp
= extractFloat128Exp( b
);
7242 if ( aExp
== 0x7FFF ) {
7243 if ( ( aSig0
| aSig1
)
7244 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7245 return propagateFloat128NaN(a
, b
, status
);
7249 if ( bExp
== 0x7FFF ) {
7250 if (bSig0
| bSig1
) {
7251 return propagateFloat128NaN(a
, b
, status
);
7256 if ( ( bSig0
| bSig1
) == 0 ) {
7258 float_raise(float_flag_invalid
, status
);
7259 return float128_default_nan(status
);
7261 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7264 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7265 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7267 expDiff
= aExp
- bExp
;
7268 if ( expDiff
< -1 ) return a
;
7270 aSig0
| UINT64_C(0x0001000000000000),
7272 15 - ( expDiff
< 0 ),
7277 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7278 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7279 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7281 while ( 0 < expDiff
) {
7282 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7283 q
= ( 4 < q
) ? q
- 4 : 0;
7284 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7285 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7286 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7287 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7290 if ( -64 < expDiff
) {
7291 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7292 q
= ( 4 < q
) ? q
- 4 : 0;
7294 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7296 if ( expDiff
< 0 ) {
7297 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7300 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7302 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7303 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7306 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7307 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7310 alternateASig0
= aSig0
;
7311 alternateASig1
= aSig1
;
7313 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7314 } while ( 0 <= (int64_t) aSig0
);
7316 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7317 if ( ( sigMean0
< 0 )
7318 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7319 aSig0
= alternateASig0
;
7320 aSig1
= alternateASig1
;
7322 zSign
= ( (int64_t) aSig0
< 0 );
7323 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7324 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7328 /*----------------------------------------------------------------------------
7329 | Returns the square root of the quadruple-precision floating-point value `a'.
7330 | The operation is performed according to the IEC/IEEE Standard for Binary
7331 | Floating-Point Arithmetic.
7332 *----------------------------------------------------------------------------*/
7334 float128
float128_sqrt(float128 a
, float_status
*status
)
7338 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7339 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7341 aSig1
= extractFloat128Frac1( a
);
7342 aSig0
= extractFloat128Frac0( a
);
7343 aExp
= extractFloat128Exp( a
);
7344 aSign
= extractFloat128Sign( a
);
7345 if ( aExp
== 0x7FFF ) {
7346 if (aSig0
| aSig1
) {
7347 return propagateFloat128NaN(a
, a
, status
);
7349 if ( ! aSign
) return a
;
7353 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7355 float_raise(float_flag_invalid
, status
);
7356 return float128_default_nan(status
);
7359 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7360 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7362 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7363 aSig0
|= UINT64_C(0x0001000000000000);
7364 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7365 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7366 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7367 doubleZSig0
= zSig0
<<1;
7368 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7369 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7370 while ( (int64_t) rem0
< 0 ) {
7373 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7375 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7376 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7377 if ( zSig1
== 0 ) zSig1
= 1;
7378 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7379 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7380 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7381 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7382 while ( (int64_t) rem1
< 0 ) {
7384 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7386 term2
|= doubleZSig0
;
7387 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7389 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7391 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7392 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7396 static inline FloatRelation
7397 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7398 float_status
*status
)
7402 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7403 float_raise(float_flag_invalid
, status
);
7404 return float_relation_unordered
;
7406 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7407 ( extractFloatx80Frac( a
)<<1 ) ) ||
7408 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7409 ( extractFloatx80Frac( b
)<<1 ) )) {
7411 floatx80_is_signaling_nan(a
, status
) ||
7412 floatx80_is_signaling_nan(b
, status
)) {
7413 float_raise(float_flag_invalid
, status
);
7415 return float_relation_unordered
;
7417 aSign
= extractFloatx80Sign( a
);
7418 bSign
= extractFloatx80Sign( b
);
7419 if ( aSign
!= bSign
) {
7421 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7422 ( ( a
.low
| b
.low
) == 0 ) ) {
7424 return float_relation_equal
;
7426 return 1 - (2 * aSign
);
7429 /* Normalize pseudo-denormals before comparison. */
7430 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7433 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7436 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7437 return float_relation_equal
;
7439 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7444 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7446 return floatx80_compare_internal(a
, b
, 0, status
);
7449 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7450 float_status
*status
)
7452 return floatx80_compare_internal(a
, b
, 1, status
);
7455 static inline FloatRelation
7456 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7457 float_status
*status
)
7461 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7462 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7463 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7464 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7466 float128_is_signaling_nan(a
, status
) ||
7467 float128_is_signaling_nan(b
, status
)) {
7468 float_raise(float_flag_invalid
, status
);
7470 return float_relation_unordered
;
7472 aSign
= extractFloat128Sign( a
);
7473 bSign
= extractFloat128Sign( b
);
7474 if ( aSign
!= bSign
) {
7475 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7477 return float_relation_equal
;
7479 return 1 - (2 * aSign
);
7482 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7483 return float_relation_equal
;
7485 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7490 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7492 return float128_compare_internal(a
, b
, 0, status
);
7495 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7496 float_status
*status
)
7498 return float128_compare_internal(a
, b
, 1, status
);
7501 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7507 if (floatx80_invalid_encoding(a
)) {
7508 float_raise(float_flag_invalid
, status
);
7509 return floatx80_default_nan(status
);
7511 aSig
= extractFloatx80Frac( a
);
7512 aExp
= extractFloatx80Exp( a
);
7513 aSign
= extractFloatx80Sign( a
);
7515 if ( aExp
== 0x7FFF ) {
7517 return propagateFloatx80NaN(a
, a
, status
);
7531 } else if (n
< -0x10000) {
7536 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7537 aSign
, aExp
, aSig
, 0, status
);
7540 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7544 uint64_t aSig0
, aSig1
;
7546 aSig1
= extractFloat128Frac1( a
);
7547 aSig0
= extractFloat128Frac0( a
);
7548 aExp
= extractFloat128Exp( a
);
7549 aSign
= extractFloat128Sign( a
);
7550 if ( aExp
== 0x7FFF ) {
7551 if ( aSig0
| aSig1
) {
7552 return propagateFloat128NaN(a
, a
, status
);
7557 aSig0
|= UINT64_C(0x0001000000000000);
7558 } else if (aSig0
== 0 && aSig1
== 0) {
7566 } else if (n
< -0x10000) {
7571 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7576 static void __attribute__((constructor
)) softfloat_init(void)
7578 union_float64 ua
, ub
, uc
, ur
;
7580 if (QEMU_NO_HARDFLOAT
) {
7584 * Test that the host's FMA is not obviously broken. For example,
7585 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7586 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7588 ua
.s
= 0x0020000000000001ULL
;
7589 ub
.s
= 0x3ca0000000000000ULL
;
7590 uc
.s
= 0x0020000000000000ULL
;
7591 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7592 if (ur
.s
!= 0x0020000000000001ULL
) {
7593 force_soft_fma
= true;