4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

#define float_cmask(bit) (1u << (bit))

/* Bit masks over FloatClass, for testing several classes at once. */
enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float. The
504 * exponent is unbiased and the fraction is normalized. All
505 * calculations are done with a 64 bit fraction and then rounded as
506 * appropriate for the final format.
508 * Thanks to the packed FloatClass a decent compiler should be able to
509 * fit the whole structure into registers and avoid using the stack
510 * for parameter passing.
520 #define DECOMPOSED_BINARY_POINT 63
521 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F) \
552 .exp_bias = ((1 << E) - 1) >> 1, \
553 .exp_max = (1 << E) - 1, \
555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt bfloat16_params
= {
574 static const FloatFmt float32_params
= {
578 static const FloatFmt float64_params
= {
582 /* Unpack a float to parts, but do not canonicalize. */
583 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
585 const int f_size
= fmt
->frac_size
;
586 const int e_size
= fmt
->exp_size
;
588 *r
= (FloatParts64
) {
589 .cls
= float_class_unclassified
,
590 .sign
= extract64(raw
, f_size
+ e_size
, 1),
591 .exp
= extract64(raw
, f_size
, e_size
),
592 .frac
= extract64(raw
, 0, f_size
)
596 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
598 unpack_raw64(p
, &float16_params
, f
);
601 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
603 unpack_raw64(p
, &bfloat16_params
, f
);
606 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
608 unpack_raw64(p
, &float32_params
, f
);
611 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
613 unpack_raw64(p
, &float64_params
, f
);
616 /* Pack a float from parts, but do not canonicalize. */
617 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
619 const int f_size
= fmt
->frac_size
;
620 const int e_size
= fmt
->exp_size
;
623 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
624 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
625 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
629 static inline float16
float16_pack_raw(FloatParts64 p
)
631 return make_float16(pack_raw64(&p
, &float16_params
));
634 static inline bfloat16
bfloat16_pack_raw(FloatParts64 p
)
636 return pack_raw64(&p
, &bfloat16_params
);
639 static inline float32
float32_pack_raw(FloatParts64 p
)
641 return make_float32(pack_raw64(&p
, &float32_params
));
644 static inline float64
float64_pack_raw(FloatParts64 p
)
646 return make_float64(pack_raw64(&p
, &float64_params
));
649 /*----------------------------------------------------------------------------
650 | Functions and definitions to determine: (1) whether tininess for underflow
651 | is detected before or after rounding by default, (2) what (if anything)
652 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
653 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
654 | are propagated from function inputs to output. These details are target-
656 *----------------------------------------------------------------------------*/
657 #include "softfloat-specialize.c.inc"
659 #define parts_default_nan parts64_default_nan
661 /* Canonicalize EXP and FRAC, setting CLS. */
662 static FloatParts64
sf_canonicalize(FloatParts64 part
, const FloatFmt
*parm
,
663 float_status
*status
)
665 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
666 if (part
.frac
== 0) {
667 part
.cls
= float_class_inf
;
669 part
.frac
<<= parm
->frac_shift
;
670 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
671 ? float_class_snan
: float_class_qnan
);
673 } else if (part
.exp
== 0) {
674 if (likely(part
.frac
== 0)) {
675 part
.cls
= float_class_zero
;
676 } else if (status
->flush_inputs_to_zero
) {
677 float_raise(float_flag_input_denormal
, status
);
678 part
.cls
= float_class_zero
;
681 int shift
= clz64(part
.frac
);
682 part
.cls
= float_class_normal
;
683 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
687 part
.cls
= float_class_normal
;
688 part
.exp
-= parm
->exp_bias
;
689 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
694 /* Round and uncanonicalize a floating-point number by parts. There
695 * are FRAC_SHIFT bits that may require rounding at the bottom of the
696 * fraction; these bits will be removed. The exponent will be biased
697 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
700 static FloatParts64
round_canonical(FloatParts64 p
, float_status
*s
,
701 const FloatFmt
*parm
)
703 const uint64_t frac_lsb
= parm
->frac_lsb
;
704 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
705 const uint64_t round_mask
= parm
->round_mask
;
706 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
707 const int exp_max
= parm
->exp_max
;
708 const int frac_shift
= parm
->frac_shift
;
717 case float_class_normal
:
718 switch (s
->float_rounding_mode
) {
719 case float_round_nearest_even
:
720 overflow_norm
= false;
721 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
723 case float_round_ties_away
:
724 overflow_norm
= false;
727 case float_round_to_zero
:
728 overflow_norm
= true;
732 inc
= p
.sign
? 0 : round_mask
;
733 overflow_norm
= p
.sign
;
735 case float_round_down
:
736 inc
= p
.sign
? round_mask
: 0;
737 overflow_norm
= !p
.sign
;
739 case float_round_to_odd
:
740 overflow_norm
= true;
741 inc
= frac
& frac_lsb
? 0 : round_mask
;
744 g_assert_not_reached();
747 exp
+= parm
->exp_bias
;
748 if (likely(exp
> 0)) {
749 if (frac
& round_mask
) {
750 flags
|= float_flag_inexact
;
751 if (uadd64_overflow(frac
, inc
, &frac
)) {
752 frac
= (frac
>> 1) | DECOMPOSED_IMPLICIT_BIT
;
758 if (parm
->arm_althp
) {
759 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
760 if (unlikely(exp
> exp_max
)) {
761 /* Overflow. Return the maximum normal. */
762 flags
= float_flag_invalid
;
766 } else if (unlikely(exp
>= exp_max
)) {
767 flags
|= float_flag_overflow
| float_flag_inexact
;
772 p
.cls
= float_class_inf
;
776 } else if (s
->flush_to_zero
) {
777 flags
|= float_flag_output_denormal
;
778 p
.cls
= float_class_zero
;
781 bool is_tiny
= s
->tininess_before_rounding
|| (exp
< 0);
785 is_tiny
= !uadd64_overflow(frac
, inc
, &discard
);
788 shift64RightJamming(frac
, 1 - exp
, &frac
);
789 if (frac
& round_mask
) {
790 /* Need to recompute round-to-even. */
791 switch (s
->float_rounding_mode
) {
792 case float_round_nearest_even
:
793 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
796 case float_round_to_odd
:
797 inc
= frac
& frac_lsb
? 0 : round_mask
;
802 flags
|= float_flag_inexact
;
806 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
809 if (is_tiny
&& (flags
& float_flag_inexact
)) {
810 flags
|= float_flag_underflow
;
812 if (exp
== 0 && frac
== 0) {
813 p
.cls
= float_class_zero
;
818 case float_class_zero
:
824 case float_class_inf
:
826 assert(!parm
->arm_althp
);
831 case float_class_qnan
:
832 case float_class_snan
:
833 assert(!parm
->arm_althp
);
835 frac
>>= parm
->frac_shift
;
839 g_assert_not_reached();
842 float_raise(flags
, s
);
848 static FloatParts64
return_nan(FloatParts64 a
, float_status
*s
)
850 g_assert(is_nan(a
.cls
));
851 if (is_snan(a
.cls
)) {
852 float_raise(float_flag_invalid
, s
);
853 if (!s
->default_nan_mode
) {
854 return parts_silence_nan(a
, s
);
856 } else if (!s
->default_nan_mode
) {
859 parts_default_nan(&a
, s
);
863 static FloatParts64
pick_nan(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
865 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
866 float_raise(float_flag_invalid
, s
);
869 if (s
->default_nan_mode
) {
870 parts_default_nan(&a
, s
);
872 if (pickNaN(a
.cls
, b
.cls
,
874 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
877 if (is_snan(a
.cls
)) {
878 return parts_silence_nan(a
, s
);
884 static FloatParts64
pick_nan_muladd(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
885 bool inf_zero
, float_status
*s
)
889 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
890 float_raise(float_flag_invalid
, s
);
893 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
895 if (s
->default_nan_mode
) {
896 /* Note that this check is after pickNaNMulAdd so that function
897 * has an opportunity to set the Invalid flag.
912 parts_default_nan(&a
, s
);
915 g_assert_not_reached();
918 if (is_snan(a
.cls
)) {
919 return parts_silence_nan(a
, s
);
925 * Pack/unpack routines with a specific FloatFmt.
928 static FloatParts64
float16a_unpack_canonical(float16 f
, float_status
*s
,
929 const FloatFmt
*params
)
933 float16_unpack_raw(&p
, f
);
934 return sf_canonicalize(p
, params
, s
);
937 static FloatParts64
float16_unpack_canonical(float16 f
, float_status
*s
)
939 return float16a_unpack_canonical(f
, s
, &float16_params
);
942 static FloatParts64
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
946 bfloat16_unpack_raw(&p
, f
);
947 return sf_canonicalize(p
, &bfloat16_params
, s
);
950 static float16
float16a_round_pack_canonical(FloatParts64 p
, float_status
*s
,
951 const FloatFmt
*params
)
953 return float16_pack_raw(round_canonical(p
, s
, params
));
956 static float16
float16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
958 return float16a_round_pack_canonical(p
, s
, &float16_params
);
961 static bfloat16
bfloat16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
963 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
966 static FloatParts64
float32_unpack_canonical(float32 f
, float_status
*s
)
970 float32_unpack_raw(&p
, f
);
971 return sf_canonicalize(p
, &float32_params
, s
);
974 static float32
float32_round_pack_canonical(FloatParts64 p
, float_status
*s
)
976 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
979 static FloatParts64
float64_unpack_canonical(float64 f
, float_status
*s
)
983 float64_unpack_raw(&p
, f
);
984 return sf_canonicalize(p
, &float64_params
, s
);
987 static float64
float64_round_pack_canonical(FloatParts64 p
, float_status
*s
)
989 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
993 * Returns the result of adding or subtracting the values of the
994 * floating-point values `a' and `b'. The operation is performed
995 * according to the IEC/IEEE Standard for Binary Floating-Point
999 static FloatParts64
addsub_floats(FloatParts64 a
, FloatParts64 b
, bool subtract
,
1002 bool a_sign
= a
.sign
;
1003 bool b_sign
= b
.sign
^ subtract
;
1005 if (a_sign
!= b_sign
) {
1008 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1009 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
1010 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1011 a
.frac
= a
.frac
- b
.frac
;
1013 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1014 a
.frac
= b
.frac
- a
.frac
;
1020 a
.cls
= float_class_zero
;
1021 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1023 int shift
= clz64(a
.frac
);
1024 a
.frac
= a
.frac
<< shift
;
1025 a
.exp
= a
.exp
- shift
;
1030 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1031 return pick_nan(a
, b
, s
);
1033 if (a
.cls
== float_class_inf
) {
1034 if (b
.cls
== float_class_inf
) {
1035 float_raise(float_flag_invalid
, s
);
1036 parts_default_nan(&a
, s
);
1040 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1041 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1044 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1045 b
.sign
= a_sign
^ 1;
1048 if (b
.cls
== float_class_zero
) {
1053 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1054 if (a
.exp
> b
.exp
) {
1055 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1056 } else if (a
.exp
< b
.exp
) {
1057 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1061 if (uadd64_overflow(a
.frac
, b
.frac
, &a
.frac
)) {
1062 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1063 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
1068 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1069 return pick_nan(a
, b
, s
);
1071 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1074 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1079 g_assert_not_reached();
1083 * Returns the result of adding or subtracting the floating-point
1084 * values `a' and `b'. The operation is performed according to the
1085 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1088 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1090 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1091 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1092 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1094 return float16_round_pack_canonical(pr
, status
);
1097 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1099 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1100 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1101 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1103 return float16_round_pack_canonical(pr
, status
);
1106 static float32 QEMU_SOFTFLOAT_ATTR
1107 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1109 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1110 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1111 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1113 return float32_round_pack_canonical(pr
, status
);
1116 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1118 return soft_f32_addsub(a
, b
, false, status
);
1121 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1123 return soft_f32_addsub(a
, b
, true, status
);
1126 static float64 QEMU_SOFTFLOAT_ATTR
1127 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1129 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1130 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1131 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1133 return float64_round_pack_canonical(pr
, status
);
1136 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1138 return soft_f64_addsub(a
, b
, false, status
);
1141 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1143 return soft_f64_addsub(a
, b
, true, status
);
1146 static float hard_f32_add(float a
, float b
)
1151 static float hard_f32_sub(float a
, float b
)
1156 static double hard_f64_add(double a
, double b
)
1161 static double hard_f64_sub(double a
, double b
)
1166 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1168 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1169 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1171 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1174 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1176 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1177 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1179 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1183 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1184 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1186 return float32_gen2(a
, b
, s
, hard
, soft
,
1187 f32_is_zon2
, f32_addsubmul_post
);
1190 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1191 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1193 return float64_gen2(a
, b
, s
, hard
, soft
,
1194 f64_is_zon2
, f64_addsubmul_post
);
1197 float32 QEMU_FLATTEN
1198 float32_add(float32 a
, float32 b
, float_status
*s
)
1200 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1203 float32 QEMU_FLATTEN
1204 float32_sub(float32 a
, float32 b
, float_status
*s
)
1206 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1209 float64 QEMU_FLATTEN
1210 float64_add(float64 a
, float64 b
, float_status
*s
)
1212 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1215 float64 QEMU_FLATTEN
1216 float64_sub(float64 a
, float64 b
, float_status
*s
)
1218 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1222 * Returns the result of adding or subtracting the bfloat16
1223 * values `a' and `b'.
1225 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1227 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1228 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1229 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1231 return bfloat16_round_pack_canonical(pr
, status
);
1234 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1236 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1237 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1238 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1240 return bfloat16_round_pack_canonical(pr
, status
);
1244 * Returns the result of multiplying the floating-point values `a' and
1245 * `b'. The operation is performed according to the IEC/IEEE Standard
1246 * for Binary Floating-Point Arithmetic.
1249 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1251 bool sign
= a
.sign
^ b
.sign
;
1253 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1255 int exp
= a
.exp
+ b
.exp
;
1257 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1258 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1271 /* handle all the NaN cases */
1272 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1273 return pick_nan(a
, b
, s
);
1275 /* Inf * Zero == NaN */
1276 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1277 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1278 float_raise(float_flag_invalid
, s
);
1279 parts_default_nan(&a
, s
);
1282 /* Multiply by 0 or Inf */
1283 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1287 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1291 g_assert_not_reached();
1294 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1296 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1297 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1298 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1300 return float16_round_pack_canonical(pr
, status
);
1303 static float32 QEMU_SOFTFLOAT_ATTR
1304 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1306 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1307 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1308 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1310 return float32_round_pack_canonical(pr
, status
);
1313 static float64 QEMU_SOFTFLOAT_ATTR
1314 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1316 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1317 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1318 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1320 return float64_round_pack_canonical(pr
, status
);
1323 static float hard_f32_mul(float a
, float b
)
1328 static double hard_f64_mul(double a
, double b
)
1333 float32 QEMU_FLATTEN
1334 float32_mul(float32 a
, float32 b
, float_status
*s
)
1336 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1337 f32_is_zon2
, f32_addsubmul_post
);
1340 float64 QEMU_FLATTEN
1341 float64_mul(float64 a
, float64 b
, float_status
*s
)
1343 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1344 f64_is_zon2
, f64_addsubmul_post
);
1348 * Returns the result of multiplying the bfloat16
1349 * values `a' and `b'.
1352 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1354 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1355 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1356 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1358 return bfloat16_round_pack_canonical(pr
, status
);
1362 * Returns the result of multiplying the floating-point values `a' and
1363 * `b' then adding 'c', with no intermediate rounding step after the
1364 * multiplication. The operation is performed according to the
1365 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1366 * The flags argument allows the caller to select negation of the
1367 * addend, the intermediate product, or the final result. (The
1368 * difference between this and having the caller do a separate
1369 * negation is that negating externally will flip the sign bit on
1373 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1374 int flags
, float_status
*s
)
1376 bool inf_zero
, p_sign
;
1377 bool sign_flip
= flags
& float_muladd_negate_result
;
1381 int ab_mask
, abc_mask
;
1383 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1384 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1385 inf_zero
= ab_mask
== float_cmask_infzero
;
1387 /* It is implementation-defined whether the cases of (0,inf,qnan)
1388 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1389 * they return if they do), so we have to hand this information
1390 * off to the target-specific pick-a-NaN routine.
1392 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1393 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1397 float_raise(float_flag_invalid
, s
);
1398 parts_default_nan(&a
, s
);
1402 if (flags
& float_muladd_negate_c
) {
1406 p_sign
= a
.sign
^ b
.sign
;
1408 if (flags
& float_muladd_negate_product
) {
1412 if (ab_mask
& float_cmask_inf
) {
1413 p_class
= float_class_inf
;
1414 } else if (ab_mask
& float_cmask_zero
) {
1415 p_class
= float_class_zero
;
1417 p_class
= float_class_normal
;
1420 if (c
.cls
== float_class_inf
) {
1421 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1422 float_raise(float_flag_invalid
, s
);
1423 parts_default_nan(&c
, s
);
1425 c
.sign
^= sign_flip
;
1430 if (p_class
== float_class_inf
) {
1431 a
.cls
= float_class_inf
;
1432 a
.sign
= p_sign
^ sign_flip
;
1436 if (p_class
== float_class_zero
) {
1437 if (c
.cls
== float_class_zero
) {
1438 if (p_sign
!= c
.sign
) {
1439 p_sign
= s
->float_rounding_mode
== float_round_down
;
1442 } else if (flags
& float_muladd_halve_result
) {
1445 c
.sign
^= sign_flip
;
1449 /* a & b should be normals now... */
1450 assert(a
.cls
== float_class_normal
&&
1451 b
.cls
== float_class_normal
);
1453 p_exp
= a
.exp
+ b
.exp
;
1455 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1457 /* Renormalize to the msb. */
1458 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1461 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1465 if (c
.cls
!= float_class_zero
) {
1466 int exp_diff
= p_exp
- c
.exp
;
1467 if (p_sign
== c
.sign
) {
1469 if (exp_diff
<= 0) {
1470 shift64RightJamming(hi
, -exp_diff
, &hi
);
1472 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1473 shift64RightJamming(hi
, 1, &hi
);
1474 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1478 uint64_t c_hi
, c_lo
, over
;
1479 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1480 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1482 shift64RightJamming(hi
, 1, &hi
);
1483 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1489 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1491 if (exp_diff
<= 0) {
1492 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1495 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1496 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1498 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1503 shift128RightJamming(c_hi
, c_lo
,
1506 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1509 if (hi
== 0 && lo
== 0) {
1510 a
.cls
= float_class_zero
;
1511 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1512 a
.sign
^= sign_flip
;
1519 shift
= clz64(lo
) + 64;
1521 /* Normalizing to a binary point of 124 is the
1522 correct adjust for the exponent. However since we're
1523 shifting, we might as well put the binary point back
1524 at 63 where we really want it. Therefore shift as
1525 if we're leaving 1 bit at the top of the word, but
1526 adjust the exponent as if we're leaving 3 bits. */
1527 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1534 if (flags
& float_muladd_halve_result
) {
1538 /* finally prepare our result */
1539 a
.cls
= float_class_normal
;
1540 a
.sign
= p_sign
^ sign_flip
;
1547 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1548 int flags
, float_status
*status
)
1550 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1551 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1552 FloatParts64 pc
= float16_unpack_canonical(c
, status
);
1553 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1555 return float16_round_pack_canonical(pr
, status
);
1558 static float32 QEMU_SOFTFLOAT_ATTR
1559 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1560 float_status
*status
)
1562 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1563 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1564 FloatParts64 pc
= float32_unpack_canonical(c
, status
);
1565 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1567 return float32_round_pack_canonical(pr
, status
);
1570 static float64 QEMU_SOFTFLOAT_ATTR
1571 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1572 float_status
*status
)
1574 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1575 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1576 FloatParts64 pc
= float64_unpack_canonical(c
, status
);
1577 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1579 return float64_round_pack_canonical(pr
, status
);
/* When true, the hardfloat fast path for fused multiply-add is bypassed
 * in favor of the softfloat implementation.
 * NOTE(review): not written anywhere in this chunk — presumably toggled
 * externally (e.g. for testing); confirm at its setters. */
static bool force_soft_fma;
1584 float32 QEMU_FLATTEN
1585 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1587 union_float32 ua
, ub
, uc
, ur
;
1593 if (unlikely(!can_use_fpu(s
))) {
1596 if (unlikely(flags
& float_muladd_halve_result
)) {
1600 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1601 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1605 if (unlikely(force_soft_fma
)) {
1610 * When (a || b) == 0, there's no need to check for under/over flow,
1611 * since we know the addend is (normal || 0) and the product is 0.
1613 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1617 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1618 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1619 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1621 if (flags
& float_muladd_negate_c
) {
1626 union_float32 ua_orig
= ua
;
1627 union_float32 uc_orig
= uc
;
1629 if (flags
& float_muladd_negate_product
) {
1632 if (flags
& float_muladd_negate_c
) {
1636 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1638 if (unlikely(f32_is_inf(ur
))) {
1639 float_raise(float_flag_overflow
, s
);
1640 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1646 if (flags
& float_muladd_negate_result
) {
1647 return float32_chs(ur
.s
);
1652 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1655 float64 QEMU_FLATTEN
1656 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1658 union_float64 ua
, ub
, uc
, ur
;
1664 if (unlikely(!can_use_fpu(s
))) {
1667 if (unlikely(flags
& float_muladd_halve_result
)) {
1671 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1672 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1676 if (unlikely(force_soft_fma
)) {
1681 * When (a || b) == 0, there's no need to check for under/over flow,
1682 * since we know the addend is (normal || 0) and the product is 0.
1684 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1688 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1689 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1690 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1692 if (flags
& float_muladd_negate_c
) {
1697 union_float64 ua_orig
= ua
;
1698 union_float64 uc_orig
= uc
;
1700 if (flags
& float_muladd_negate_product
) {
1703 if (flags
& float_muladd_negate_c
) {
1707 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1709 if (unlikely(f64_is_inf(ur
))) {
1710 float_raise(float_flag_overflow
, s
);
1711 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1717 if (flags
& float_muladd_negate_result
) {
1718 return float64_chs(ur
.s
);
1723 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1727 * Returns the result of multiplying the bfloat16 values `a'
1728 * and `b' then adding 'c', with no intermediate rounding step after the
1732 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1733 int flags
, float_status
*status
)
1735 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1736 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1737 FloatParts64 pc
= bfloat16_unpack_canonical(c
, status
);
1738 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1740 return bfloat16_round_pack_canonical(pr
, status
);
1744 * Returns the result of dividing the floating-point value `a' by the
1745 * corresponding value `b'. The operation is performed according to
1746 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1749 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1751 bool sign
= a
.sign
^ b
.sign
;
1753 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1754 uint64_t n0
, n1
, q
, r
;
1755 int exp
= a
.exp
- b
.exp
;
1758 * We want a 2*N / N-bit division to produce exactly an N-bit
1759 * result, so that we do not lose any precision and so that we
1760 * do not have to renormalize afterward. If A.frac < B.frac,
1761 * then division would produce an (N-1)-bit result; shift A left
1762 * by one to produce the an N-bit result, and decrement the
1763 * exponent to match.
1765 * The udiv_qrnnd algorithm that we're using requires normalization,
1766 * i.e. the msb of the denominator must be set, which is already true.
1768 if (a
.frac
< b
.frac
) {
1770 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1772 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1774 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1776 /* Set lsb if there is a remainder, to set inexact. */
1777 a
.frac
= q
| (r
!= 0);
1782 /* handle all the NaN cases */
1783 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1784 return pick_nan(a
, b
, s
);
1786 /* 0/0 or Inf/Inf */
1789 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1790 float_raise(float_flag_invalid
, s
);
1791 parts_default_nan(&a
, s
);
1794 /* Inf / x or 0 / x */
1795 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1800 if (b
.cls
== float_class_zero
) {
1801 float_raise(float_flag_divbyzero
, s
);
1802 a
.cls
= float_class_inf
;
1807 if (b
.cls
== float_class_inf
) {
1808 a
.cls
= float_class_zero
;
1812 g_assert_not_reached();
1815 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1817 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1818 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1819 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1821 return float16_round_pack_canonical(pr
, status
);
1824 static float32 QEMU_SOFTFLOAT_ATTR
1825 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1827 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1828 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1829 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1831 return float32_round_pack_canonical(pr
, status
);
1834 static float64 QEMU_SOFTFLOAT_ATTR
1835 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1837 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1838 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1839 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1841 return float64_round_pack_canonical(pr
, status
);
1844 static float hard_f32_div(float a
, float b
)
1849 static double hard_f64_div(double a
, double b
)
1854 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1856 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1857 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1858 fpclassify(b
.h
) == FP_NORMAL
;
1860 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1863 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1865 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1866 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1867 fpclassify(b
.h
) == FP_NORMAL
;
1869 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1872 static bool f32_div_post(union_float32 a
, union_float32 b
)
1874 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1875 return fpclassify(a
.h
) != FP_ZERO
;
1877 return !float32_is_zero(a
.s
);
1880 static bool f64_div_post(union_float64 a
, union_float64 b
)
1882 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1883 return fpclassify(a
.h
) != FP_ZERO
;
1885 return !float64_is_zero(a
.s
);
1888 float32 QEMU_FLATTEN
1889 float32_div(float32 a
, float32 b
, float_status
*s
)
1891 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1892 f32_div_pre
, f32_div_post
);
1895 float64 QEMU_FLATTEN
1896 float64_div(float64 a
, float64 b
, float_status
*s
)
1898 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1899 f64_div_pre
, f64_div_post
);
1903 * Returns the result of dividing the bfloat16
1904 * value `a' by the corresponding value `b'.
1907 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1909 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1910 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1911 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1913 return bfloat16_round_pack_canonical(pr
, status
);
1917 * Float to Float conversions
1919 * Returns the result of converting one float format to another. The
1920 * conversion is performed according to the IEC/IEEE Standard for
1921 * Binary Floating-Point Arithmetic.
1923 * The float_to_float helper only needs to take care of raising
1924 * invalid exceptions and handling the conversion on NaNs.
1927 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1930 if (dstf
->arm_althp
) {
1932 case float_class_qnan
:
1933 case float_class_snan
:
1934 /* There is no NaN in the destination format. Raise Invalid
1935 * and return a zero with the sign of the input NaN.
1937 float_raise(float_flag_invalid
, s
);
1938 a
.cls
= float_class_zero
;
1943 case float_class_inf
:
1944 /* There is no Inf in the destination format. Raise Invalid
1945 * and return the maximum normal with the correct sign.
1947 float_raise(float_flag_invalid
, s
);
1948 a
.cls
= float_class_normal
;
1949 a
.exp
= dstf
->exp_max
;
1950 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1956 } else if (is_nan(a
.cls
)) {
1957 return return_nan(a
, s
);
1962 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1964 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1965 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1966 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
1967 return float32_round_pack_canonical(pr
, s
);
1970 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1972 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1973 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1974 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1975 return float64_round_pack_canonical(pr
, s
);
1978 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1980 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1981 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1982 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1983 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1986 static float64 QEMU_SOFTFLOAT_ATTR
1987 soft_float32_to_float64(float32 a
, float_status
*s
)
1989 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1990 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1991 return float64_round_pack_canonical(pr
, s
);
1994 float64
float32_to_float64(float32 a
, float_status
*s
)
1996 if (likely(float32_is_normal(a
))) {
1997 /* Widening conversion can never produce inexact results. */
2003 } else if (float32_is_zero(a
)) {
2004 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2006 return soft_float32_to_float64(a
, s
);
2010 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2012 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2013 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2014 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
2015 return float16a_round_pack_canonical(pr
, s
, fmt16
);
2018 float32
float64_to_float32(float64 a
, float_status
*s
)
2020 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2021 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2022 return float32_round_pack_canonical(pr
, s
);
2025 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2027 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2028 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2029 return float32_round_pack_canonical(pr
, s
);
2032 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2034 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2035 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
2036 return float64_round_pack_canonical(pr
, s
);
2039 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2041 FloatParts64 p
= float32_unpack_canonical(a
, s
);
2042 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2043 return bfloat16_round_pack_canonical(pr
, s
);
2046 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2048 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2049 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2050 return bfloat16_round_pack_canonical(pr
, s
);
2054 * Rounds the floating-point value `a' to an integer, and returns the
2055 * result as a floating-point value. The operation is performed
2056 * according to the IEC/IEEE Standard for Binary Floating-Point
2060 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2061 int scale
, float_status
*s
)
2064 case float_class_qnan
:
2065 case float_class_snan
:
2066 return return_nan(a
, s
);
2068 case float_class_zero
:
2069 case float_class_inf
:
2070 /* already "integral" */
2073 case float_class_normal
:
2074 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2077 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2078 /* already integral */
2083 /* all fractional */
2084 float_raise(float_flag_inexact
, s
);
2086 case float_round_nearest_even
:
2087 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2089 case float_round_ties_away
:
2090 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2092 case float_round_to_zero
:
2095 case float_round_up
:
2098 case float_round_down
:
2101 case float_round_to_odd
:
2105 g_assert_not_reached();
2109 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2112 a
.cls
= float_class_zero
;
2115 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2116 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2117 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2118 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2122 case float_round_nearest_even
:
2123 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2125 case float_round_ties_away
:
2128 case float_round_to_zero
:
2131 case float_round_up
:
2132 inc
= a
.sign
? 0 : rnd_mask
;
2134 case float_round_down
:
2135 inc
= a
.sign
? rnd_mask
: 0;
2137 case float_round_to_odd
:
2138 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2141 g_assert_not_reached();
2144 if (a
.frac
& rnd_mask
) {
2145 float_raise(float_flag_inexact
, s
);
2146 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2148 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2151 a
.frac
&= ~rnd_mask
;
2156 g_assert_not_reached();
2161 float16
float16_round_to_int(float16 a
, float_status
*s
)
2163 FloatParts64 pa
= float16_unpack_canonical(a
, s
);
2164 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2165 return float16_round_pack_canonical(pr
, s
);
2168 float32
float32_round_to_int(float32 a
, float_status
*s
)
2170 FloatParts64 pa
= float32_unpack_canonical(a
, s
);
2171 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2172 return float32_round_pack_canonical(pr
, s
);
2175 float64
float64_round_to_int(float64 a
, float_status
*s
)
2177 FloatParts64 pa
= float64_unpack_canonical(a
, s
);
2178 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2179 return float64_round_pack_canonical(pr
, s
);
2183 * Rounds the bfloat16 value `a' to an integer, and returns the
2184 * result as a bfloat16 value.
2187 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2189 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
2190 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2191 return bfloat16_round_pack_canonical(pr
, s
);
2195 * Returns the result of converting the floating-point value `a' to
2196 * the two's complement integer format. The conversion is performed
2197 * according to the IEC/IEEE Standard for Binary Floating-Point
2198 * Arithmetic---which means in particular that the conversion is
2199 * rounded according to the current rounding mode. If `a' is a NaN,
2200 * the largest positive integer is returned. Otherwise, if the
2201 * conversion overflows, the largest integer with the same sign as `a'
2205 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2206 int scale
, int64_t min
, int64_t max
,
2210 int orig_flags
= get_float_exception_flags(s
);
2211 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2214 case float_class_snan
:
2215 case float_class_qnan
:
2216 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2218 case float_class_inf
:
2219 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2220 return p
.sign
? min
: max
;
2221 case float_class_zero
:
2223 case float_class_normal
:
2224 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2225 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2230 if (r
<= -(uint64_t) min
) {
2233 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2240 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2245 g_assert_not_reached();
2249 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2252 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2253 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2256 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2259 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2260 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2263 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2266 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2267 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2270 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2273 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2274 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2277 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2280 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2281 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2284 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2287 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2288 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2291 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2294 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2295 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2298 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2301 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2302 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2305 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2308 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2309 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2312 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2315 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2316 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2319 int8_t float16_to_int8(float16 a
, float_status
*s
)
2321 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2324 int16_t float16_to_int16(float16 a
, float_status
*s
)
2326 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2329 int32_t float16_to_int32(float16 a
, float_status
*s
)
2331 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2334 int64_t float16_to_int64(float16 a
, float_status
*s
)
2336 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2339 int16_t float32_to_int16(float32 a
, float_status
*s
)
2341 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2344 int32_t float32_to_int32(float32 a
, float_status
*s
)
2346 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2349 int64_t float32_to_int64(float32 a
, float_status
*s
)
2351 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2354 int16_t float64_to_int16(float64 a
, float_status
*s
)
2356 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2359 int32_t float64_to_int32(float64 a
, float_status
*s
)
2361 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2364 int64_t float64_to_int64(float64 a
, float_status
*s
)
2366 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2369 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2371 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2374 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2376 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2379 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2381 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2384 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2386 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2389 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2391 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2394 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2396 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2399 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2401 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2404 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2406 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2409 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2411 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */
2419 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2422 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2423 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2426 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2429 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2430 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2433 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2436 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2437 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2440 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2442 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2445 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2447 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2450 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2452 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2455 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2457 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2460 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2462 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2465 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2467 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
 * negative values that do not round to zero will raise the inexact
 * exception flag.
 */
2483 static uint64_t round_to_uint_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2484 int scale
, uint64_t max
,
2487 int orig_flags
= get_float_exception_flags(s
);
2488 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2492 case float_class_snan
:
2493 case float_class_qnan
:
2494 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2496 case float_class_inf
:
2497 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2498 return p
.sign
? 0 : max
;
2499 case float_class_zero
:
2501 case float_class_normal
:
2503 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2507 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2508 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2510 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2514 /* For uint64 this will never trip, but if p.exp is too large
2515 * to shift a decomposed fraction we shall have exited via the
2519 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2524 g_assert_not_reached();
2528 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2531 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2532 rmode
, scale
, UINT8_MAX
, s
);
2535 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2538 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2539 rmode
, scale
, UINT16_MAX
, s
);
2542 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2545 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2546 rmode
, scale
, UINT32_MAX
, s
);
2549 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2552 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2553 rmode
, scale
, UINT64_MAX
, s
);
2556 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2559 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2560 rmode
, scale
, UINT16_MAX
, s
);
2563 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2566 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2567 rmode
, scale
, UINT32_MAX
, s
);
2570 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2573 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2574 rmode
, scale
, UINT64_MAX
, s
);
2577 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2580 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2581 rmode
, scale
, UINT16_MAX
, s
);
2584 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2587 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2588 rmode
, scale
, UINT32_MAX
, s
);
2591 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2594 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2595 rmode
, scale
, UINT64_MAX
, s
);
2598 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2600 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2603 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2605 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2608 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2610 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2613 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2615 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2618 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2620 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2623 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2625 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2628 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2630 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2633 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2635 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2638 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2640 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2643 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2645 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2648 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2650 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2653 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2655 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2658 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2660 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2663 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2665 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2668 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2670 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2673 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2675 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2678 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2680 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2683 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2685 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2688 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2690 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2694 * Returns the result of converting the bfloat16 value `a' to
2695 * the unsigned integer format.
2698 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2699 int scale
, float_status
*s
)
2701 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2702 rmode
, scale
, UINT16_MAX
, s
);
2705 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2706 int scale
, float_status
*s
)
2708 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2709 rmode
, scale
, UINT32_MAX
, s
);
2712 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2713 int scale
, float_status
*s
)
2715 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2716 rmode
, scale
, UINT64_MAX
, s
);
2719 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2721 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2724 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2726 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2729 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2731 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2734 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2736 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2739 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2741 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2744 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2746 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
2757 static FloatParts64
int_to_float(int64_t a
, int scale
, float_status
*status
)
2759 FloatParts64 r
= { .sign
= false };
2762 r
.cls
= float_class_zero
;
2767 r
.cls
= float_class_normal
;
2773 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2775 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2776 r
.frac
= f
<< shift
;
2782 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2784 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2785 return float16_round_pack_canonical(pa
, status
);
2788 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2790 return int64_to_float16_scalbn(a
, scale
, status
);
2793 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2795 return int64_to_float16_scalbn(a
, scale
, status
);
2798 float16
int64_to_float16(int64_t a
, float_status
*status
)
2800 return int64_to_float16_scalbn(a
, 0, status
);
2803 float16
int32_to_float16(int32_t a
, float_status
*status
)
2805 return int64_to_float16_scalbn(a
, 0, status
);
2808 float16
int16_to_float16(int16_t a
, float_status
*status
)
2810 return int64_to_float16_scalbn(a
, 0, status
);
2813 float16
int8_to_float16(int8_t a
, float_status
*status
)
2815 return int64_to_float16_scalbn(a
, 0, status
);
2818 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2820 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2821 return float32_round_pack_canonical(pa
, status
);
2824 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2826 return int64_to_float32_scalbn(a
, scale
, status
);
2829 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2831 return int64_to_float32_scalbn(a
, scale
, status
);
2834 float32
int64_to_float32(int64_t a
, float_status
*status
)
2836 return int64_to_float32_scalbn(a
, 0, status
);
2839 float32
int32_to_float32(int32_t a
, float_status
*status
)
2841 return int64_to_float32_scalbn(a
, 0, status
);
2844 float32
int16_to_float32(int16_t a
, float_status
*status
)
2846 return int64_to_float32_scalbn(a
, 0, status
);
2849 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2851 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2852 return float64_round_pack_canonical(pa
, status
);
2855 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2857 return int64_to_float64_scalbn(a
, scale
, status
);
2860 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2862 return int64_to_float64_scalbn(a
, scale
, status
);
2865 float64
int64_to_float64(int64_t a
, float_status
*status
)
2867 return int64_to_float64_scalbn(a
, 0, status
);
2870 float64
int32_to_float64(int32_t a
, float_status
*status
)
2872 return int64_to_float64_scalbn(a
, 0, status
);
2875 float64
int16_to_float64(int16_t a
, float_status
*status
)
2877 return int64_to_float64_scalbn(a
, 0, status
);
2881 * Returns the result of converting the two's complement integer `a'
2882 * to the bfloat16 format.
2885 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
2887 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2888 return bfloat16_round_pack_canonical(pa
, status
);
2891 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
2893 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2896 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
2898 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2901 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
2903 return int64_to_bfloat16_scalbn(a
, 0, status
);
2906 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
2908 return int64_to_bfloat16_scalbn(a
, 0, status
);
2911 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
2913 return int64_to_bfloat16_scalbn(a
, 0, status
);
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
2924 static FloatParts64
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2926 FloatParts64 r
= { .sign
= false };
2930 r
.cls
= float_class_zero
;
2932 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2934 r
.cls
= float_class_normal
;
2935 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2936 r
.frac
= a
<< shift
;
2942 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2944 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
2945 return float16_round_pack_canonical(pa
, status
);
2948 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2950 return uint64_to_float16_scalbn(a
, scale
, status
);
2953 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2955 return uint64_to_float16_scalbn(a
, scale
, status
);
2958 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2960 return uint64_to_float16_scalbn(a
, 0, status
);
2963 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2965 return uint64_to_float16_scalbn(a
, 0, status
);
2968 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2970 return uint64_to_float16_scalbn(a
, 0, status
);
2973 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
2975 return uint64_to_float16_scalbn(a
, 0, status
);
2978 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2980 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
2981 return float32_round_pack_canonical(pa
, status
);
2984 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2986 return uint64_to_float32_scalbn(a
, scale
, status
);
2989 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2991 return uint64_to_float32_scalbn(a
, scale
, status
);
2994 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2996 return uint64_to_float32_scalbn(a
, 0, status
);
2999 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3001 return uint64_to_float32_scalbn(a
, 0, status
);
3004 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3006 return uint64_to_float32_scalbn(a
, 0, status
);
3009 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3011 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3012 return float64_round_pack_canonical(pa
, status
);
3015 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3017 return uint64_to_float64_scalbn(a
, scale
, status
);
3020 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3022 return uint64_to_float64_scalbn(a
, scale
, status
);
3025 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3027 return uint64_to_float64_scalbn(a
, 0, status
);
3030 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3032 return uint64_to_float64_scalbn(a
, 0, status
);
3035 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3037 return uint64_to_float64_scalbn(a
, 0, status
);
3041 * Returns the result of converting the unsigned integer `a' to the
3045 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3047 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3048 return bfloat16_round_pack_canonical(pa
, status
);
3051 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3053 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3056 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3058 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3061 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3063 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3066 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3068 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3071 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3073 return uint64_to_bfloat16_scalbn(a
, 0, status
);
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
3092 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3093 bool ieee
, bool ismag
, float_status
*s
)
3095 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3097 /* Takes two floating-point values `a' and `b', one of
3098 * which is a NaN, and returns the appropriate NaN
3099 * result. If either `a' or `b' is a signaling NaN,
3100 * the invalid exception is raised.
3102 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3103 return pick_nan(a
, b
, s
);
3104 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3106 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3110 return pick_nan(a
, b
, s
);
3115 case float_class_normal
:
3118 case float_class_inf
:
3121 case float_class_zero
:
3125 g_assert_not_reached();
3129 case float_class_normal
:
3132 case float_class_inf
:
3135 case float_class_zero
:
3139 g_assert_not_reached();
3143 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3144 bool a_less
= a_exp
< b_exp
;
3145 if (a_exp
== b_exp
) {
3146 a_less
= a
.frac
< b
.frac
;
3148 return a_less
^ ismin
? b
: a
;
3151 if (a
.sign
== b
.sign
) {
3152 bool a_less
= a_exp
< b_exp
;
3153 if (a_exp
== b_exp
) {
3154 a_less
= a
.frac
< b
.frac
;
3156 return a
.sign
^ a_less
^ ismin
? b
: a
;
3158 return a
.sign
^ ismin
? b
: a
;
3163 #define MINMAX(sz, name, ismin, isiee, ismag) \
3164 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3167 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3168 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3169 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3171 return float ## sz ## _round_pack_canonical(pr, s); \
3174 MINMAX(16, min
, true, false, false)
3175 MINMAX(16, minnum
, true, true, false)
3176 MINMAX(16, minnummag
, true, true, true)
3177 MINMAX(16, max
, false, false, false)
3178 MINMAX(16, maxnum
, false, true, false)
3179 MINMAX(16, maxnummag
, false, true, true)
3181 MINMAX(32, min
, true, false, false)
3182 MINMAX(32, minnum
, true, true, false)
3183 MINMAX(32, minnummag
, true, true, true)
3184 MINMAX(32, max
, false, false, false)
3185 MINMAX(32, maxnum
, false, true, false)
3186 MINMAX(32, maxnummag
, false, true, true)
3188 MINMAX(64, min
, true, false, false)
3189 MINMAX(64, minnum
, true, true, false)
3190 MINMAX(64, minnummag
, true, true, true)
3191 MINMAX(64, max
, false, false, false)
3192 MINMAX(64, maxnum
, false, true, false)
3193 MINMAX(64, maxnummag
, false, true, true)
3197 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3198 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3200 FloatParts64 pa = bfloat16_unpack_canonical(a, s); \
3201 FloatParts64 pb = bfloat16_unpack_canonical(b, s); \
3202 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3204 return bfloat16_round_pack_canonical(pr, s); \
3207 BF16_MINMAX(min
, true, false, false)
3208 BF16_MINMAX(minnum
, true, true, false)
3209 BF16_MINMAX(minnummag
, true, true, true)
3210 BF16_MINMAX(max
, false, false, false)
3211 BF16_MINMAX(maxnum
, false, true, false)
3212 BF16_MINMAX(maxnummag
, false, true, true)
3216 /* Floating point compare */
3217 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3220 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3222 a
.cls
== float_class_snan
||
3223 b
.cls
== float_class_snan
) {
3224 float_raise(float_flag_invalid
, s
);
3226 return float_relation_unordered
;
3229 if (a
.cls
== float_class_zero
) {
3230 if (b
.cls
== float_class_zero
) {
3231 return float_relation_equal
;
3233 return b
.sign
? float_relation_greater
: float_relation_less
;
3234 } else if (b
.cls
== float_class_zero
) {
3235 return a
.sign
? float_relation_less
: float_relation_greater
;
3238 /* The only really important thing about infinity is its sign. If
3239 * both are infinities the sign marks the smallest of the two.
3241 if (a
.cls
== float_class_inf
) {
3242 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3243 return float_relation_equal
;
3245 return a
.sign
? float_relation_less
: float_relation_greater
;
3246 } else if (b
.cls
== float_class_inf
) {
3247 return b
.sign
? float_relation_greater
: float_relation_less
;
3250 if (a
.sign
!= b
.sign
) {
3251 return a
.sign
? float_relation_less
: float_relation_greater
;
3254 if (a
.exp
== b
.exp
) {
3255 if (a
.frac
== b
.frac
) {
3256 return float_relation_equal
;
3259 return a
.frac
> b
.frac
?
3260 float_relation_less
: float_relation_greater
;
3262 return a
.frac
> b
.frac
?
3263 float_relation_greater
: float_relation_less
;
3267 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3269 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3274 #define COMPARE(name, attr, sz) \
3276 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3278 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3279 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3280 return compare_floats(pa, pb, is_quiet, s); \
3283 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3284 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3285 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3289 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3291 return soft_f16_compare(a
, b
, false, s
);
3294 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3296 return soft_f16_compare(a
, b
, true, s
);
3299 static FloatRelation QEMU_FLATTEN
3300 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3302 union_float32 ua
, ub
;
3307 if (QEMU_NO_HARDFLOAT
) {
3311 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3312 if (isgreaterequal(ua
.h
, ub
.h
)) {
3313 if (isgreater(ua
.h
, ub
.h
)) {
3314 return float_relation_greater
;
3316 return float_relation_equal
;
3318 if (likely(isless(ua
.h
, ub
.h
))) {
3319 return float_relation_less
;
3321 /* The only condition remaining is unordered.
3322 * Fall through to set flags.
3325 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3328 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3330 return f32_compare(a
, b
, false, s
);
3333 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3335 return f32_compare(a
, b
, true, s
);
3338 static FloatRelation QEMU_FLATTEN
3339 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3341 union_float64 ua
, ub
;
3346 if (QEMU_NO_HARDFLOAT
) {
3350 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3351 if (isgreaterequal(ua
.h
, ub
.h
)) {
3352 if (isgreater(ua
.h
, ub
.h
)) {
3353 return float_relation_greater
;
3355 return float_relation_equal
;
3357 if (likely(isless(ua
.h
, ub
.h
))) {
3358 return float_relation_less
;
3360 /* The only condition remaining is unordered.
3361 * Fall through to set flags.
3364 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3367 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3369 return f64_compare(a
, b
, false, s
);
3372 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3374 return f64_compare(a
, b
, true, s
);
3377 static FloatRelation QEMU_FLATTEN
3378 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3380 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
3381 FloatParts64 pb
= bfloat16_unpack_canonical(b
, s
);
3382 return compare_floats(pa
, pb
, is_quiet
, s
);
3385 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3387 return soft_bf16_compare(a
, b
, false, s
);
3390 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3392 return soft_bf16_compare(a
, b
, true, s
);
3395 /* Multiply A by 2 raised to the power N. */
3396 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
3398 if (unlikely(is_nan(a
.cls
))) {
3399 return return_nan(a
, s
);
3401 if (a
.cls
== float_class_normal
) {
3402 /* The largest float type (even though not supported by FloatParts64)
3403 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3404 * still allows rounding to infinity, without allowing overflow
3405 * within the int32_t that backs FloatParts64.exp.
3407 n
= MIN(MAX(n
, -0x10000), 0x10000);
3413 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3415 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
3416 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3417 return float16_round_pack_canonical(pr
, status
);
3420 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3422 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
3423 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3424 return float32_round_pack_canonical(pr
, status
);
3427 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3429 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
3430 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3431 return float64_round_pack_canonical(pr
, status
);
3434 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3436 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
3437 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3438 return bfloat16_round_pack_canonical(pr
, status
);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */
3453 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3455 uint64_t a_frac
, r_frac
, s_frac
;
3458 if (is_nan(a
.cls
)) {
3459 return return_nan(a
, s
);
3461 if (a
.cls
== float_class_zero
) {
3462 return a
; /* sqrt(+-0) = +-0 */
3465 float_raise(float_flag_invalid
, s
);
3466 parts_default_nan(&a
, s
);
3469 if (a
.cls
== float_class_inf
) {
3470 return a
; /* sqrt(+inf) = +inf */
3473 assert(a
.cls
== float_class_normal
);
3475 /* We need two overflow bits at the top. Adding room for that is a
3476 * right shift. If the exponent is odd, we can discard the low bit
3477 * by multiplying the fraction by 2; that's a left shift. Combine
3478 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3480 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3483 /* Bit-by-bit computation of sqrt. */
3487 /* Iterate from implicit bit down to the 3 extra bits to compute a
3488 * properly rounded result. Remember we've inserted two more bits
3489 * at the top, so these positions are two less.
3491 bit
= DECOMPOSED_BINARY_POINT
- 2;
3492 last_bit
= MAX(p
->frac_shift
- 4, 0);
3494 uint64_t q
= 1ULL << bit
;
3495 uint64_t t_frac
= s_frac
+ q
;
3496 if (t_frac
<= a_frac
) {
3497 s_frac
= t_frac
+ q
;
3502 } while (--bit
>= last_bit
);
3504 /* Undo the right shift done above. If there is any remaining
3505 * fraction, the result is inexact. Set the sticky bit.
3507 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3512 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3514 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
3515 FloatParts64 pr
= sqrt_float(pa
, status
, &float16_params
);
3516 return float16_round_pack_canonical(pr
, status
);
3519 static float32 QEMU_SOFTFLOAT_ATTR
3520 soft_f32_sqrt(float32 a
, float_status
*status
)
3522 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
3523 FloatParts64 pr
= sqrt_float(pa
, status
, &float32_params
);
3524 return float32_round_pack_canonical(pr
, status
);
3527 static float64 QEMU_SOFTFLOAT_ATTR
3528 soft_f64_sqrt(float64 a
, float_status
*status
)
3530 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
3531 FloatParts64 pr
= sqrt_float(pa
, status
, &float64_params
);
3532 return float64_round_pack_canonical(pr
, status
);
3535 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3537 union_float32 ua
, ur
;
3540 if (unlikely(!can_use_fpu(s
))) {
3544 float32_input_flush1(&ua
.s
, s
);
3545 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3546 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3547 fpclassify(ua
.h
) == FP_ZERO
) ||
3551 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3552 float32_is_neg(ua
.s
))) {
3559 return soft_f32_sqrt(ua
.s
, s
);
3562 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3564 union_float64 ua
, ur
;
3567 if (unlikely(!can_use_fpu(s
))) {
3571 float64_input_flush1(&ua
.s
, s
);
3572 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3573 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3574 fpclassify(ua
.h
) == FP_ZERO
) ||
3578 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3579 float64_is_neg(ua
.s
))) {
3586 return soft_f64_sqrt(ua
.s
, s
);
3589 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3591 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
3592 FloatParts64 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3593 return bfloat16_round_pack_canonical(pr
, status
);
3596 /*----------------------------------------------------------------------------
3597 | The pattern for a default generated NaN.
3598 *----------------------------------------------------------------------------*/
float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* Drop the canonical-format fraction bits down to the f16 field. */
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(p);
}
float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* Drop the canonical-format fraction bits down to the f32 field. */
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(p);
}
float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* Drop the canonical-format fraction bits down to the f64 field. */
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(p);
}
float128 float128_default_nan(float_status *status)
{
    FloatParts64 p;
    float128 r;

    parts_default_nan(&p, status);
    /* Extrapolate from the choices made by parts_default_nan to fill
     * in the quad-floating format. If the low bit is set, assume we
     * want to set all non-snan bits.
     */
    r.low = -(p.frac & 1);              /* 0 or all-ones low word */
    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
    r.high |= UINT64_C(0x7FFF000000000000);  /* all-ones exponent */
    r.high |= (uint64_t)p.sign << 63;

    return r;
}
bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* Drop the canonical-format fraction bits down to the bf16 field. */
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(p);
}
3654 /*----------------------------------------------------------------------------
3655 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3656 *----------------------------------------------------------------------------*/
float16 float16_silence_nan(float16 a, float_status *status)
{
    FloatParts64 p;

    float16_unpack_raw(&p, a);
    /* Align frac to the canonical position parts_silence_nan expects. */
    p.frac <<= float16_params.frac_shift;
    p = parts_silence_nan(p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(p);
}
float32 float32_silence_nan(float32 a, float_status *status)
{
    FloatParts64 p;

    float32_unpack_raw(&p, a);
    /* Align frac to the canonical position parts_silence_nan expects. */
    p.frac <<= float32_params.frac_shift;
    p = parts_silence_nan(p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(p);
}
float64 float64_silence_nan(float64 a, float_status *status)
{
    FloatParts64 p;

    float64_unpack_raw(&p, a);
    /* Align frac to the canonical position parts_silence_nan expects. */
    p.frac <<= float64_params.frac_shift;
    p = parts_silence_nan(p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(p);
}
bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
{
    FloatParts64 p;

    bfloat16_unpack_raw(&p, a);
    /* Align frac to the canonical position parts_silence_nan expects. */
    p.frac <<= bfloat16_params.frac_shift;
    p = parts_silence_nan(p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(p);
}
3702 /*----------------------------------------------------------------------------
3703 | If `a' is denormal and we are in flush-to-zero mode then set the
3704 | input-denormal exception and return zero. Otherwise just return the value.
3705 *----------------------------------------------------------------------------*/
3707 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3709 if (p
.exp
== 0 && p
.frac
!= 0) {
3710 float_raise(float_flag_input_denormal
, status
);
3717 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3719 if (status
->flush_inputs_to_zero
) {
3722 float16_unpack_raw(&p
, a
);
3723 if (parts_squash_denormal(p
, status
)) {
3724 return float16_set_sign(float16_zero
, p
.sign
);
3730 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3732 if (status
->flush_inputs_to_zero
) {
3735 float32_unpack_raw(&p
, a
);
3736 if (parts_squash_denormal(p
, status
)) {
3737 return float32_set_sign(float32_zero
, p
.sign
);
3743 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3745 if (status
->flush_inputs_to_zero
) {
3748 float64_unpack_raw(&p
, a
);
3749 if (parts_squash_denormal(p
, status
)) {
3750 return float64_set_sign(float64_zero
, p
.sign
);
3756 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3758 if (status
->flush_inputs_to_zero
) {
3761 bfloat16_unpack_raw(&p
, a
);
3762 if (parts_squash_denormal(p
, status
)) {
3763 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3769 /*----------------------------------------------------------------------------
3770 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3771 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3772 | input. If `zSign' is 1, the input is negated before being converted to an
3773 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3774 | is simply rounded to an integer, with the inexact exception raised if the
3775 | input cannot be represented exactly as an integer. However, if the fixed-
3776 | point input is too large, the invalid exception is raised and the largest
3777 | positive or negative integer is returned.
3778 *----------------------------------------------------------------------------*/
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select the amount added below the binary point before truncating. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Increment only when the truncated result would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly half-way: clear the low bit to round to even. */
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if bits remain above bit 31 or negation changed the sign. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
3828 /*----------------------------------------------------------------------------
3829 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3830 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3831 | and returns the properly rounded 64-bit integer corresponding to the input.
3832 | If `zSign' is 1, the input is negated before being converted to an integer.
3833 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3834 | the inexact exception raised if the input cannot be represented exactly as
3835 | an integer. However, if the fixed-point input is too large, the invalid
3836 | exception is raised and the largest positive or negative integer is
3838 *----------------------------------------------------------------------------*/
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether the fraction word absZ1 rounds absZ0 up by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is >= 1/2. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;  /* carried out of 64 bits */
        if (!(absZ1 << 1) && roundNearestEven) {
            /* Exactly half-way: clear the low bit to round to even. */
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Overflow if negation changed the sign of a non-zero result. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
3890 /*----------------------------------------------------------------------------
3891 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3892 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3893 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3894 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3895 | with the inexact exception raised if the input cannot be represented exactly
3896 | as an integer. However, if the fixed-point input is too large, the invalid
3897 | exception is raised and the largest unsigned integer is returned.
3898 *----------------------------------------------------------------------------*/
/* Note: result is a uint64_t value carried in an int64_t return type;
 * callers reinterpret the bits as unsigned.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* Decide whether the fraction word absZ1 rounds absZ0 up by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is >= 1/2. */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Carried out of 64 bits: saturate to the maximum. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            /* Exactly half-way: clear the low bit to round to even. */
            absZ0 &= ~1;
        }
    }

    if (zSign && absZ0) {
        /* A non-zero negative value cannot be represented unsigned. */
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
3950 /*----------------------------------------------------------------------------
3951 | Normalizes the subnormal single-precision floating-point value represented
3952 | by the denormalized significand `aSig'. The normalized exponent and
3953 | significand are stored at the locations pointed to by `zExpPtr' and
3954 | `zSigPtr', respectively.
3955 *----------------------------------------------------------------------------*/
static void normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Shift the leading set bit up to bit 23 (one above the frac field). */
    int8_t shift = clz32(aSig) - 8;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3968 /*----------------------------------------------------------------------------
3969 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3970 | and significand `zSig', and returns the proper single-precision floating-
3971 | point value corresponding to the abstract input. Ordinarily, the abstract
3972 | value is simply rounded and packed into the single-precision format, with
3973 | the inexact exception raised if the abstract input cannot be represented
3974 | exactly. However, if the abstract value is too large, the overflow and
3975 | inexact exceptions are raised and an infinity or maximal finite value is
3976 | returned. If the abstract value is too small, the input value is rounded to
3977 | a subnormal number, and the underflow and inexact exceptions are raised if
3978 | the abstract input cannot be represented exactly as a subnormal single-
3979 | precision floating-point number.
3980 | The input significand `zSig' has its binary point between bits 30
3981 | and 29, which is 7 bits to the left of the usual location. This shifted
3982 | significand must be normalized or smaller. If `zSig' is not normalized,
3983 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3984 | and it must not require rounding. In the usual case that `zSig' is
3985 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3986 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3987 | Binary Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Amount added below the rounding point (bit 7) before truncating. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Increment only when the kept part would be even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Unsigned compare catches both overflow (zExp large) and
     * underflow (zExp negative, wrapped to a huge unsigned value). */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* round-to-odd and exact-at-boundary cases stay at the
             * largest finite value rather than going to infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly half-way: clear the low bit to round to even. */
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );
}
4067 /*----------------------------------------------------------------------------
4068 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4069 | and significand `zSig', and returns the proper single-precision floating-
4070 | point value corresponding to the abstract input. This routine is just like
4071 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4072 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4073 | floating-point exponent.
4074 *----------------------------------------------------------------------------*/
4077 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4078 float_status
*status
)
4082 shiftCount
= clz32(zSig
) - 1;
4083 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4088 /*----------------------------------------------------------------------------
4089 | Normalizes the subnormal double-precision floating-point value represented
4090 | by the denormalized significand `aSig'. The normalized exponent and
4091 | significand are stored at the locations pointed to by `zExpPtr' and
4092 | `zSigPtr', respectively.
4093 *----------------------------------------------------------------------------*/
static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    /* Shift the leading set bit up to bit 52 (one above the frac field). */
    int8_t shift = clz64(aSig) - 11;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4106 /*----------------------------------------------------------------------------
4107 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4108 | double-precision floating-point value, returning the result. After being
4109 | shifted into the proper positions, the three fields are simply added
4110 | together to form the result. This means that any integer portion of `zSig'
4111 | will be added into the exponent. Since a properly normalized significand
4112 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4113 | than the desired result exponent whenever `zSig' is a complete, normalized
4115 *----------------------------------------------------------------------------*/
4117 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4120 return make_float64(
4121 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4125 /*----------------------------------------------------------------------------
4126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4127 | and significand `zSig', and returns the proper double-precision floating-
4128 | point value corresponding to the abstract input. Ordinarily, the abstract
4129 | value is simply rounded and packed into the double-precision format, with
4130 | the inexact exception raised if the abstract input cannot be represented
4131 | exactly. However, if the abstract value is too large, the overflow and
4132 | inexact exceptions are raised and an infinity or maximal finite value is
4133 | returned. If the abstract value is too small, the input value is rounded to
4134 | a subnormal number, and the underflow and inexact exceptions are raised if
4135 | the abstract input cannot be represented exactly as a subnormal double-
4136 | precision floating-point number.
4137 | The input significand `zSig' has its binary point between bits 62
4138 | and 61, which is 10 bits to the left of the usual location. This shifted
4139 | significand must be normalized or smaller. If `zSig' is not normalized,
4140 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4141 | and it must not require rounding. In the usual case that `zSig' is
4142 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4143 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4144 | Binary Floating-Point Arithmetic.
4145 *----------------------------------------------------------------------------*/
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Amount added below the rounding point (bit 10) before truncating. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Increment only when the kept part would be even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Unsigned compare catches both overflow (zExp large) and
     * underflow (zExp negative, wrapped to a huge unsigned value). */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* round-to-odd and exact-at-boundary cases stay at the
             * largest finite value rather than going to infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        /* Exactly half-way: clear the low bit to round to even. */
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );
}
4223 /*----------------------------------------------------------------------------
4224 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4225 | and significand `zSig', and returns the proper double-precision floating-
4226 | point value corresponding to the abstract input. This routine is just like
4227 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4228 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4229 | floating-point exponent.
4230 *----------------------------------------------------------------------------*/
4233 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4234 float_status
*status
)
4238 shiftCount
= clz64(zSig
) - 1;
4239 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4244 /*----------------------------------------------------------------------------
4245 | Normalizes the subnormal extended double-precision floating-point value
4246 | represented by the denormalized significand `aSig'. The normalized exponent
4247 | and significand are stored at the locations pointed to by `zExpPtr' and
4248 | `zSigPtr', respectively.
4249 *----------------------------------------------------------------------------*/
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* x80 keeps an explicit integer bit, so normalize fully to bit 63. */
    int8_t shift = clz64(aSig);

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4264 | and returns the proper extended double-precision floating-point value
4265 | corresponding to the abstract input. Ordinarily, the abstract value is
4266 | rounded and packed into the extended double-precision format, with the
4267 | inexact exception raised if the abstract input cannot be represented
4268 | exactly. However, if the abstract value is too large, the overflow and
4269 | inexact exceptions are raised and an infinity or maximal finite value is
4270 | returned. If the abstract value is too small, the input value is rounded to
4271 | a subnormal number, and the underflow and inexact exceptions are raised if
4272 | the abstract input cannot be represented exactly as a subnormal extended
4273 | double-precision floating-point number.
4274 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4275 | number of bits as single or double precision, respectively. Otherwise, the
4276 | result is rounded to the full precision of the extended double-precision
4278 | The input significand must be normalized or smaller. If the input
4279 | significand is not normalized, `zExp' must be 0; in that case, the result
4280 | returned is a subnormal number, and it must not require rounding. The
4281 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4282 | Floating-Point Arithmetic.
4283 *----------------------------------------------------------------------------*/
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision modes round within the 64-bit significand at the
     * double (bit 10) or single (bit 39) position. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into the sticky bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            /* Denormalize; shifted-out bits are jammed into the sticky. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may renormalize: a set bit 63 means exponent 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                /* Exactly half-way: also clear the lowest kept bit. */
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Carry out of the significand: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        /* Exactly half-way: also clear the lowest kept bit. */
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the rounding fraction. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed rounding away from infinity yields the largest
             * finite value; otherwise return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-derive the increment from the denormalized fraction. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                if (!(zSig1 << 1) && roundNearestEven) {
                    /* Exactly half-way: round to even. */
                    zSig0 &= ~1;
                }
                /* Rounding may renormalize: set bit 63 means exponent 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                /* Exactly half-way: round to even. */
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );
}
4473 /*----------------------------------------------------------------------------
4474 | Takes an abstract floating-point value having sign `zSign', exponent
4475 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4476 | and returns the proper extended double-precision floating-point value
4477 | corresponding to the abstract input. This routine is just like
4478 | `roundAndPackFloatx80' except that the input significand does not have to be
4480 *----------------------------------------------------------------------------*/
floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    if ( zSig0 == 0 ) {
        /* High word empty: promote the low word and adjust the exponent. */
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);
}
4502 /*----------------------------------------------------------------------------
4503 | Returns the least-significant 64 fraction bits of the quadruple-precision
4504 | floating-point value `a'.
4505 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac1( float128 a )
{
    /* Low 64 fraction bits live in the low word. */
    return a.low;
}
4514 /*----------------------------------------------------------------------------
4515 | Returns the most-significant 48 fraction bits of the quadruple-precision
4516 | floating-point value `a'.
4517 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac0( float128 a )
{
    /* Upper 48 fraction bits occupy the low 48 bits of the high word. */
    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
}
4526 /*----------------------------------------------------------------------------
4527 | Returns the exponent bits of the quadruple-precision floating-point value
4529 *----------------------------------------------------------------------------*/
static inline int32_t extractFloat128Exp( float128 a )
{
    /* 15-bit exponent sits at bits 62..48 of the high word. */
    return ( a.high>>48 ) & 0x7FFF;
}
4538 /*----------------------------------------------------------------------------
4539 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4540 *----------------------------------------------------------------------------*/
static inline bool extractFloat128Sign(float128 a)
{
    /* Sign is bit 63 of the high word. */
    return a.high >> 63;
}
4547 /*----------------------------------------------------------------------------
4548 | Normalizes the subnormal quadruple-precision floating-point value
4549 | represented by the denormalized significand formed by the concatenation of
4550 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4551 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4552 | significand are stored at the location pointed to by `zSig0Ptr', and the
4553 | least significant 64 bits of the normalized significand are stored at the
4554 | location pointed to by `zSig1Ptr'.
4555 *----------------------------------------------------------------------------*/
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Significand fits entirely in the low word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Leading bit already above position 48: split across words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }
}
4588 /*----------------------------------------------------------------------------
4589 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4590 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4591 | floating-point value, returning the result. After being shifted into the
4592 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4593 | added together to form the most significant 32 bits of the result. This
4594 | means that any integer portion of `zSig0' will be added into the exponent.
4595 | Since a properly normalized significand will have an integer portion equal
4596 | to 1, the `zExp' input should be 1 less than the desired result exponent
4597 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4599 *----------------------------------------------------------------------------*/
static inline float128
 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
{
    float128 z;

    z.low = zSig1;
    /* Summed, not OR'ed: an integer portion of zSig0 carries into exp. */
    z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
    return z;
}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits shifted out of the 128-bit significand; its top
       bit is the round bit, the rest are sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Exponent out of range: either overflow (zExp large) or underflow
           (zExp negative, caught here by the unsigned compare). */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Round-toward-zero-like modes yield the maximal finite value
               instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess may be detected before or after rounding, per the
               target's configuration. */
            isTiny = status->tininess_before_rounding
                  || ( zExp < -1 )
                  || ! increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the rounding decision on the denormalized value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if ( zSig2 ) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* A tie (round bit set, no sticky bits) rounds to even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/

static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    if ( zSig0 == 0 ) {
        /* Promote the low word and adjust the exponent accordingly. */
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        /* Significand too wide: shift right, collecting round/sticky bits
           into zSig2 for the rounding step. */
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of converting the 32-bit two's complement integer `a'
4783 | to the extended double-precision floating-point format. The conversion
4784 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4786 *----------------------------------------------------------------------------*/
4788 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4795 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4797 absA
= zSign
? - a
: a
;
4798 shiftCount
= clz32(absA
) + 32;
4800 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the 32-bit two's complement integer `a' to
4806 | the quadruple-precision floating-point format. The conversion is performed
4807 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4810 float128
int32_to_float128(int32_t a
, float_status
*status
)
4817 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4819 absA
= zSign
? - a
: a
;
4820 shiftCount
= clz32(absA
) + 17;
4822 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit two's complement integer `a'
4828 | to the extended double-precision floating-point format. The conversion
4829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4831 *----------------------------------------------------------------------------*/
4833 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4839 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4841 absA
= zSign
? - a
: a
;
4842 shiftCount
= clz64(absA
);
4843 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 int64_to_float128(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;
    int32_t zExp;
    uint64_t zSig0, zSig1;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    /* +49 positions the leading 1 at bit 48 of the 128-bit significand. */
    shiftCount = clz64(absA) + 49;
    zExp = 0x406E - shiftCount;
    if ( 64 <= shiftCount ) {
        /* Magnitude fits entirely in the high word after shifting. */
        zSig1 = 0;
        zSig0 = absA;
        shiftCount -= 64;
    }
    else {
        zSig1 = absA;
        zSig0 = 0;
    }
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the 64-bit unsigned integer `a'
4882 | to the quadruple-precision floating-point format. The conversion is performed
4883 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4886 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4889 return float128_zero
;
4891 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if ( aSig ) {
            /* Input is a NaN: convert via the common NaN representation and
               silence it (floatx80 results carry quiet NaNs). */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit; floatx80 stores it in the format. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if ( aSig ) {
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* Compensate for the hidden bit the normalization made explicit. */
        --aExp;
    }
    /* Widen the 23-bit fraction into the 48+64-bit quad fraction field. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* rem(NaN, x), rem(x, NaN) propagate; rem(inf, x) is invalid. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if ( bSig ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* rem(x, inf) == x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 64/32 division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively with a 128/64
           quotient estimate, retiring up to 62 quotient bits per pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* The estimate may be up to 2 too large; bias it low so the
               partial remainder stays non-negative. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient, then pick whichever of the last two
       partial remainders is nearer (ties go to the even quotient). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);

}
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|      x          x     2     3     4     5           n
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, as float64 bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if ( aSig ) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^(-inf) = 0, 2^(+inf) = +inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) = 1 exactly. */
        if (aSig == 0) return float32_one;
    }

    /* All remaining cases are inexact. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* Evaluate e^(x*ln2) via the Taylor series 1 + sum x^n/n!. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);

}
/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if ( aSig ) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    /* Integer part of the log comes from the unbiased exponent; the loop
       below extracts one fraction bit per squaring. */
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) {
            /* NaN: route through the common NaN form and silence it. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the hidden bit explicit and left-justify into 64 bits. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* Compensate for the hidden bit the normalization made explicit. */
        --aExp;
    }
    /* Spread the 52-bit fraction across the 48+64-bit quad fraction. */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* NaN operands propagate; rem(inf, x) is invalid. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(x, inf) == x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make hidden bits explicit and left-justify both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Estimate may exceed the true quotient by up to 2; bias it low so
           the partial remainder stays non-negative. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the true quotient, then keep whichever of the last two
       partial remainders is nearer (ties favor the even quotient). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if ( aSig ) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    /* Integer part from the unbiased exponent; the loop extracts one
       fraction bit per squaring of the significand. */
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* Treat NaN as positive so it converts to the maximum integer. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    /* Cap the shift: roundAndPackInt32 handles the overflow detection. */
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN converts as positive). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to zero; inexact unless a is exactly zero. */
        if ( aExp || aSig ) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if ( aSign ) z = - z;
    /* Sign flip after negation means the value overflowed 32 bits. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN,
| the largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* Magnitude >= 2^64 (or NaN/inf): overflow. */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude >= 2^63.  Only -2^63 itself (high == 0xC03E with the
           bare integer bit) converts exactly; everything else is invalid. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to zero; inexact unless a is exactly zero. */
        if ( aExp | aSig ) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: route through the common NaN form and silence it. */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand to 31 bits, jamming sticky bits. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: route through the common NaN form and silence it. */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Narrow the 64-bit significand to 63 bits, jamming the sticky bit. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}
/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* NaN: route through the common NaN form and silence it. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Drop the explicit integer bit (aSig<<1) and spread the fraction into
       the quad format's 48+64-bit fraction field. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}
5677 /*----------------------------------------------------------------------------
5678 | Rounds the extended double-precision floating-point value `a'
5679 | to the precision provided by floatx80_rounding_precision and returns the
5680 | result as an extended double-precision floating-point value.
5681 | The operation is performed according to the IEC/IEEE Standard for Binary
5682 | Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5685 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5687 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5688 extractFloatx80Sign(a
),
5689 extractFloatx80Exp(a
),
5690 extractFloatx80Frac(a
), 0, status
);
5693 /*----------------------------------------------------------------------------
5694 | Rounds the extended double-precision floating-point value `a' to an integer,
5695 | and returns the result as an extended quadruple-precision floating-point
5696 | value. The operation is performed according to the IEC/IEEE Standard for
5697 | Binary Floating-Point Arithmetic.
5698 *----------------------------------------------------------------------------*/
5700 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5704 uint64_t lastBitMask
, roundBitsMask
;
5707 if (floatx80_invalid_encoding(a
)) {
5708 float_raise(float_flag_invalid
, status
);
5709 return floatx80_default_nan(status
);
5711 aExp
= extractFloatx80Exp( a
);
5712 if ( 0x403E <= aExp
) {
5713 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5714 return propagateFloatx80NaN(a
, a
, status
);
5718 if ( aExp
< 0x3FFF ) {
5720 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5723 float_raise(float_flag_inexact
, status
);
5724 aSign
= extractFloatx80Sign( a
);
5725 switch (status
->float_rounding_mode
) {
5726 case float_round_nearest_even
:
5727 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5730 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5733 case float_round_ties_away
:
5734 if (aExp
== 0x3FFE) {
5735 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5738 case float_round_down
:
5741 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5742 : packFloatx80( 0, 0, 0 );
5743 case float_round_up
:
5745 aSign
? packFloatx80( 1, 0, 0 )
5746 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5748 case float_round_to_zero
:
5751 g_assert_not_reached();
5753 return packFloatx80( aSign
, 0, 0 );
5756 lastBitMask
<<= 0x403E - aExp
;
5757 roundBitsMask
= lastBitMask
- 1;
5759 switch (status
->float_rounding_mode
) {
5760 case float_round_nearest_even
:
5761 z
.low
+= lastBitMask
>>1;
5762 if ((z
.low
& roundBitsMask
) == 0) {
5763 z
.low
&= ~lastBitMask
;
5766 case float_round_ties_away
:
5767 z
.low
+= lastBitMask
>> 1;
5769 case float_round_to_zero
:
5771 case float_round_up
:
5772 if (!extractFloatx80Sign(z
)) {
5773 z
.low
+= roundBitsMask
;
5776 case float_round_down
:
5777 if (extractFloatx80Sign(z
)) {
5778 z
.low
+= roundBitsMask
;
5784 z
.low
&= ~ roundBitsMask
;
5787 z
.low
= UINT64_C(0x8000000000000000);
5789 if (z
.low
!= a
.low
) {
5790 float_raise(float_flag_inexact
, status
);
5796 /*----------------------------------------------------------------------------
5797 | Returns the result of adding the absolute values of the extended double-
5798 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5799 | negated before being returned. `zSign' is ignored if the result is a NaN.
5800 | The addition is performed according to the IEC/IEEE Standard for Binary
5801 | Floating-Point Arithmetic.
5802 *----------------------------------------------------------------------------*/
5804 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5805 float_status
*status
)
5807 int32_t aExp
, bExp
, zExp
;
5808 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5811 aSig
= extractFloatx80Frac( a
);
5812 aExp
= extractFloatx80Exp( a
);
5813 bSig
= extractFloatx80Frac( b
);
5814 bExp
= extractFloatx80Exp( b
);
5815 expDiff
= aExp
- bExp
;
5816 if ( 0 < expDiff
) {
5817 if ( aExp
== 0x7FFF ) {
5818 if ((uint64_t)(aSig
<< 1)) {
5819 return propagateFloatx80NaN(a
, b
, status
);
5823 if ( bExp
== 0 ) --expDiff
;
5824 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5827 else if ( expDiff
< 0 ) {
5828 if ( bExp
== 0x7FFF ) {
5829 if ((uint64_t)(bSig
<< 1)) {
5830 return propagateFloatx80NaN(a
, b
, status
);
5832 return packFloatx80(zSign
,
5833 floatx80_infinity_high
,
5834 floatx80_infinity_low
);
5836 if ( aExp
== 0 ) ++expDiff
;
5837 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5841 if ( aExp
== 0x7FFF ) {
5842 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5843 return propagateFloatx80NaN(a
, b
, status
);
5848 zSig0
= aSig
+ bSig
;
5850 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5851 /* At least one of the values is a pseudo-denormal,
5852 * and there is a carry out of the result. */
5857 return packFloatx80(zSign
, 0, 0);
5859 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5865 zSig0
= aSig
+ bSig
;
5866 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5868 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5869 zSig0
|= UINT64_C(0x8000000000000000);
5872 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5873 zSign
, zExp
, zSig0
, zSig1
, status
);
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of subtracting the absolute values of the extended
5878 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5879 | difference is negated before being returned. `zSign' is ignored if the
5880 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5881 | Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5884 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5885 float_status
*status
)
5887 int32_t aExp
, bExp
, zExp
;
5888 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5891 aSig
= extractFloatx80Frac( a
);
5892 aExp
= extractFloatx80Exp( a
);
5893 bSig
= extractFloatx80Frac( b
);
5894 bExp
= extractFloatx80Exp( b
);
5895 expDiff
= aExp
- bExp
;
5896 if ( 0 < expDiff
) goto aExpBigger
;
5897 if ( expDiff
< 0 ) goto bExpBigger
;
5898 if ( aExp
== 0x7FFF ) {
5899 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5900 return propagateFloatx80NaN(a
, b
, status
);
5902 float_raise(float_flag_invalid
, status
);
5903 return floatx80_default_nan(status
);
5910 if ( bSig
< aSig
) goto aBigger
;
5911 if ( aSig
< bSig
) goto bBigger
;
5912 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5914 if ( bExp
== 0x7FFF ) {
5915 if ((uint64_t)(bSig
<< 1)) {
5916 return propagateFloatx80NaN(a
, b
, status
);
5918 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5919 floatx80_infinity_low
);
5921 if ( aExp
== 0 ) ++expDiff
;
5922 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5924 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5927 goto normalizeRoundAndPack
;
5929 if ( aExp
== 0x7FFF ) {
5930 if ((uint64_t)(aSig
<< 1)) {
5931 return propagateFloatx80NaN(a
, b
, status
);
5935 if ( bExp
== 0 ) --expDiff
;
5936 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5938 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5940 normalizeRoundAndPack
:
5941 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5942 zSign
, zExp
, zSig0
, zSig1
, status
);
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of adding the extended double-precision floating-point
5947 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5951 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5955 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5956 float_raise(float_flag_invalid
, status
);
5957 return floatx80_default_nan(status
);
5959 aSign
= extractFloatx80Sign( a
);
5960 bSign
= extractFloatx80Sign( b
);
5961 if ( aSign
== bSign
) {
5962 return addFloatx80Sigs(a
, b
, aSign
, status
);
5965 return subFloatx80Sigs(a
, b
, aSign
, status
);
5970 /*----------------------------------------------------------------------------
5971 | Returns the result of subtracting the extended double-precision floating-
5972 | point values `a' and `b'. The operation is performed according to the
5973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5974 *----------------------------------------------------------------------------*/
5976 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5980 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5981 float_raise(float_flag_invalid
, status
);
5982 return floatx80_default_nan(status
);
5984 aSign
= extractFloatx80Sign( a
);
5985 bSign
= extractFloatx80Sign( b
);
5986 if ( aSign
== bSign
) {
5987 return subFloatx80Sigs(a
, b
, aSign
, status
);
5990 return addFloatx80Sigs(a
, b
, aSign
, status
);
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of multiplying the extended double-precision floating-
5997 | point values `a' and `b'. The operation is performed according to the
5998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5999 *----------------------------------------------------------------------------*/
6001 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6003 bool aSign
, bSign
, zSign
;
6004 int32_t aExp
, bExp
, zExp
;
6005 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6007 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6008 float_raise(float_flag_invalid
, status
);
6009 return floatx80_default_nan(status
);
6011 aSig
= extractFloatx80Frac( a
);
6012 aExp
= extractFloatx80Exp( a
);
6013 aSign
= extractFloatx80Sign( a
);
6014 bSig
= extractFloatx80Frac( b
);
6015 bExp
= extractFloatx80Exp( b
);
6016 bSign
= extractFloatx80Sign( b
);
6017 zSign
= aSign
^ bSign
;
6018 if ( aExp
== 0x7FFF ) {
6019 if ( (uint64_t) ( aSig
<<1 )
6020 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6021 return propagateFloatx80NaN(a
, b
, status
);
6023 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6024 return packFloatx80(zSign
, floatx80_infinity_high
,
6025 floatx80_infinity_low
);
6027 if ( bExp
== 0x7FFF ) {
6028 if ((uint64_t)(bSig
<< 1)) {
6029 return propagateFloatx80NaN(a
, b
, status
);
6031 if ( ( aExp
| aSig
) == 0 ) {
6033 float_raise(float_flag_invalid
, status
);
6034 return floatx80_default_nan(status
);
6036 return packFloatx80(zSign
, floatx80_infinity_high
,
6037 floatx80_infinity_low
);
6040 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6041 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6044 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6045 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6047 zExp
= aExp
+ bExp
- 0x3FFE;
6048 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6049 if ( 0 < (int64_t) zSig0
) {
6050 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6053 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6054 zSign
, zExp
, zSig0
, zSig1
, status
);
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of dividing the extended double-precision floating-point
6059 | value `a' by the corresponding value `b'. The operation is performed
6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6061 *----------------------------------------------------------------------------*/
6063 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6065 bool aSign
, bSign
, zSign
;
6066 int32_t aExp
, bExp
, zExp
;
6067 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6068 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6070 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6071 float_raise(float_flag_invalid
, status
);
6072 return floatx80_default_nan(status
);
6074 aSig
= extractFloatx80Frac( a
);
6075 aExp
= extractFloatx80Exp( a
);
6076 aSign
= extractFloatx80Sign( a
);
6077 bSig
= extractFloatx80Frac( b
);
6078 bExp
= extractFloatx80Exp( b
);
6079 bSign
= extractFloatx80Sign( b
);
6080 zSign
= aSign
^ bSign
;
6081 if ( aExp
== 0x7FFF ) {
6082 if ((uint64_t)(aSig
<< 1)) {
6083 return propagateFloatx80NaN(a
, b
, status
);
6085 if ( bExp
== 0x7FFF ) {
6086 if ((uint64_t)(bSig
<< 1)) {
6087 return propagateFloatx80NaN(a
, b
, status
);
6091 return packFloatx80(zSign
, floatx80_infinity_high
,
6092 floatx80_infinity_low
);
6094 if ( bExp
== 0x7FFF ) {
6095 if ((uint64_t)(bSig
<< 1)) {
6096 return propagateFloatx80NaN(a
, b
, status
);
6098 return packFloatx80( zSign
, 0, 0 );
6102 if ( ( aExp
| aSig
) == 0 ) {
6104 float_raise(float_flag_invalid
, status
);
6105 return floatx80_default_nan(status
);
6107 float_raise(float_flag_divbyzero
, status
);
6108 return packFloatx80(zSign
, floatx80_infinity_high
,
6109 floatx80_infinity_low
);
6111 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6114 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6115 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6117 zExp
= aExp
- bExp
+ 0x3FFE;
6119 if ( bSig
<= aSig
) {
6120 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6123 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6124 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6125 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6126 while ( (int64_t) rem0
< 0 ) {
6128 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6130 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6131 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6132 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6133 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6134 while ( (int64_t) rem1
< 0 ) {
6136 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6138 zSig1
|= ( ( rem1
| rem2
) != 0 );
6140 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6141 zSign
, zExp
, zSig0
, zSig1
, status
);
6144 /*----------------------------------------------------------------------------
6145 | Returns the remainder of the extended double-precision floating-point value
6146 | `a' with respect to the corresponding value `b'. The operation is performed
6147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6148 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6149 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6150 | the absolute value of the integer quotient.
6151 *----------------------------------------------------------------------------*/
6153 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6154 float_status
*status
)
6157 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6158 uint64_t aSig0
, aSig1
, bSig
;
6159 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6162 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6163 float_raise(float_flag_invalid
, status
);
6164 return floatx80_default_nan(status
);
6166 aSig0
= extractFloatx80Frac( a
);
6167 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6168 aSign
= extractFloatx80Sign( a
);
6169 bSig
= extractFloatx80Frac( b
);
6170 bExp
= extractFloatx80Exp( b
);
6171 if ( aExp
== 0x7FFF ) {
6172 if ( (uint64_t) ( aSig0
<<1 )
6173 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6174 return propagateFloatx80NaN(a
, b
, status
);
6178 if ( bExp
== 0x7FFF ) {
6179 if ((uint64_t)(bSig
<< 1)) {
6180 return propagateFloatx80NaN(a
, b
, status
);
6182 if (aExp
== 0 && aSig0
>> 63) {
6184 * Pseudo-denormal argument must be returned in normalized
6187 return packFloatx80(aSign
, 1, aSig0
);
6194 float_raise(float_flag_invalid
, status
);
6195 return floatx80_default_nan(status
);
6197 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6200 if ( aSig0
== 0 ) return a
;
6201 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6204 expDiff
= aExp
- bExp
;
6206 if ( expDiff
< 0 ) {
6207 if ( mod
|| expDiff
< -1 ) {
6208 if (aExp
== 1 && aExpOrig
== 0) {
6210 * Pseudo-denormal argument must be returned in
6213 return packFloatx80(aSign
, aExp
, aSig0
);
6217 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6220 *quotient
= q
= ( bSig
<= aSig0
);
6221 if ( q
) aSig0
-= bSig
;
6223 while ( 0 < expDiff
) {
6224 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6225 q
= ( 2 < q
) ? q
- 2 : 0;
6226 mul64To128( bSig
, q
, &term0
, &term1
);
6227 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6228 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6234 if ( 0 < expDiff
) {
6235 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6236 q
= ( 2 < q
) ? q
- 2 : 0;
6238 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6239 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6240 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6241 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6243 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6246 *quotient
<<= expDiff
;
6257 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6258 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6259 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6262 aSig0
= alternateASig0
;
6263 aSig1
= alternateASig1
;
6269 normalizeRoundAndPackFloatx80(
6270 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6274 /*----------------------------------------------------------------------------
6275 | Returns the remainder of the extended double-precision floating-point value
6276 | `a' with respect to the corresponding value `b'. The operation is performed
6277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6280 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6283 return floatx80_modrem(a
, b
, false, "ient
, status
);
6286 /*----------------------------------------------------------------------------
6287 | Returns the remainder of the extended double-precision floating-point value
6288 | `a' with respect to the corresponding value `b', with the quotient truncated
6290 *----------------------------------------------------------------------------*/
6292 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6295 return floatx80_modrem(a
, b
, true, "ient
, status
);
6298 /*----------------------------------------------------------------------------
6299 | Returns the square root of the extended double-precision floating-point
6300 | value `a'. The operation is performed according to the IEC/IEEE Standard
6301 | for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6304 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6308 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6309 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6311 if (floatx80_invalid_encoding(a
)) {
6312 float_raise(float_flag_invalid
, status
);
6313 return floatx80_default_nan(status
);
6315 aSig0
= extractFloatx80Frac( a
);
6316 aExp
= extractFloatx80Exp( a
);
6317 aSign
= extractFloatx80Sign( a
);
6318 if ( aExp
== 0x7FFF ) {
6319 if ((uint64_t)(aSig0
<< 1)) {
6320 return propagateFloatx80NaN(a
, a
, status
);
6322 if ( ! aSign
) return a
;
6326 if ( ( aExp
| aSig0
) == 0 ) return a
;
6328 float_raise(float_flag_invalid
, status
);
6329 return floatx80_default_nan(status
);
6332 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6333 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6335 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6336 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6337 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6338 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6339 doubleZSig0
= zSig0
<<1;
6340 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6341 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6342 while ( (int64_t) rem0
< 0 ) {
6345 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6347 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6348 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6349 if ( zSig1
== 0 ) zSig1
= 1;
6350 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6351 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6352 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6353 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6354 while ( (int64_t) rem1
< 0 ) {
6356 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6358 term2
|= doubleZSig0
;
6359 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6361 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6363 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6364 zSig0
|= doubleZSig0
;
6365 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6366 0, zExp
, zSig0
, zSig1
, status
);
6369 /*----------------------------------------------------------------------------
6370 | Returns the result of converting the quadruple-precision floating-point
6371 | value `a' to the 32-bit two's complement integer format. The conversion
6372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6373 | Arithmetic---which means in particular that the conversion is rounded
6374 | according to the current rounding mode. If `a' is a NaN, the largest
6375 | positive integer is returned. Otherwise, if the conversion overflows, the
6376 | largest integer with the same sign as `a' is returned.
6377 *----------------------------------------------------------------------------*/
6379 int32_t float128_to_int32(float128 a
, float_status
*status
)
6382 int32_t aExp
, shiftCount
;
6383 uint64_t aSig0
, aSig1
;
6385 aSig1
= extractFloat128Frac1( a
);
6386 aSig0
= extractFloat128Frac0( a
);
6387 aExp
= extractFloat128Exp( a
);
6388 aSign
= extractFloat128Sign( a
);
6389 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6390 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6391 aSig0
|= ( aSig1
!= 0 );
6392 shiftCount
= 0x4028 - aExp
;
6393 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6394 return roundAndPackInt32(aSign
, aSig0
, status
);
6398 /*----------------------------------------------------------------------------
6399 | Returns the result of converting the quadruple-precision floating-point
6400 | value `a' to the 32-bit two's complement integer format. The conversion
6401 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6402 | Arithmetic, except that the conversion is always rounded toward zero. If
6403 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6404 | conversion overflows, the largest integer with the same sign as `a' is
6406 *----------------------------------------------------------------------------*/
6408 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6411 int32_t aExp
, shiftCount
;
6412 uint64_t aSig0
, aSig1
, savedASig
;
6415 aSig1
= extractFloat128Frac1( a
);
6416 aSig0
= extractFloat128Frac0( a
);
6417 aExp
= extractFloat128Exp( a
);
6418 aSign
= extractFloat128Sign( a
);
6419 aSig0
|= ( aSig1
!= 0 );
6420 if ( 0x401E < aExp
) {
6421 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6424 else if ( aExp
< 0x3FFF ) {
6425 if (aExp
|| aSig0
) {
6426 float_raise(float_flag_inexact
, status
);
6430 aSig0
|= UINT64_C(0x0001000000000000);
6431 shiftCount
= 0x402F - aExp
;
6433 aSig0
>>= shiftCount
;
6435 if ( aSign
) z
= - z
;
6436 if ( ( z
< 0 ) ^ aSign
) {
6438 float_raise(float_flag_invalid
, status
);
6439 return aSign
? INT32_MIN
: INT32_MAX
;
6441 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6442 float_raise(float_flag_inexact
, status
);
6448 /*----------------------------------------------------------------------------
6449 | Returns the result of converting the quadruple-precision floating-point
6450 | value `a' to the 64-bit two's complement integer format. The conversion
6451 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6452 | Arithmetic---which means in particular that the conversion is rounded
6453 | according to the current rounding mode. If `a' is a NaN, the largest
6454 | positive integer is returned. Otherwise, if the conversion overflows, the
6455 | largest integer with the same sign as `a' is returned.
6456 *----------------------------------------------------------------------------*/
6458 int64_t float128_to_int64(float128 a
, float_status
*status
)
6461 int32_t aExp
, shiftCount
;
6462 uint64_t aSig0
, aSig1
;
6464 aSig1
= extractFloat128Frac1( a
);
6465 aSig0
= extractFloat128Frac0( a
);
6466 aExp
= extractFloat128Exp( a
);
6467 aSign
= extractFloat128Sign( a
);
6468 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6469 shiftCount
= 0x402F - aExp
;
6470 if ( shiftCount
<= 0 ) {
6471 if ( 0x403E < aExp
) {
6472 float_raise(float_flag_invalid
, status
);
6474 || ( ( aExp
== 0x7FFF )
6475 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6482 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6485 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6487 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6491 /*----------------------------------------------------------------------------
6492 | Returns the result of converting the quadruple-precision floating-point
6493 | value `a' to the 64-bit two's complement integer format. The conversion
6494 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6495 | Arithmetic, except that the conversion is always rounded toward zero.
6496 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6497 | the conversion overflows, the largest integer with the same sign as `a' is
6499 *----------------------------------------------------------------------------*/
6501 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6504 int32_t aExp
, shiftCount
;
6505 uint64_t aSig0
, aSig1
;
6508 aSig1
= extractFloat128Frac1( a
);
6509 aSig0
= extractFloat128Frac0( a
);
6510 aExp
= extractFloat128Exp( a
);
6511 aSign
= extractFloat128Sign( a
);
6512 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6513 shiftCount
= aExp
- 0x402F;
6514 if ( 0 < shiftCount
) {
6515 if ( 0x403E <= aExp
) {
6516 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6517 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6518 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6520 float_raise(float_flag_inexact
, status
);
6524 float_raise(float_flag_invalid
, status
);
6525 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6531 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6532 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6533 float_raise(float_flag_inexact
, status
);
6537 if ( aExp
< 0x3FFF ) {
6538 if ( aExp
| aSig0
| aSig1
) {
6539 float_raise(float_flag_inexact
, status
);
6543 z
= aSig0
>>( - shiftCount
);
6545 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6546 float_raise(float_flag_inexact
, status
);
6549 if ( aSign
) z
= - z
;
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point value
6556 | `a' to the 64-bit unsigned integer format. The conversion is
6557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode. If `a' is a NaN, the largest
6560 | positive integer is returned. If the conversion overflows, the
6561 | largest unsigned integer is returned. If 'a' is negative, the value is
6562 | rounded and zero is returned; negative values that do not round to zero
6563 | will raise the inexact exception.
6564 *----------------------------------------------------------------------------*/
6566 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6571 uint64_t aSig0
, aSig1
;
6573 aSig0
= extractFloat128Frac0(a
);
6574 aSig1
= extractFloat128Frac1(a
);
6575 aExp
= extractFloat128Exp(a
);
6576 aSign
= extractFloat128Sign(a
);
6577 if (aSign
&& (aExp
> 0x3FFE)) {
6578 float_raise(float_flag_invalid
, status
);
6579 if (float128_is_any_nan(a
)) {
6586 aSig0
|= UINT64_C(0x0001000000000000);
6588 shiftCount
= 0x402F - aExp
;
6589 if (shiftCount
<= 0) {
6590 if (0x403E < aExp
) {
6591 float_raise(float_flag_invalid
, status
);
6594 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6596 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6598 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6601 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6604 signed char current_rounding_mode
= status
->float_rounding_mode
;
6606 set_float_rounding_mode(float_round_to_zero
, status
);
6607 v
= float128_to_uint64(a
, status
);
6608 set_float_rounding_mode(current_rounding_mode
, status
);
6613 /*----------------------------------------------------------------------------
6614 | Returns the result of converting the quadruple-precision floating-point
6615 | value `a' to the 32-bit unsigned integer format. The conversion
6616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6617 | Arithmetic except that the conversion is always rounded toward zero.
6618 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6619 | if the conversion overflows, the largest unsigned integer is returned.
6620 | If 'a' is negative, the value is rounded and zero is returned; negative
6621 | values that do not round to zero will raise the inexact exception.
6622 *----------------------------------------------------------------------------*/
6624 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6628 int old_exc_flags
= get_float_exception_flags(status
);
6630 v
= float128_to_uint64_round_to_zero(a
, status
);
6631 if (v
> 0xffffffff) {
6636 set_float_exception_flags(old_exc_flags
, status
);
6637 float_raise(float_flag_invalid
, status
);
6641 /*----------------------------------------------------------------------------
6642 | Returns the result of converting the quadruple-precision floating-point value
6643 | `a' to the 32-bit unsigned integer format. The conversion is
6644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6645 | Arithmetic---which means in particular that the conversion is rounded
6646 | according to the current rounding mode. If `a' is a NaN, the largest
6647 | positive integer is returned. If the conversion overflows, the
6648 | largest unsigned integer is returned. If 'a' is negative, the value is
6649 | rounded and zero is returned; negative values that do not round to zero
6650 | will raise the inexact exception.
6651 *----------------------------------------------------------------------------*/
6653 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6657 int old_exc_flags
= get_float_exception_flags(status
);
6659 v
= float128_to_uint64(a
, status
);
6660 if (v
> 0xffffffff) {
6665 set_float_exception_flags(old_exc_flags
, status
);
6666 float_raise(float_flag_invalid
, status
);
6670 /*----------------------------------------------------------------------------
6671 | Returns the result of converting the quadruple-precision floating-point
6672 | value `a' to the single-precision floating-point format. The conversion
6673 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6675 *----------------------------------------------------------------------------*/
6677 float32
float128_to_float32(float128 a
, float_status
*status
)
6681 uint64_t aSig0
, aSig1
;
6684 aSig1
= extractFloat128Frac1( a
);
6685 aSig0
= extractFloat128Frac0( a
);
6686 aExp
= extractFloat128Exp( a
);
6687 aSign
= extractFloat128Sign( a
);
6688 if ( aExp
== 0x7FFF ) {
6689 if ( aSig0
| aSig1
) {
6690 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6692 return packFloat32( aSign
, 0xFF, 0 );
6694 aSig0
|= ( aSig1
!= 0 );
6695 shift64RightJamming( aSig0
, 18, &aSig0
);
6697 if ( aExp
|| zSig
) {
6701 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6705 /*----------------------------------------------------------------------------
6706 | Returns the result of converting the quadruple-precision floating-point
6707 | value `a' to the double-precision floating-point format. The conversion
6708 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6710 *----------------------------------------------------------------------------*/
6712 float64
float128_to_float64(float128 a
, float_status
*status
)
6716 uint64_t aSig0
, aSig1
;
6718 aSig1
= extractFloat128Frac1( a
);
6719 aSig0
= extractFloat128Frac0( a
);
6720 aExp
= extractFloat128Exp( a
);
6721 aSign
= extractFloat128Sign( a
);
6722 if ( aExp
== 0x7FFF ) {
6723 if ( aSig0
| aSig1
) {
6724 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6726 return packFloat64( aSign
, 0x7FF, 0 );
6728 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6729 aSig0
|= ( aSig1
!= 0 );
6730 if ( aExp
|| aSig0
) {
6731 aSig0
|= UINT64_C(0x4000000000000000);
6734 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6738 /*----------------------------------------------------------------------------
6739 | Returns the result of converting the quadruple-precision floating-point
6740 | value `a' to the extended double-precision floating-point format. The
6741 | conversion is performed according to the IEC/IEEE Standard for Binary
6742 | Floating-Point Arithmetic.
6743 *----------------------------------------------------------------------------*/
6745 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6749 uint64_t aSig0
, aSig1
;
6751 aSig1
= extractFloat128Frac1( a
);
6752 aSig0
= extractFloat128Frac0( a
);
6753 aExp
= extractFloat128Exp( a
);
6754 aSign
= extractFloat128Sign( a
);
6755 if ( aExp
== 0x7FFF ) {
6756 if ( aSig0
| aSig1
) {
6757 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6759 return floatx80_silence_nan(res
, status
);
6761 return packFloatx80(aSign
, floatx80_infinity_high
,
6762 floatx80_infinity_low
);
6765 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6766 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6769 aSig0
|= UINT64_C(0x0001000000000000);
6771 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6772 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6776 /*----------------------------------------------------------------------------
6777 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6778 | returns the result as a quadruple-precision floating-point value. The
6779 | operation is performed according to the IEC/IEEE Standard for Binary
6780 | Floating-Point Arithmetic.
6781 *----------------------------------------------------------------------------*/
6783 float128
float128_round_to_int(float128 a
, float_status
*status
)
6787 uint64_t lastBitMask
, roundBitsMask
;
6790 aExp
= extractFloat128Exp( a
);
6791 if ( 0x402F <= aExp
) {
6792 if ( 0x406F <= aExp
) {
6793 if ( ( aExp
== 0x7FFF )
6794 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6796 return propagateFloat128NaN(a
, a
, status
);
6801 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6802 roundBitsMask
= lastBitMask
- 1;
6804 switch (status
->float_rounding_mode
) {
6805 case float_round_nearest_even
:
6806 if ( lastBitMask
) {
6807 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6808 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6811 if ( (int64_t) z
.low
< 0 ) {
6813 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6817 case float_round_ties_away
:
6819 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6821 if ((int64_t) z
.low
< 0) {
6826 case float_round_to_zero
:
6828 case float_round_up
:
6829 if (!extractFloat128Sign(z
)) {
6830 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6833 case float_round_down
:
6834 if (extractFloat128Sign(z
)) {
6835 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6838 case float_round_to_odd
:
6840 * Note that if lastBitMask == 0, the last bit is the lsb
6841 * of high, and roundBitsMask == -1.
6843 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6844 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6850 z
.low
&= ~ roundBitsMask
;
6853 if ( aExp
< 0x3FFF ) {
6854 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6855 float_raise(float_flag_inexact
, status
);
6856 aSign
= extractFloat128Sign( a
);
6857 switch (status
->float_rounding_mode
) {
6858 case float_round_nearest_even
:
6859 if ( ( aExp
== 0x3FFE )
6860 && ( extractFloat128Frac0( a
)
6861 | extractFloat128Frac1( a
) )
6863 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6866 case float_round_ties_away
:
6867 if (aExp
== 0x3FFE) {
6868 return packFloat128(aSign
, 0x3FFF, 0, 0);
6871 case float_round_down
:
6873 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6874 : packFloat128( 0, 0, 0, 0 );
6875 case float_round_up
:
6877 aSign
? packFloat128( 1, 0, 0, 0 )
6878 : packFloat128( 0, 0x3FFF, 0, 0 );
6880 case float_round_to_odd
:
6881 return packFloat128(aSign
, 0x3FFF, 0, 0);
6883 case float_round_to_zero
:
6886 return packFloat128( aSign
, 0, 0, 0 );
6889 lastBitMask
<<= 0x402F - aExp
;
6890 roundBitsMask
= lastBitMask
- 1;
6893 switch (status
->float_rounding_mode
) {
6894 case float_round_nearest_even
:
6895 z
.high
+= lastBitMask
>>1;
6896 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6897 z
.high
&= ~ lastBitMask
;
6900 case float_round_ties_away
:
6901 z
.high
+= lastBitMask
>>1;
6903 case float_round_to_zero
:
6905 case float_round_up
:
6906 if (!extractFloat128Sign(z
)) {
6907 z
.high
|= ( a
.low
!= 0 );
6908 z
.high
+= roundBitsMask
;
6911 case float_round_down
:
6912 if (extractFloat128Sign(z
)) {
6913 z
.high
|= (a
.low
!= 0);
6914 z
.high
+= roundBitsMask
;
6917 case float_round_to_odd
:
6918 if ((z
.high
& lastBitMask
) == 0) {
6919 z
.high
|= (a
.low
!= 0);
6920 z
.high
+= roundBitsMask
;
6926 z
.high
&= ~ roundBitsMask
;
6928 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6929 float_raise(float_flag_inexact
, status
);
6935 /*----------------------------------------------------------------------------
6936 | Returns the result of adding the absolute values of the quadruple-precision
6937 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6938 | before being returned. `zSign' is ignored if the result is a NaN.
6939 | The addition is performed according to the IEC/IEEE Standard for Binary
6940 | Floating-Point Arithmetic.
6941 *----------------------------------------------------------------------------*/
6943 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6944 float_status
*status
)
6946 int32_t aExp
, bExp
, zExp
;
6947 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6950 aSig1
= extractFloat128Frac1( a
);
6951 aSig0
= extractFloat128Frac0( a
);
6952 aExp
= extractFloat128Exp( a
);
6953 bSig1
= extractFloat128Frac1( b
);
6954 bSig0
= extractFloat128Frac0( b
);
6955 bExp
= extractFloat128Exp( b
);
6956 expDiff
= aExp
- bExp
;
6957 if ( 0 < expDiff
) {
6958 if ( aExp
== 0x7FFF ) {
6959 if (aSig0
| aSig1
) {
6960 return propagateFloat128NaN(a
, b
, status
);
6968 bSig0
|= UINT64_C(0x0001000000000000);
6970 shift128ExtraRightJamming(
6971 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6974 else if ( expDiff
< 0 ) {
6975 if ( bExp
== 0x7FFF ) {
6976 if (bSig0
| bSig1
) {
6977 return propagateFloat128NaN(a
, b
, status
);
6979 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6985 aSig0
|= UINT64_C(0x0001000000000000);
6987 shift128ExtraRightJamming(
6988 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6992 if ( aExp
== 0x7FFF ) {
6993 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6994 return propagateFloat128NaN(a
, b
, status
);
6998 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7000 if (status
->flush_to_zero
) {
7001 if (zSig0
| zSig1
) {
7002 float_raise(float_flag_output_denormal
, status
);
7004 return packFloat128(zSign
, 0, 0, 0);
7006 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7009 zSig0
|= UINT64_C(0x0002000000000000);
7013 aSig0
|= UINT64_C(0x0001000000000000);
7014 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7016 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
7019 shift128ExtraRightJamming(
7020 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7022 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7026 /*----------------------------------------------------------------------------
7027 | Returns the result of subtracting the absolute values of the quadruple-
7028 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7029 | difference is negated before being returned. `zSign' is ignored if the
7030 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7031 | Standard for Binary Floating-Point Arithmetic.
7032 *----------------------------------------------------------------------------*/
7034 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7035 float_status
*status
)
7037 int32_t aExp
, bExp
, zExp
;
7038 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7041 aSig1
= extractFloat128Frac1( a
);
7042 aSig0
= extractFloat128Frac0( a
);
7043 aExp
= extractFloat128Exp( a
);
7044 bSig1
= extractFloat128Frac1( b
);
7045 bSig0
= extractFloat128Frac0( b
);
7046 bExp
= extractFloat128Exp( b
);
7047 expDiff
= aExp
- bExp
;
7048 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7049 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7050 if ( 0 < expDiff
) goto aExpBigger
;
7051 if ( expDiff
< 0 ) goto bExpBigger
;
7052 if ( aExp
== 0x7FFF ) {
7053 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7054 return propagateFloat128NaN(a
, b
, status
);
7056 float_raise(float_flag_invalid
, status
);
7057 return float128_default_nan(status
);
7063 if ( bSig0
< aSig0
) goto aBigger
;
7064 if ( aSig0
< bSig0
) goto bBigger
;
7065 if ( bSig1
< aSig1
) goto aBigger
;
7066 if ( aSig1
< bSig1
) goto bBigger
;
7067 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7070 if ( bExp
== 0x7FFF ) {
7071 if (bSig0
| bSig1
) {
7072 return propagateFloat128NaN(a
, b
, status
);
7074 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7080 aSig0
|= UINT64_C(0x4000000000000000);
7082 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7083 bSig0
|= UINT64_C(0x4000000000000000);
7085 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7088 goto normalizeRoundAndPack
;
7090 if ( aExp
== 0x7FFF ) {
7091 if (aSig0
| aSig1
) {
7092 return propagateFloat128NaN(a
, b
, status
);
7100 bSig0
|= UINT64_C(0x4000000000000000);
7102 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7103 aSig0
|= UINT64_C(0x4000000000000000);
7105 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7107 normalizeRoundAndPack
:
7109 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7114 /*----------------------------------------------------------------------------
7115 | Returns the result of adding the quadruple-precision floating-point values
7116 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7117 | for Binary Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7120 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7124 aSign
= extractFloat128Sign( a
);
7125 bSign
= extractFloat128Sign( b
);
7126 if ( aSign
== bSign
) {
7127 return addFloat128Sigs(a
, b
, aSign
, status
);
7130 return subFloat128Sigs(a
, b
, aSign
, status
);
7135 /*----------------------------------------------------------------------------
7136 | Returns the result of subtracting the quadruple-precision floating-point
7137 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7138 | Standard for Binary Floating-Point Arithmetic.
7139 *----------------------------------------------------------------------------*/
7141 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7145 aSign
= extractFloat128Sign( a
);
7146 bSign
= extractFloat128Sign( b
);
7147 if ( aSign
== bSign
) {
7148 return subFloat128Sigs(a
, b
, aSign
, status
);
7151 return addFloat128Sigs(a
, b
, aSign
, status
);
7156 /*----------------------------------------------------------------------------
7157 | Returns the result of multiplying the quadruple-precision floating-point
7158 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7159 | Standard for Binary Floating-Point Arithmetic.
7160 *----------------------------------------------------------------------------*/
7162 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7164 bool aSign
, bSign
, zSign
;
7165 int32_t aExp
, bExp
, zExp
;
7166 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7168 aSig1
= extractFloat128Frac1( a
);
7169 aSig0
= extractFloat128Frac0( a
);
7170 aExp
= extractFloat128Exp( a
);
7171 aSign
= extractFloat128Sign( a
);
7172 bSig1
= extractFloat128Frac1( b
);
7173 bSig0
= extractFloat128Frac0( b
);
7174 bExp
= extractFloat128Exp( b
);
7175 bSign
= extractFloat128Sign( b
);
7176 zSign
= aSign
^ bSign
;
7177 if ( aExp
== 0x7FFF ) {
7178 if ( ( aSig0
| aSig1
)
7179 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7180 return propagateFloat128NaN(a
, b
, status
);
7182 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7183 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7185 if ( bExp
== 0x7FFF ) {
7186 if (bSig0
| bSig1
) {
7187 return propagateFloat128NaN(a
, b
, status
);
7189 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7191 float_raise(float_flag_invalid
, status
);
7192 return float128_default_nan(status
);
7194 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7197 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7198 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7201 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7202 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7204 zExp
= aExp
+ bExp
- 0x4000;
7205 aSig0
|= UINT64_C(0x0001000000000000);
7206 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7207 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7208 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7209 zSig2
|= ( zSig3
!= 0 );
7210 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7211 shift128ExtraRightJamming(
7212 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7215 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7219 /*----------------------------------------------------------------------------
7220 | Returns the result of dividing the quadruple-precision floating-point value
7221 | `a' by the corresponding value `b'. The operation is performed according to
7222 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7223 *----------------------------------------------------------------------------*/
7225 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7227 bool aSign
, bSign
, zSign
;
7228 int32_t aExp
, bExp
, zExp
;
7229 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7230 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7232 aSig1
= extractFloat128Frac1( a
);
7233 aSig0
= extractFloat128Frac0( a
);
7234 aExp
= extractFloat128Exp( a
);
7235 aSign
= extractFloat128Sign( a
);
7236 bSig1
= extractFloat128Frac1( b
);
7237 bSig0
= extractFloat128Frac0( b
);
7238 bExp
= extractFloat128Exp( b
);
7239 bSign
= extractFloat128Sign( b
);
7240 zSign
= aSign
^ bSign
;
7241 if ( aExp
== 0x7FFF ) {
7242 if (aSig0
| aSig1
) {
7243 return propagateFloat128NaN(a
, b
, status
);
7245 if ( bExp
== 0x7FFF ) {
7246 if (bSig0
| bSig1
) {
7247 return propagateFloat128NaN(a
, b
, status
);
7251 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7253 if ( bExp
== 0x7FFF ) {
7254 if (bSig0
| bSig1
) {
7255 return propagateFloat128NaN(a
, b
, status
);
7257 return packFloat128( zSign
, 0, 0, 0 );
7260 if ( ( bSig0
| bSig1
) == 0 ) {
7261 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7263 float_raise(float_flag_invalid
, status
);
7264 return float128_default_nan(status
);
7266 float_raise(float_flag_divbyzero
, status
);
7267 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7269 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7272 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7273 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7275 zExp
= aExp
- bExp
+ 0x3FFD;
7277 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7279 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7280 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7281 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7284 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7285 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7286 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7287 while ( (int64_t) rem0
< 0 ) {
7289 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7291 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7292 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7293 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7294 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7295 while ( (int64_t) rem1
< 0 ) {
7297 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7299 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7301 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7302 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7306 /*----------------------------------------------------------------------------
7307 | Returns the remainder of the quadruple-precision floating-point value `a'
7308 | with respect to the corresponding value `b'. The operation is performed
7309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7310 *----------------------------------------------------------------------------*/
7312 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7315 int32_t aExp
, bExp
, expDiff
;
7316 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7317 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7320 aSig1
= extractFloat128Frac1( a
);
7321 aSig0
= extractFloat128Frac0( a
);
7322 aExp
= extractFloat128Exp( a
);
7323 aSign
= extractFloat128Sign( a
);
7324 bSig1
= extractFloat128Frac1( b
);
7325 bSig0
= extractFloat128Frac0( b
);
7326 bExp
= extractFloat128Exp( b
);
7327 if ( aExp
== 0x7FFF ) {
7328 if ( ( aSig0
| aSig1
)
7329 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7330 return propagateFloat128NaN(a
, b
, status
);
7334 if ( bExp
== 0x7FFF ) {
7335 if (bSig0
| bSig1
) {
7336 return propagateFloat128NaN(a
, b
, status
);
7341 if ( ( bSig0
| bSig1
) == 0 ) {
7343 float_raise(float_flag_invalid
, status
);
7344 return float128_default_nan(status
);
7346 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7349 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7350 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7352 expDiff
= aExp
- bExp
;
7353 if ( expDiff
< -1 ) return a
;
7355 aSig0
| UINT64_C(0x0001000000000000),
7357 15 - ( expDiff
< 0 ),
7362 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7363 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7364 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7366 while ( 0 < expDiff
) {
7367 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7368 q
= ( 4 < q
) ? q
- 4 : 0;
7369 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7370 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7371 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7372 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7375 if ( -64 < expDiff
) {
7376 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7377 q
= ( 4 < q
) ? q
- 4 : 0;
7379 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7381 if ( expDiff
< 0 ) {
7382 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7385 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7387 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7388 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7391 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7392 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7395 alternateASig0
= aSig0
;
7396 alternateASig1
= aSig1
;
7398 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7399 } while ( 0 <= (int64_t) aSig0
);
7401 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7402 if ( ( sigMean0
< 0 )
7403 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7404 aSig0
= alternateASig0
;
7405 aSig1
= alternateASig1
;
7407 zSign
= ( (int64_t) aSig0
< 0 );
7408 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7409 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7413 /*----------------------------------------------------------------------------
7414 | Returns the square root of the quadruple-precision floating-point value `a'.
7415 | The operation is performed according to the IEC/IEEE Standard for Binary
7416 | Floating-Point Arithmetic.
7417 *----------------------------------------------------------------------------*/
7419 float128
float128_sqrt(float128 a
, float_status
*status
)
7423 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7424 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7426 aSig1
= extractFloat128Frac1( a
);
7427 aSig0
= extractFloat128Frac0( a
);
7428 aExp
= extractFloat128Exp( a
);
7429 aSign
= extractFloat128Sign( a
);
7430 if ( aExp
== 0x7FFF ) {
7431 if (aSig0
| aSig1
) {
7432 return propagateFloat128NaN(a
, a
, status
);
7434 if ( ! aSign
) return a
;
7438 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7440 float_raise(float_flag_invalid
, status
);
7441 return float128_default_nan(status
);
7444 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7445 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7447 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7448 aSig0
|= UINT64_C(0x0001000000000000);
7449 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7450 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7451 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7452 doubleZSig0
= zSig0
<<1;
7453 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7454 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7455 while ( (int64_t) rem0
< 0 ) {
7458 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7460 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7461 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7462 if ( zSig1
== 0 ) zSig1
= 1;
7463 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7464 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7465 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7466 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7467 while ( (int64_t) rem1
< 0 ) {
7469 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7471 term2
|= doubleZSig0
;
7472 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7474 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7476 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7477 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7481 static inline FloatRelation
7482 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7483 float_status
*status
)
7487 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7488 float_raise(float_flag_invalid
, status
);
7489 return float_relation_unordered
;
7491 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7492 ( extractFloatx80Frac( a
)<<1 ) ) ||
7493 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7494 ( extractFloatx80Frac( b
)<<1 ) )) {
7496 floatx80_is_signaling_nan(a
, status
) ||
7497 floatx80_is_signaling_nan(b
, status
)) {
7498 float_raise(float_flag_invalid
, status
);
7500 return float_relation_unordered
;
7502 aSign
= extractFloatx80Sign( a
);
7503 bSign
= extractFloatx80Sign( b
);
7504 if ( aSign
!= bSign
) {
7506 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7507 ( ( a
.low
| b
.low
) == 0 ) ) {
7509 return float_relation_equal
;
7511 return 1 - (2 * aSign
);
7514 /* Normalize pseudo-denormals before comparison. */
7515 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7518 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7521 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7522 return float_relation_equal
;
7524 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7529 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7531 return floatx80_compare_internal(a
, b
, 0, status
);
7534 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7535 float_status
*status
)
7537 return floatx80_compare_internal(a
, b
, 1, status
);
7540 static inline FloatRelation
7541 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7542 float_status
*status
)
7546 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7547 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7548 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7549 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7551 float128_is_signaling_nan(a
, status
) ||
7552 float128_is_signaling_nan(b
, status
)) {
7553 float_raise(float_flag_invalid
, status
);
7555 return float_relation_unordered
;
7557 aSign
= extractFloat128Sign( a
);
7558 bSign
= extractFloat128Sign( b
);
7559 if ( aSign
!= bSign
) {
7560 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7562 return float_relation_equal
;
7564 return 1 - (2 * aSign
);
7567 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7568 return float_relation_equal
;
7570 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7575 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7577 return float128_compare_internal(a
, b
, 0, status
);
7580 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7581 float_status
*status
)
7583 return float128_compare_internal(a
, b
, 1, status
);
7586 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7592 if (floatx80_invalid_encoding(a
)) {
7593 float_raise(float_flag_invalid
, status
);
7594 return floatx80_default_nan(status
);
7596 aSig
= extractFloatx80Frac( a
);
7597 aExp
= extractFloatx80Exp( a
);
7598 aSign
= extractFloatx80Sign( a
);
7600 if ( aExp
== 0x7FFF ) {
7602 return propagateFloatx80NaN(a
, a
, status
);
7616 } else if (n
< -0x10000) {
7621 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7622 aSign
, aExp
, aSig
, 0, status
);
7625 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7629 uint64_t aSig0
, aSig1
;
7631 aSig1
= extractFloat128Frac1( a
);
7632 aSig0
= extractFloat128Frac0( a
);
7633 aExp
= extractFloat128Exp( a
);
7634 aSign
= extractFloat128Sign( a
);
7635 if ( aExp
== 0x7FFF ) {
7636 if ( aSig0
| aSig1
) {
7637 return propagateFloat128NaN(a
, a
, status
);
7642 aSig0
|= UINT64_C(0x0001000000000000);
7643 } else if (aSig0
== 0 && aSig1
== 0) {
7651 } else if (n
< -0x10000) {
7656 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7661 static void __attribute__((constructor
)) softfloat_init(void)
7663 union_float64 ua
, ub
, uc
, ur
;
7665 if (QEMU_NO_HARDFLOAT
) {
7669 * Test that the host's FMA is not obviously broken. For example,
7670 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7671 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7673 ua
.s
= 0x0020000000000001ULL
;
7674 ub
.s
= 0x3ca0000000000000ULL
;
7675 uc
.s
= 0x0020000000000000ULL
;
7676 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7677 if (ur
.s
!= 0x0020000000000001ULL
) {
7678 force_soft_fma
= true;