4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 s
->float_exception_flags
|= float_flag_overflow
;
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 s
->float_exception_flags
|= float_flag_overflow
;
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
475 return unlikely(c
>= float_class_qnan
);
478 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
480 return c
== float_class_snan
;
483 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
485 return c
== float_class_qnan
;
489 * Structure holding all of the decomposed parts of a float. The
490 * exponent is unbiased and the fraction is normalized. All
491 * calculations are done with a 64 bit fraction and then rounded as
492 * appropriate for the final format.
494 * Thanks to the packed FloatClass a decent compiler should be able to
495 * fit the whole structure into registers and avoid using the stack
496 * for parameter passing.
506 #define DECOMPOSED_BINARY_POINT 63
507 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
509 /* Structure holding all of the relevant parameters for a format.
510 * exp_size: the size of the exponent field
511 * exp_bias: the offset applied to the exponent field
512 * exp_max: the maximum normalised exponent
513 * frac_size: the size of the fraction field
514 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
515 * The following are computed based the size of fraction
516 * frac_lsb: least significant bit of fraction
517 * frac_lsbm1: the bit below the least significant bit (for rounding)
518 * round_mask/roundeven_mask: masks used for rounding
519 * The following optional modifiers are available:
520 * arm_althp: handle ARM Alternative Half Precision
531 uint64_t roundeven_mask
;
535 /* Expand fields based on the size of exponent and fraction */
536 #define FLOAT_PARAMS(E, F) \
538 .exp_bias = ((1 << E) - 1) >> 1, \
539 .exp_max = (1 << E) - 1, \
541 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
542 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
543 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
544 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
545 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
547 static const FloatFmt float16_params
= {
551 static const FloatFmt float16_params_ahp
= {
556 static const FloatFmt bfloat16_params
= {
560 static const FloatFmt float32_params
= {
564 static const FloatFmt float64_params
= {
568 /* Unpack a float to parts, but do not canonicalize. */
569 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
571 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
573 return (FloatParts
) {
574 .cls
= float_class_unclassified
,
575 .sign
= extract64(raw
, sign_pos
, 1),
576 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
577 .frac
= extract64(raw
, 0, fmt
.frac_size
),
581 static inline FloatParts
float16_unpack_raw(float16 f
)
583 return unpack_raw(float16_params
, f
);
586 static inline FloatParts
bfloat16_unpack_raw(bfloat16 f
)
588 return unpack_raw(bfloat16_params
, f
);
591 static inline FloatParts
float32_unpack_raw(float32 f
)
593 return unpack_raw(float32_params
, f
);
596 static inline FloatParts
float64_unpack_raw(float64 f
)
598 return unpack_raw(float64_params
, f
);
601 /* Pack a float from parts, but do not canonicalize. */
602 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
604 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
605 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
606 return deposit64(ret
, sign_pos
, 1, p
.sign
);
609 static inline float16
float16_pack_raw(FloatParts p
)
611 return make_float16(pack_raw(float16_params
, p
));
614 static inline bfloat16
bfloat16_pack_raw(FloatParts p
)
616 return pack_raw(bfloat16_params
, p
);
619 static inline float32
float32_pack_raw(FloatParts p
)
621 return make_float32(pack_raw(float32_params
, p
));
624 static inline float64
float64_pack_raw(FloatParts p
)
626 return make_float64(pack_raw(float64_params
, p
));
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine: (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output. These details are target-
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.c.inc"
639 /* Canonicalize EXP and FRAC, setting CLS. */
640 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
641 float_status
*status
)
643 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
644 if (part
.frac
== 0) {
645 part
.cls
= float_class_inf
;
647 part
.frac
<<= parm
->frac_shift
;
648 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
649 ? float_class_snan
: float_class_qnan
);
651 } else if (part
.exp
== 0) {
652 if (likely(part
.frac
== 0)) {
653 part
.cls
= float_class_zero
;
654 } else if (status
->flush_inputs_to_zero
) {
655 float_raise(float_flag_input_denormal
, status
);
656 part
.cls
= float_class_zero
;
659 int shift
= clz64(part
.frac
);
660 part
.cls
= float_class_normal
;
661 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
665 part
.cls
= float_class_normal
;
666 part
.exp
-= parm
->exp_bias
;
667 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
672 /* Round and uncanonicalize a floating-point number by parts. There
673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
674 * fraction; these bits will be removed. The exponent will be biased
675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
678 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
679 const FloatFmt
*parm
)
681 const uint64_t frac_lsb
= parm
->frac_lsb
;
682 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
683 const uint64_t round_mask
= parm
->round_mask
;
684 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
685 const int exp_max
= parm
->exp_max
;
686 const int frac_shift
= parm
->frac_shift
;
695 case float_class_normal
:
696 switch (s
->float_rounding_mode
) {
697 case float_round_nearest_even
:
698 overflow_norm
= false;
699 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
701 case float_round_ties_away
:
702 overflow_norm
= false;
705 case float_round_to_zero
:
706 overflow_norm
= true;
710 inc
= p
.sign
? 0 : round_mask
;
711 overflow_norm
= p
.sign
;
713 case float_round_down
:
714 inc
= p
.sign
? round_mask
: 0;
715 overflow_norm
= !p
.sign
;
717 case float_round_to_odd
:
718 overflow_norm
= true;
719 inc
= frac
& frac_lsb
? 0 : round_mask
;
722 g_assert_not_reached();
725 exp
+= parm
->exp_bias
;
726 if (likely(exp
> 0)) {
727 if (frac
& round_mask
) {
728 flags
|= float_flag_inexact
;
729 if (uadd64_overflow(frac
, inc
, &frac
)) {
730 frac
= (frac
>> 1) | DECOMPOSED_IMPLICIT_BIT
;
736 if (parm
->arm_althp
) {
737 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
738 if (unlikely(exp
> exp_max
)) {
739 /* Overflow. Return the maximum normal. */
740 flags
= float_flag_invalid
;
744 } else if (unlikely(exp
>= exp_max
)) {
745 flags
|= float_flag_overflow
| float_flag_inexact
;
750 p
.cls
= float_class_inf
;
754 } else if (s
->flush_to_zero
) {
755 flags
|= float_flag_output_denormal
;
756 p
.cls
= float_class_zero
;
759 bool is_tiny
= s
->tininess_before_rounding
|| (exp
< 0);
763 is_tiny
= !uadd64_overflow(frac
, inc
, &discard
);
766 shift64RightJamming(frac
, 1 - exp
, &frac
);
767 if (frac
& round_mask
) {
768 /* Need to recompute round-to-even. */
769 switch (s
->float_rounding_mode
) {
770 case float_round_nearest_even
:
771 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
774 case float_round_to_odd
:
775 inc
= frac
& frac_lsb
? 0 : round_mask
;
780 flags
|= float_flag_inexact
;
784 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
787 if (is_tiny
&& (flags
& float_flag_inexact
)) {
788 flags
|= float_flag_underflow
;
790 if (exp
== 0 && frac
== 0) {
791 p
.cls
= float_class_zero
;
796 case float_class_zero
:
802 case float_class_inf
:
804 assert(!parm
->arm_althp
);
809 case float_class_qnan
:
810 case float_class_snan
:
811 assert(!parm
->arm_althp
);
813 frac
>>= parm
->frac_shift
;
817 g_assert_not_reached();
820 float_raise(flags
, s
);
826 /* Explicit FloatFmt version */
827 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
828 const FloatFmt
*params
)
830 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
833 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
835 return float16a_unpack_canonical(f
, s
, &float16_params
);
838 static FloatParts
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
840 return sf_canonicalize(bfloat16_unpack_raw(f
), &bfloat16_params
, s
);
843 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
844 const FloatFmt
*params
)
846 return float16_pack_raw(round_canonical(p
, s
, params
));
849 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
851 return float16a_round_pack_canonical(p
, s
, &float16_params
);
854 static bfloat16
bfloat16_round_pack_canonical(FloatParts p
, float_status
*s
)
856 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
859 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
861 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
864 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
866 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
869 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
871 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
874 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
876 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
879 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
882 case float_class_snan
:
883 s
->float_exception_flags
|= float_flag_invalid
;
884 a
= parts_silence_nan(a
, s
);
886 case float_class_qnan
:
887 if (s
->default_nan_mode
) {
888 return parts_default_nan(s
);
893 g_assert_not_reached();
898 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
900 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
901 s
->float_exception_flags
|= float_flag_invalid
;
904 if (s
->default_nan_mode
) {
905 return parts_default_nan(s
);
907 if (pickNaN(a
.cls
, b
.cls
,
909 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
912 if (is_snan(a
.cls
)) {
913 return parts_silence_nan(a
, s
);
919 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
920 bool inf_zero
, float_status
*s
)
924 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
925 s
->float_exception_flags
|= float_flag_invalid
;
928 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
930 if (s
->default_nan_mode
) {
931 /* Note that this check is after pickNaNMulAdd so that function
932 * has an opportunity to set the Invalid flag.
947 return parts_default_nan(s
);
949 g_assert_not_reached();
952 if (is_snan(a
.cls
)) {
953 return parts_silence_nan(a
, s
);
959 * Returns the result of adding or subtracting the values of the
960 * floating-point values `a' and `b'. The operation is performed
961 * according to the IEC/IEEE Standard for Binary Floating-Point
965 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
968 bool a_sign
= a
.sign
;
969 bool b_sign
= b
.sign
^ subtract
;
971 if (a_sign
!= b_sign
) {
974 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
975 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
976 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
977 a
.frac
= a
.frac
- b
.frac
;
979 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
980 a
.frac
= b
.frac
- a
.frac
;
986 a
.cls
= float_class_zero
;
987 a
.sign
= s
->float_rounding_mode
== float_round_down
;
989 int shift
= clz64(a
.frac
);
990 a
.frac
= a
.frac
<< shift
;
991 a
.exp
= a
.exp
- shift
;
996 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
997 return pick_nan(a
, b
, s
);
999 if (a
.cls
== float_class_inf
) {
1000 if (b
.cls
== float_class_inf
) {
1001 float_raise(float_flag_invalid
, s
);
1002 return parts_default_nan(s
);
1006 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1007 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1010 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1011 b
.sign
= a_sign
^ 1;
1014 if (b
.cls
== float_class_zero
) {
1019 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1020 if (a
.exp
> b
.exp
) {
1021 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1022 } else if (a
.exp
< b
.exp
) {
1023 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1027 if (uadd64_overflow(a
.frac
, b
.frac
, &a
.frac
)) {
1028 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1029 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
1034 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1035 return pick_nan(a
, b
, s
);
1037 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1040 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1045 g_assert_not_reached();
1049 * Returns the result of adding or subtracting the floating-point
1050 * values `a' and `b'. The operation is performed according to the
1051 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1054 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1056 FloatParts pa
= float16_unpack_canonical(a
, status
);
1057 FloatParts pb
= float16_unpack_canonical(b
, status
);
1058 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1060 return float16_round_pack_canonical(pr
, status
);
1063 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1065 FloatParts pa
= float16_unpack_canonical(a
, status
);
1066 FloatParts pb
= float16_unpack_canonical(b
, status
);
1067 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1069 return float16_round_pack_canonical(pr
, status
);
1072 static float32 QEMU_SOFTFLOAT_ATTR
1073 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1075 FloatParts pa
= float32_unpack_canonical(a
, status
);
1076 FloatParts pb
= float32_unpack_canonical(b
, status
);
1077 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1079 return float32_round_pack_canonical(pr
, status
);
1082 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1084 return soft_f32_addsub(a
, b
, false, status
);
1087 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1089 return soft_f32_addsub(a
, b
, true, status
);
1092 static float64 QEMU_SOFTFLOAT_ATTR
1093 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1095 FloatParts pa
= float64_unpack_canonical(a
, status
);
1096 FloatParts pb
= float64_unpack_canonical(b
, status
);
1097 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1099 return float64_round_pack_canonical(pr
, status
);
1102 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1104 return soft_f64_addsub(a
, b
, false, status
);
1107 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1109 return soft_f64_addsub(a
, b
, true, status
);
1112 static float hard_f32_add(float a
, float b
)
1117 static float hard_f32_sub(float a
, float b
)
1122 static double hard_f64_add(double a
, double b
)
1127 static double hard_f64_sub(double a
, double b
)
1132 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1134 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1135 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1137 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1140 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1142 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1143 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1145 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1149 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1150 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1152 return float32_gen2(a
, b
, s
, hard
, soft
,
1153 f32_is_zon2
, f32_addsubmul_post
);
1156 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1157 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1159 return float64_gen2(a
, b
, s
, hard
, soft
,
1160 f64_is_zon2
, f64_addsubmul_post
);
1163 float32 QEMU_FLATTEN
1164 float32_add(float32 a
, float32 b
, float_status
*s
)
1166 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1169 float32 QEMU_FLATTEN
1170 float32_sub(float32 a
, float32 b
, float_status
*s
)
1172 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1175 float64 QEMU_FLATTEN
1176 float64_add(float64 a
, float64 b
, float_status
*s
)
1178 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1181 float64 QEMU_FLATTEN
1182 float64_sub(float64 a
, float64 b
, float_status
*s
)
1184 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1188 * Returns the result of adding or subtracting the bfloat16
1189 * values `a' and `b'.
1191 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1193 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1194 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1195 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1197 return bfloat16_round_pack_canonical(pr
, status
);
1200 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1202 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1203 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1204 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1206 return bfloat16_round_pack_canonical(pr
, status
);
1210 * Returns the result of multiplying the floating-point values `a' and
1211 * `b'. The operation is performed according to the IEC/IEEE Standard
1212 * for Binary Floating-Point Arithmetic.
1215 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1217 bool sign
= a
.sign
^ b
.sign
;
1219 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1221 int exp
= a
.exp
+ b
.exp
;
1223 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1224 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1237 /* handle all the NaN cases */
1238 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1239 return pick_nan(a
, b
, s
);
1241 /* Inf * Zero == NaN */
1242 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1243 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1244 s
->float_exception_flags
|= float_flag_invalid
;
1245 return parts_default_nan(s
);
1247 /* Multiply by 0 or Inf */
1248 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1252 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1256 g_assert_not_reached();
1259 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1261 FloatParts pa
= float16_unpack_canonical(a
, status
);
1262 FloatParts pb
= float16_unpack_canonical(b
, status
);
1263 FloatParts pr
= mul_floats(pa
, pb
, status
);
1265 return float16_round_pack_canonical(pr
, status
);
1268 static float32 QEMU_SOFTFLOAT_ATTR
1269 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1271 FloatParts pa
= float32_unpack_canonical(a
, status
);
1272 FloatParts pb
= float32_unpack_canonical(b
, status
);
1273 FloatParts pr
= mul_floats(pa
, pb
, status
);
1275 return float32_round_pack_canonical(pr
, status
);
1278 static float64 QEMU_SOFTFLOAT_ATTR
1279 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1281 FloatParts pa
= float64_unpack_canonical(a
, status
);
1282 FloatParts pb
= float64_unpack_canonical(b
, status
);
1283 FloatParts pr
= mul_floats(pa
, pb
, status
);
1285 return float64_round_pack_canonical(pr
, status
);
1288 static float hard_f32_mul(float a
, float b
)
1293 static double hard_f64_mul(double a
, double b
)
1298 float32 QEMU_FLATTEN
1299 float32_mul(float32 a
, float32 b
, float_status
*s
)
1301 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1302 f32_is_zon2
, f32_addsubmul_post
);
1305 float64 QEMU_FLATTEN
1306 float64_mul(float64 a
, float64 b
, float_status
*s
)
1308 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1309 f64_is_zon2
, f64_addsubmul_post
);
1313 * Returns the result of multiplying the bfloat16
1314 * values `a' and `b'.
1317 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1319 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1320 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1321 FloatParts pr
= mul_floats(pa
, pb
, status
);
1323 return bfloat16_round_pack_canonical(pr
, status
);
1327 * Returns the result of multiplying the floating-point values `a' and
1328 * `b' then adding 'c', with no intermediate rounding step after the
1329 * multiplication. The operation is performed according to the
1330 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1331 * The flags argument allows the caller to select negation of the
1332 * addend, the intermediate product, or the final result. (The
1333 * difference between this and having the caller do a separate
1334 * negation is that negating externally will flip the sign bit on
1338 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1339 int flags
, float_status
*s
)
1341 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1342 ((1 << float_class_inf
) | (1 << float_class_zero
));
1344 bool sign_flip
= flags
& float_muladd_negate_result
;
1349 /* It is implementation-defined whether the cases of (0,inf,qnan)
1350 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1351 * they return if they do), so we have to hand this information
1352 * off to the target-specific pick-a-NaN routine.
1354 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1355 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1359 s
->float_exception_flags
|= float_flag_invalid
;
1360 return parts_default_nan(s
);
1363 if (flags
& float_muladd_negate_c
) {
1367 p_sign
= a
.sign
^ b
.sign
;
1369 if (flags
& float_muladd_negate_product
) {
1373 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1374 p_class
= float_class_inf
;
1375 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1376 p_class
= float_class_zero
;
1378 p_class
= float_class_normal
;
1381 if (c
.cls
== float_class_inf
) {
1382 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1383 s
->float_exception_flags
|= float_flag_invalid
;
1384 return parts_default_nan(s
);
1386 a
.cls
= float_class_inf
;
1387 a
.sign
= c
.sign
^ sign_flip
;
1392 if (p_class
== float_class_inf
) {
1393 a
.cls
= float_class_inf
;
1394 a
.sign
= p_sign
^ sign_flip
;
1398 if (p_class
== float_class_zero
) {
1399 if (c
.cls
== float_class_zero
) {
1400 if (p_sign
!= c
.sign
) {
1401 p_sign
= s
->float_rounding_mode
== float_round_down
;
1404 } else if (flags
& float_muladd_halve_result
) {
1407 c
.sign
^= sign_flip
;
1411 /* a & b should be normals now... */
1412 assert(a
.cls
== float_class_normal
&&
1413 b
.cls
== float_class_normal
);
1415 p_exp
= a
.exp
+ b
.exp
;
1417 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1419 /* Renormalize to the msb. */
1420 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1423 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1427 if (c
.cls
!= float_class_zero
) {
1428 int exp_diff
= p_exp
- c
.exp
;
1429 if (p_sign
== c
.sign
) {
1431 if (exp_diff
<= 0) {
1432 shift64RightJamming(hi
, -exp_diff
, &hi
);
1434 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1435 shift64RightJamming(hi
, 1, &hi
);
1436 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1440 uint64_t c_hi
, c_lo
, over
;
1441 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1442 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1444 shift64RightJamming(hi
, 1, &hi
);
1445 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1451 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1453 if (exp_diff
<= 0) {
1454 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1457 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1458 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1460 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1465 shift128RightJamming(c_hi
, c_lo
,
1468 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1471 if (hi
== 0 && lo
== 0) {
1472 a
.cls
= float_class_zero
;
1473 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1474 a
.sign
^= sign_flip
;
1481 shift
= clz64(lo
) + 64;
1483 /* Normalizing to a binary point of 124 is the
1484 correct adjust for the exponent. However since we're
1485 shifting, we might as well put the binary point back
1486 at 63 where we really want it. Therefore shift as
1487 if we're leaving 1 bit at the top of the word, but
1488 adjust the exponent as if we're leaving 3 bits. */
1489 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1496 if (flags
& float_muladd_halve_result
) {
1500 /* finally prepare our result */
1501 a
.cls
= float_class_normal
;
1502 a
.sign
= p_sign
^ sign_flip
;
1509 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1510 int flags
, float_status
*status
)
1512 FloatParts pa
= float16_unpack_canonical(a
, status
);
1513 FloatParts pb
= float16_unpack_canonical(b
, status
);
1514 FloatParts pc
= float16_unpack_canonical(c
, status
);
1515 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1517 return float16_round_pack_canonical(pr
, status
);
1520 static float32 QEMU_SOFTFLOAT_ATTR
1521 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1522 float_status
*status
)
1524 FloatParts pa
= float32_unpack_canonical(a
, status
);
1525 FloatParts pb
= float32_unpack_canonical(b
, status
);
1526 FloatParts pc
= float32_unpack_canonical(c
, status
);
1527 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1529 return float32_round_pack_canonical(pr
, status
);
1532 static float64 QEMU_SOFTFLOAT_ATTR
1533 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1534 float_status
*status
)
1536 FloatParts pa
= float64_unpack_canonical(a
, status
);
1537 FloatParts pb
= float64_unpack_canonical(b
, status
);
1538 FloatParts pc
= float64_unpack_canonical(c
, status
);
1539 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1541 return float64_round_pack_canonical(pr
, status
);
1544 static bool force_soft_fma
;
1546 float32 QEMU_FLATTEN
1547 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1549 union_float32 ua
, ub
, uc
, ur
;
1555 if (unlikely(!can_use_fpu(s
))) {
1558 if (unlikely(flags
& float_muladd_halve_result
)) {
1562 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1563 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1567 if (unlikely(force_soft_fma
)) {
1572 * When (a || b) == 0, there's no need to check for under/over flow,
1573 * since we know the addend is (normal || 0) and the product is 0.
1575 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1579 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1580 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1581 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1583 if (flags
& float_muladd_negate_c
) {
1588 union_float32 ua_orig
= ua
;
1589 union_float32 uc_orig
= uc
;
1591 if (flags
& float_muladd_negate_product
) {
1594 if (flags
& float_muladd_negate_c
) {
1598 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1600 if (unlikely(f32_is_inf(ur
))) {
1601 s
->float_exception_flags
|= float_flag_overflow
;
1602 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1608 if (flags
& float_muladd_negate_result
) {
1609 return float32_chs(ur
.s
);
1614 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1617 float64 QEMU_FLATTEN
1618 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1620 union_float64 ua
, ub
, uc
, ur
;
1626 if (unlikely(!can_use_fpu(s
))) {
1629 if (unlikely(flags
& float_muladd_halve_result
)) {
1633 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1634 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1638 if (unlikely(force_soft_fma
)) {
1643 * When (a || b) == 0, there's no need to check for under/over flow,
1644 * since we know the addend is (normal || 0) and the product is 0.
1646 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1650 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1651 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1652 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1654 if (flags
& float_muladd_negate_c
) {
1659 union_float64 ua_orig
= ua
;
1660 union_float64 uc_orig
= uc
;
1662 if (flags
& float_muladd_negate_product
) {
1665 if (flags
& float_muladd_negate_c
) {
1669 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1671 if (unlikely(f64_is_inf(ur
))) {
1672 s
->float_exception_flags
|= float_flag_overflow
;
1673 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1679 if (flags
& float_muladd_negate_result
) {
1680 return float64_chs(ur
.s
);
1685 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1689 * Returns the result of multiplying the bfloat16 values `a'
1690 * and `b' then adding 'c', with no intermediate rounding step after the
1694 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1695 int flags
, float_status
*status
)
1697 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1698 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1699 FloatParts pc
= bfloat16_unpack_canonical(c
, status
);
1700 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1702 return bfloat16_round_pack_canonical(pr
, status
);
1706 * Returns the result of dividing the floating-point value `a' by the
1707 * corresponding value `b'. The operation is performed according to
1708 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1711 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1713 bool sign
= a
.sign
^ b
.sign
;
1715 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1716 uint64_t n0
, n1
, q
, r
;
1717 int exp
= a
.exp
- b
.exp
;
1720 * We want a 2*N / N-bit division to produce exactly an N-bit
1721 * result, so that we do not lose any precision and so that we
1722 * do not have to renormalize afterward. If A.frac < B.frac,
1723 * then division would produce an (N-1)-bit result; shift A left
1724 * by one to produce the an N-bit result, and decrement the
1725 * exponent to match.
1727 * The udiv_qrnnd algorithm that we're using requires normalization,
1728 * i.e. the msb of the denominator must be set, which is already true.
1730 if (a
.frac
< b
.frac
) {
1732 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1734 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1736 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1738 /* Set lsb if there is a remainder, to set inexact. */
1739 a
.frac
= q
| (r
!= 0);
1744 /* handle all the NaN cases */
1745 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1746 return pick_nan(a
, b
, s
);
1748 /* 0/0 or Inf/Inf */
1751 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1752 s
->float_exception_flags
|= float_flag_invalid
;
1753 return parts_default_nan(s
);
1755 /* Inf / x or 0 / x */
1756 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1761 if (b
.cls
== float_class_zero
) {
1762 s
->float_exception_flags
|= float_flag_divbyzero
;
1763 a
.cls
= float_class_inf
;
1768 if (b
.cls
== float_class_inf
) {
1769 a
.cls
= float_class_zero
;
1773 g_assert_not_reached();
1776 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1778 FloatParts pa
= float16_unpack_canonical(a
, status
);
1779 FloatParts pb
= float16_unpack_canonical(b
, status
);
1780 FloatParts pr
= div_floats(pa
, pb
, status
);
1782 return float16_round_pack_canonical(pr
, status
);
1785 static float32 QEMU_SOFTFLOAT_ATTR
1786 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1788 FloatParts pa
= float32_unpack_canonical(a
, status
);
1789 FloatParts pb
= float32_unpack_canonical(b
, status
);
1790 FloatParts pr
= div_floats(pa
, pb
, status
);
1792 return float32_round_pack_canonical(pr
, status
);
1795 static float64 QEMU_SOFTFLOAT_ATTR
1796 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1798 FloatParts pa
= float64_unpack_canonical(a
, status
);
1799 FloatParts pb
= float64_unpack_canonical(b
, status
);
1800 FloatParts pr
= div_floats(pa
, pb
, status
);
1802 return float64_round_pack_canonical(pr
, status
);
1805 static float hard_f32_div(float a
, float b
)
1810 static double hard_f64_div(double a
, double b
)
1815 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1817 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1818 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1819 fpclassify(b
.h
) == FP_NORMAL
;
1821 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1824 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1826 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1827 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1828 fpclassify(b
.h
) == FP_NORMAL
;
1830 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1833 static bool f32_div_post(union_float32 a
, union_float32 b
)
1835 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1836 return fpclassify(a
.h
) != FP_ZERO
;
1838 return !float32_is_zero(a
.s
);
1841 static bool f64_div_post(union_float64 a
, union_float64 b
)
1843 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1844 return fpclassify(a
.h
) != FP_ZERO
;
1846 return !float64_is_zero(a
.s
);
1849 float32 QEMU_FLATTEN
1850 float32_div(float32 a
, float32 b
, float_status
*s
)
1852 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1853 f32_div_pre
, f32_div_post
);
1856 float64 QEMU_FLATTEN
1857 float64_div(float64 a
, float64 b
, float_status
*s
)
1859 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1860 f64_div_pre
, f64_div_post
);
1864 * Returns the result of dividing the bfloat16
1865 * value `a' by the corresponding value `b'.
1868 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1870 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1871 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1872 FloatParts pr
= div_floats(pa
, pb
, status
);
1874 return bfloat16_round_pack_canonical(pr
, status
);
1878 * Float to Float conversions
1880 * Returns the result of converting one float format to another. The
1881 * conversion is performed according to the IEC/IEEE Standard for
1882 * Binary Floating-Point Arithmetic.
1884 * The float_to_float helper only needs to take care of raising
1885 * invalid exceptions and handling the conversion on NaNs.
1888 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1891 if (dstf
->arm_althp
) {
1893 case float_class_qnan
:
1894 case float_class_snan
:
1895 /* There is no NaN in the destination format. Raise Invalid
1896 * and return a zero with the sign of the input NaN.
1898 s
->float_exception_flags
|= float_flag_invalid
;
1899 a
.cls
= float_class_zero
;
1904 case float_class_inf
:
1905 /* There is no Inf in the destination format. Raise Invalid
1906 * and return the maximum normal with the correct sign.
1908 s
->float_exception_flags
|= float_flag_invalid
;
1909 a
.cls
= float_class_normal
;
1910 a
.exp
= dstf
->exp_max
;
1911 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1917 } else if (is_nan(a
.cls
)) {
1918 if (is_snan(a
.cls
)) {
1919 s
->float_exception_flags
|= float_flag_invalid
;
1920 a
= parts_silence_nan(a
, s
);
1922 if (s
->default_nan_mode
) {
1923 return parts_default_nan(s
);
1929 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1931 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1932 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1933 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1934 return float32_round_pack_canonical(pr
, s
);
1937 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1939 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1940 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1941 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1942 return float64_round_pack_canonical(pr
, s
);
1945 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1947 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1948 FloatParts p
= float32_unpack_canonical(a
, s
);
1949 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1950 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1953 static float64 QEMU_SOFTFLOAT_ATTR
1954 soft_float32_to_float64(float32 a
, float_status
*s
)
1956 FloatParts p
= float32_unpack_canonical(a
, s
);
1957 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1958 return float64_round_pack_canonical(pr
, s
);
1961 float64
float32_to_float64(float32 a
, float_status
*s
)
1963 if (likely(float32_is_normal(a
))) {
1964 /* Widening conversion can never produce inexact results. */
1970 } else if (float32_is_zero(a
)) {
1971 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1973 return soft_float32_to_float64(a
, s
);
1977 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1979 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1980 FloatParts p
= float64_unpack_canonical(a
, s
);
1981 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1982 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1985 float32
float64_to_float32(float64 a
, float_status
*s
)
1987 FloatParts p
= float64_unpack_canonical(a
, s
);
1988 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1989 return float32_round_pack_canonical(pr
, s
);
1992 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
1994 FloatParts p
= bfloat16_unpack_canonical(a
, s
);
1995 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1996 return float32_round_pack_canonical(pr
, s
);
1999 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2001 FloatParts p
= bfloat16_unpack_canonical(a
, s
);
2002 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
2003 return float64_round_pack_canonical(pr
, s
);
2006 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2008 FloatParts p
= float32_unpack_canonical(a
, s
);
2009 FloatParts pr
= float_to_float(p
, &bfloat16_params
, s
);
2010 return bfloat16_round_pack_canonical(pr
, s
);
2013 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2015 FloatParts p
= float64_unpack_canonical(a
, s
);
2016 FloatParts pr
= float_to_float(p
, &bfloat16_params
, s
);
2017 return bfloat16_round_pack_canonical(pr
, s
);
2021 * Rounds the floating-point value `a' to an integer, and returns the
2022 * result as a floating-point value. The operation is performed
2023 * according to the IEC/IEEE Standard for Binary Floating-Point
2027 static FloatParts
round_to_int(FloatParts a
, FloatRoundMode rmode
,
2028 int scale
, float_status
*s
)
2031 case float_class_qnan
:
2032 case float_class_snan
:
2033 return return_nan(a
, s
);
2035 case float_class_zero
:
2036 case float_class_inf
:
2037 /* already "integral" */
2040 case float_class_normal
:
2041 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2044 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2045 /* already integral */
2050 /* all fractional */
2051 s
->float_exception_flags
|= float_flag_inexact
;
2053 case float_round_nearest_even
:
2054 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2056 case float_round_ties_away
:
2057 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2059 case float_round_to_zero
:
2062 case float_round_up
:
2065 case float_round_down
:
2068 case float_round_to_odd
:
2072 g_assert_not_reached();
2076 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2079 a
.cls
= float_class_zero
;
2082 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2083 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2084 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2085 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2089 case float_round_nearest_even
:
2090 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2092 case float_round_ties_away
:
2095 case float_round_to_zero
:
2098 case float_round_up
:
2099 inc
= a
.sign
? 0 : rnd_mask
;
2101 case float_round_down
:
2102 inc
= a
.sign
? rnd_mask
: 0;
2104 case float_round_to_odd
:
2105 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2108 g_assert_not_reached();
2111 if (a
.frac
& rnd_mask
) {
2112 s
->float_exception_flags
|= float_flag_inexact
;
2113 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2115 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2118 a
.frac
&= ~rnd_mask
;
2123 g_assert_not_reached();
2128 float16
float16_round_to_int(float16 a
, float_status
*s
)
2130 FloatParts pa
= float16_unpack_canonical(a
, s
);
2131 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2132 return float16_round_pack_canonical(pr
, s
);
2135 float32
float32_round_to_int(float32 a
, float_status
*s
)
2137 FloatParts pa
= float32_unpack_canonical(a
, s
);
2138 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2139 return float32_round_pack_canonical(pr
, s
);
2142 float64
float64_round_to_int(float64 a
, float_status
*s
)
2144 FloatParts pa
= float64_unpack_canonical(a
, s
);
2145 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2146 return float64_round_pack_canonical(pr
, s
);
2150 * Rounds the bfloat16 value `a' to an integer, and returns the
2151 * result as a bfloat16 value.
2154 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2156 FloatParts pa
= bfloat16_unpack_canonical(a
, s
);
2157 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2158 return bfloat16_round_pack_canonical(pr
, s
);
2162 * Returns the result of converting the floating-point value `a' to
2163 * the two's complement integer format. The conversion is performed
2164 * according to the IEC/IEEE Standard for Binary Floating-Point
2165 * Arithmetic---which means in particular that the conversion is
2166 * rounded according to the current rounding mode. If `a' is a NaN,
2167 * the largest positive integer is returned. Otherwise, if the
2168 * conversion overflows, the largest integer with the same sign as `a'
2172 static int64_t round_to_int_and_pack(FloatParts in
, FloatRoundMode rmode
,
2173 int scale
, int64_t min
, int64_t max
,
2177 int orig_flags
= get_float_exception_flags(s
);
2178 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2181 case float_class_snan
:
2182 case float_class_qnan
:
2183 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2185 case float_class_inf
:
2186 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2187 return p
.sign
? min
: max
;
2188 case float_class_zero
:
2190 case float_class_normal
:
2191 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2192 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2197 if (r
<= -(uint64_t) min
) {
2200 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2207 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2212 g_assert_not_reached();
2216 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2219 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2220 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2223 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2226 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2227 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2230 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2233 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2234 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2237 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2240 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2241 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2244 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2247 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2248 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2251 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2254 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2255 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2258 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2261 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2262 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2265 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2268 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2269 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2272 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2275 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2276 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2279 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2282 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2283 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2286 int8_t float16_to_int8(float16 a
, float_status
*s
)
2288 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2291 int16_t float16_to_int16(float16 a
, float_status
*s
)
2293 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2296 int32_t float16_to_int32(float16 a
, float_status
*s
)
2298 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2301 int64_t float16_to_int64(float16 a
, float_status
*s
)
2303 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2306 int16_t float32_to_int16(float32 a
, float_status
*s
)
2308 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2311 int32_t float32_to_int32(float32 a
, float_status
*s
)
2313 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2316 int64_t float32_to_int64(float32 a
, float_status
*s
)
2318 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2321 int16_t float64_to_int16(float64 a
, float_status
*s
)
2323 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2326 int32_t float64_to_int32(float64 a
, float_status
*s
)
2328 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2331 int64_t float64_to_int64(float64 a
, float_status
*s
)
2333 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2336 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2338 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2341 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2343 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2346 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2348 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2351 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2353 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2356 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2358 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2361 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2363 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2366 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2368 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2371 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2373 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2376 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2378 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2382 * Returns the result of converting the floating-point value `a' to
2383 * the two's complement integer format.
2386 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2389 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2390 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2393 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2396 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2397 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2400 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2403 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2404 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2407 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2409 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2412 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2414 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2417 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2419 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2422 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2424 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2427 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2429 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2432 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2434 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2438 * Returns the result of converting the floating-point value `a' to
2439 * the unsigned integer format. The conversion is performed according
2440 * to the IEC/IEEE Standard for Binary Floating-Point
2441 * Arithmetic---which means in particular that the conversion is
2442 * rounded according to the current rounding mode. If `a' is a NaN,
2443 * the largest unsigned integer is returned. Otherwise, if the
2444 * conversion overflows, the largest unsigned integer is returned. If
2445 * the 'a' is negative, the result is rounded and zero is returned;
2446 * values that do not round to zero will raise the inexact exception
2450 static uint64_t round_to_uint_and_pack(FloatParts in
, FloatRoundMode rmode
,
2451 int scale
, uint64_t max
,
2454 int orig_flags
= get_float_exception_flags(s
);
2455 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2459 case float_class_snan
:
2460 case float_class_qnan
:
2461 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2463 case float_class_inf
:
2464 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2465 return p
.sign
? 0 : max
;
2466 case float_class_zero
:
2468 case float_class_normal
:
2470 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2474 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2475 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2477 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2481 /* For uint64 this will never trip, but if p.exp is too large
2482 * to shift a decomposed fraction we shall have exited via the
2486 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2491 g_assert_not_reached();
2495 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2498 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2499 rmode
, scale
, UINT8_MAX
, s
);
2502 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2505 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2506 rmode
, scale
, UINT16_MAX
, s
);
2509 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2512 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2513 rmode
, scale
, UINT32_MAX
, s
);
2516 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2519 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2520 rmode
, scale
, UINT64_MAX
, s
);
2523 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2526 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2527 rmode
, scale
, UINT16_MAX
, s
);
2530 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2533 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2534 rmode
, scale
, UINT32_MAX
, s
);
2537 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2540 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2541 rmode
, scale
, UINT64_MAX
, s
);
2544 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2547 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2548 rmode
, scale
, UINT16_MAX
, s
);
2551 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2554 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2555 rmode
, scale
, UINT32_MAX
, s
);
2558 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2561 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2562 rmode
, scale
, UINT64_MAX
, s
);
2565 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2567 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2570 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2572 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2575 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2577 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2580 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2582 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2585 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2587 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2590 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2592 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2595 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2597 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2600 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2602 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2605 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2607 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2610 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2612 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2615 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2617 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2620 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2622 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2625 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2627 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2630 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2632 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2635 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2637 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2640 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2642 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2645 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2647 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2650 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2652 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2655 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2657 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2661 * Returns the result of converting the bfloat16 value `a' to
2662 * the unsigned integer format.
2665 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2666 int scale
, float_status
*s
)
2668 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2669 rmode
, scale
, UINT16_MAX
, s
);
2672 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2673 int scale
, float_status
*s
)
2675 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2676 rmode
, scale
, UINT32_MAX
, s
);
2679 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2680 int scale
, float_status
*s
)
2682 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2683 rmode
, scale
, UINT64_MAX
, s
);
2686 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2688 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2691 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2693 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2696 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2698 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2701 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2703 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2706 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2708 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2711 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2713 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2717 * Integer to float conversions
2719 * Returns the result of converting the two's complement integer `a'
2720 * to the floating-point format. The conversion is performed according
2721 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2724 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2726 FloatParts r
= { .sign
= false };
2729 r
.cls
= float_class_zero
;
2734 r
.cls
= float_class_normal
;
2740 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2742 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2743 r
.frac
= f
<< shift
;
2749 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2751 FloatParts pa
= int_to_float(a
, scale
, status
);
2752 return float16_round_pack_canonical(pa
, status
);
2755 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2757 return int64_to_float16_scalbn(a
, scale
, status
);
2760 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2762 return int64_to_float16_scalbn(a
, scale
, status
);
2765 float16
int64_to_float16(int64_t a
, float_status
*status
)
2767 return int64_to_float16_scalbn(a
, 0, status
);
2770 float16
int32_to_float16(int32_t a
, float_status
*status
)
2772 return int64_to_float16_scalbn(a
, 0, status
);
2775 float16
int16_to_float16(int16_t a
, float_status
*status
)
2777 return int64_to_float16_scalbn(a
, 0, status
);
2780 float16
int8_to_float16(int8_t a
, float_status
*status
)
2782 return int64_to_float16_scalbn(a
, 0, status
);
2785 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2787 FloatParts pa
= int_to_float(a
, scale
, status
);
2788 return float32_round_pack_canonical(pa
, status
);
2791 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2793 return int64_to_float32_scalbn(a
, scale
, status
);
2796 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2798 return int64_to_float32_scalbn(a
, scale
, status
);
2801 float32
int64_to_float32(int64_t a
, float_status
*status
)
2803 return int64_to_float32_scalbn(a
, 0, status
);
2806 float32
int32_to_float32(int32_t a
, float_status
*status
)
2808 return int64_to_float32_scalbn(a
, 0, status
);
2811 float32
int16_to_float32(int16_t a
, float_status
*status
)
2813 return int64_to_float32_scalbn(a
, 0, status
);
2816 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2818 FloatParts pa
= int_to_float(a
, scale
, status
);
2819 return float64_round_pack_canonical(pa
, status
);
2822 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2824 return int64_to_float64_scalbn(a
, scale
, status
);
2827 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2829 return int64_to_float64_scalbn(a
, scale
, status
);
2832 float64
int64_to_float64(int64_t a
, float_status
*status
)
2834 return int64_to_float64_scalbn(a
, 0, status
);
2837 float64
int32_to_float64(int32_t a
, float_status
*status
)
2839 return int64_to_float64_scalbn(a
, 0, status
);
2842 float64
int16_to_float64(int16_t a
, float_status
*status
)
2844 return int64_to_float64_scalbn(a
, 0, status
);
2848 * Returns the result of converting the two's complement integer `a'
2849 * to the bfloat16 format.
2852 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
2854 FloatParts pa
= int_to_float(a
, scale
, status
);
2855 return bfloat16_round_pack_canonical(pa
, status
);
2858 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
2860 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2863 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
2865 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2868 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
2870 return int64_to_bfloat16_scalbn(a
, 0, status
);
2873 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
2875 return int64_to_bfloat16_scalbn(a
, 0, status
);
2878 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
2880 return int64_to_bfloat16_scalbn(a
, 0, status
);
2884 * Unsigned Integer to float conversions
2886 * Returns the result of converting the unsigned integer `a' to the
2887 * floating-point format. The conversion is performed according to the
2888 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2891 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2893 FloatParts r
= { .sign
= false };
2897 r
.cls
= float_class_zero
;
2899 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2901 r
.cls
= float_class_normal
;
2902 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2903 r
.frac
= a
<< shift
;
2909 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2911 FloatParts pa
= uint_to_float(a
, scale
, status
);
2912 return float16_round_pack_canonical(pa
, status
);
2915 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2917 return uint64_to_float16_scalbn(a
, scale
, status
);
2920 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2922 return uint64_to_float16_scalbn(a
, scale
, status
);
2925 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2927 return uint64_to_float16_scalbn(a
, 0, status
);
2930 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2932 return uint64_to_float16_scalbn(a
, 0, status
);
2935 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2937 return uint64_to_float16_scalbn(a
, 0, status
);
2940 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
2942 return uint64_to_float16_scalbn(a
, 0, status
);
2945 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2947 FloatParts pa
= uint_to_float(a
, scale
, status
);
2948 return float32_round_pack_canonical(pa
, status
);
2951 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2953 return uint64_to_float32_scalbn(a
, scale
, status
);
2956 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2958 return uint64_to_float32_scalbn(a
, scale
, status
);
2961 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2963 return uint64_to_float32_scalbn(a
, 0, status
);
2966 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2968 return uint64_to_float32_scalbn(a
, 0, status
);
2971 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2973 return uint64_to_float32_scalbn(a
, 0, status
);
2976 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2978 FloatParts pa
= uint_to_float(a
, scale
, status
);
2979 return float64_round_pack_canonical(pa
, status
);
2982 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2984 return uint64_to_float64_scalbn(a
, scale
, status
);
2987 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2989 return uint64_to_float64_scalbn(a
, scale
, status
);
2992 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2994 return uint64_to_float64_scalbn(a
, 0, status
);
2997 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2999 return uint64_to_float64_scalbn(a
, 0, status
);
3002 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3004 return uint64_to_float64_scalbn(a
, 0, status
);
3008 * Returns the result of converting the unsigned integer `a' to the
3012 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3014 FloatParts pa
= uint_to_float(a
, scale
, status
);
3015 return bfloat16_round_pack_canonical(pa
, status
);
3018 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3020 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3023 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3025 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3028 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3030 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3033 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3035 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3038 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3040 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3044 /* min() and max() functions. These can't be implemented as
3045 * 'compare and pick one input' because that would mishandle
3046 * NaNs and +0 vs -0.
3048 * minnum() and maxnum() functions. These are similar to the min()
3049 * and max() functions but if one of the arguments is a QNaN and
3050 * the other is numerical then the numerical argument is returned.
3051 * SNaNs will get quietened before being returned.
3052 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3053 * and maxNum() operations. min() and max() are the typical min/max
3054 * semantics provided by many CPUs which predate that specification.
3056 * minnummag() and maxnummag() functions correspond to minNumMag()
3057 * and minNumMag() from the IEEE-754 2008.
3059 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
3060 bool ieee
, bool ismag
, float_status
*s
)
3062 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3064 /* Takes two floating-point values `a' and `b', one of
3065 * which is a NaN, and returns the appropriate NaN
3066 * result. If either `a' or `b' is a signaling NaN,
3067 * the invalid exception is raised.
3069 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3070 return pick_nan(a
, b
, s
);
3071 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3073 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3077 return pick_nan(a
, b
, s
);
3082 case float_class_normal
:
3085 case float_class_inf
:
3088 case float_class_zero
:
3092 g_assert_not_reached();
3096 case float_class_normal
:
3099 case float_class_inf
:
3102 case float_class_zero
:
3106 g_assert_not_reached();
3110 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3111 bool a_less
= a_exp
< b_exp
;
3112 if (a_exp
== b_exp
) {
3113 a_less
= a
.frac
< b
.frac
;
3115 return a_less
^ ismin
? b
: a
;
3118 if (a
.sign
== b
.sign
) {
3119 bool a_less
= a_exp
< b_exp
;
3120 if (a_exp
== b_exp
) {
3121 a_less
= a
.frac
< b
.frac
;
3123 return a
.sign
^ a_less
^ ismin
? b
: a
;
3125 return a
.sign
^ ismin
? b
: a
;
3130 #define MINMAX(sz, name, ismin, isiee, ismag) \
3131 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3134 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3135 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3136 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3138 return float ## sz ## _round_pack_canonical(pr, s); \
3141 MINMAX(16, min
, true, false, false)
3142 MINMAX(16, minnum
, true, true, false)
3143 MINMAX(16, minnummag
, true, true, true)
3144 MINMAX(16, max
, false, false, false)
3145 MINMAX(16, maxnum
, false, true, false)
3146 MINMAX(16, maxnummag
, false, true, true)
3148 MINMAX(32, min
, true, false, false)
3149 MINMAX(32, minnum
, true, true, false)
3150 MINMAX(32, minnummag
, true, true, true)
3151 MINMAX(32, max
, false, false, false)
3152 MINMAX(32, maxnum
, false, true, false)
3153 MINMAX(32, maxnummag
, false, true, true)
3155 MINMAX(64, min
, true, false, false)
3156 MINMAX(64, minnum
, true, true, false)
3157 MINMAX(64, minnummag
, true, true, true)
3158 MINMAX(64, max
, false, false, false)
3159 MINMAX(64, maxnum
, false, true, false)
3160 MINMAX(64, maxnummag
, false, true, true)
3164 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3165 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3167 FloatParts pa = bfloat16_unpack_canonical(a, s); \
3168 FloatParts pb = bfloat16_unpack_canonical(b, s); \
3169 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3171 return bfloat16_round_pack_canonical(pr, s); \
3174 BF16_MINMAX(min
, true, false, false)
3175 BF16_MINMAX(minnum
, true, true, false)
3176 BF16_MINMAX(minnummag
, true, true, true)
3177 BF16_MINMAX(max
, false, false, false)
3178 BF16_MINMAX(maxnum
, false, true, false)
3179 BF16_MINMAX(maxnummag
, false, true, true)
3183 /* Floating point compare */
3184 static FloatRelation
compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
3187 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3189 a
.cls
== float_class_snan
||
3190 b
.cls
== float_class_snan
) {
3191 s
->float_exception_flags
|= float_flag_invalid
;
3193 return float_relation_unordered
;
3196 if (a
.cls
== float_class_zero
) {
3197 if (b
.cls
== float_class_zero
) {
3198 return float_relation_equal
;
3200 return b
.sign
? float_relation_greater
: float_relation_less
;
3201 } else if (b
.cls
== float_class_zero
) {
3202 return a
.sign
? float_relation_less
: float_relation_greater
;
3205 /* The only really important thing about infinity is its sign. If
3206 * both are infinities the sign marks the smallest of the two.
3208 if (a
.cls
== float_class_inf
) {
3209 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3210 return float_relation_equal
;
3212 return a
.sign
? float_relation_less
: float_relation_greater
;
3213 } else if (b
.cls
== float_class_inf
) {
3214 return b
.sign
? float_relation_greater
: float_relation_less
;
3217 if (a
.sign
!= b
.sign
) {
3218 return a
.sign
? float_relation_less
: float_relation_greater
;
3221 if (a
.exp
== b
.exp
) {
3222 if (a
.frac
== b
.frac
) {
3223 return float_relation_equal
;
3226 return a
.frac
> b
.frac
?
3227 float_relation_less
: float_relation_greater
;
3229 return a
.frac
> b
.frac
?
3230 float_relation_greater
: float_relation_less
;
3234 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3236 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3241 #define COMPARE(name, attr, sz) \
3243 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3245 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3246 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3247 return compare_floats(pa, pb, is_quiet, s); \
3250 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3251 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3252 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3256 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3258 return soft_f16_compare(a
, b
, false, s
);
3261 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3263 return soft_f16_compare(a
, b
, true, s
);
3266 static FloatRelation QEMU_FLATTEN
3267 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3269 union_float32 ua
, ub
;
3274 if (QEMU_NO_HARDFLOAT
) {
3278 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3279 if (isgreaterequal(ua
.h
, ub
.h
)) {
3280 if (isgreater(ua
.h
, ub
.h
)) {
3281 return float_relation_greater
;
3283 return float_relation_equal
;
3285 if (likely(isless(ua
.h
, ub
.h
))) {
3286 return float_relation_less
;
3288 /* The only condition remaining is unordered.
3289 * Fall through to set flags.
3292 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3295 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3297 return f32_compare(a
, b
, false, s
);
3300 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3302 return f32_compare(a
, b
, true, s
);
3305 static FloatRelation QEMU_FLATTEN
3306 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3308 union_float64 ua
, ub
;
3313 if (QEMU_NO_HARDFLOAT
) {
3317 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3318 if (isgreaterequal(ua
.h
, ub
.h
)) {
3319 if (isgreater(ua
.h
, ub
.h
)) {
3320 return float_relation_greater
;
3322 return float_relation_equal
;
3324 if (likely(isless(ua
.h
, ub
.h
))) {
3325 return float_relation_less
;
3327 /* The only condition remaining is unordered.
3328 * Fall through to set flags.
3331 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3334 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3336 return f64_compare(a
, b
, false, s
);
3339 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3341 return f64_compare(a
, b
, true, s
);
3344 static FloatRelation QEMU_FLATTEN
3345 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3347 FloatParts pa
= bfloat16_unpack_canonical(a
, s
);
3348 FloatParts pb
= bfloat16_unpack_canonical(b
, s
);
3349 return compare_floats(pa
, pb
, is_quiet
, s
);
3352 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3354 return soft_bf16_compare(a
, b
, false, s
);
3357 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3359 return soft_bf16_compare(a
, b
, true, s
);
3362 /* Multiply A by 2 raised to the power N. */
3363 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3365 if (unlikely(is_nan(a
.cls
))) {
3366 return return_nan(a
, s
);
3368 if (a
.cls
== float_class_normal
) {
3369 /* The largest float type (even though not supported by FloatParts)
3370 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3371 * still allows rounding to infinity, without allowing overflow
3372 * within the int32_t that backs FloatParts.exp.
3374 n
= MIN(MAX(n
, -0x10000), 0x10000);
3380 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3382 FloatParts pa
= float16_unpack_canonical(a
, status
);
3383 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3384 return float16_round_pack_canonical(pr
, status
);
3387 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3389 FloatParts pa
= float32_unpack_canonical(a
, status
);
3390 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3391 return float32_round_pack_canonical(pr
, status
);
3394 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3396 FloatParts pa
= float64_unpack_canonical(a
, status
);
3397 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3398 return float64_round_pack_canonical(pr
, status
);
3401 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3403 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
3404 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3405 return bfloat16_round_pack_canonical(pr
, status
);
3411 * The old softfloat code did an approximation step before zeroing in
3412 * on the final result. However for simpleness we just compute the
3413 * square root by iterating down from the implicit bit to enough extra
3414 * bits to ensure we get a correctly rounded result.
3416 * This does mean however the calculation is slower than before,
3417 * especially for 64 bit floats.
3420 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3422 uint64_t a_frac
, r_frac
, s_frac
;
3425 if (is_nan(a
.cls
)) {
3426 return return_nan(a
, s
);
3428 if (a
.cls
== float_class_zero
) {
3429 return a
; /* sqrt(+-0) = +-0 */
3432 s
->float_exception_flags
|= float_flag_invalid
;
3433 return parts_default_nan(s
);
3435 if (a
.cls
== float_class_inf
) {
3436 return a
; /* sqrt(+inf) = +inf */
3439 assert(a
.cls
== float_class_normal
);
3441 /* We need two overflow bits at the top. Adding room for that is a
3442 * right shift. If the exponent is odd, we can discard the low bit
3443 * by multiplying the fraction by 2; that's a left shift. Combine
3444 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3446 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3449 /* Bit-by-bit computation of sqrt. */
3453 /* Iterate from implicit bit down to the 3 extra bits to compute a
3454 * properly rounded result. Remember we've inserted two more bits
3455 * at the top, so these positions are two less.
3457 bit
= DECOMPOSED_BINARY_POINT
- 2;
3458 last_bit
= MAX(p
->frac_shift
- 4, 0);
3460 uint64_t q
= 1ULL << bit
;
3461 uint64_t t_frac
= s_frac
+ q
;
3462 if (t_frac
<= a_frac
) {
3463 s_frac
= t_frac
+ q
;
3468 } while (--bit
>= last_bit
);
3470 /* Undo the right shift done above. If there is any remaining
3471 * fraction, the result is inexact. Set the sticky bit.
3473 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3478 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3480 FloatParts pa
= float16_unpack_canonical(a
, status
);
3481 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3482 return float16_round_pack_canonical(pr
, status
);
3485 static float32 QEMU_SOFTFLOAT_ATTR
3486 soft_f32_sqrt(float32 a
, float_status
*status
)
3488 FloatParts pa
= float32_unpack_canonical(a
, status
);
3489 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3490 return float32_round_pack_canonical(pr
, status
);
3493 static float64 QEMU_SOFTFLOAT_ATTR
3494 soft_f64_sqrt(float64 a
, float_status
*status
)
3496 FloatParts pa
= float64_unpack_canonical(a
, status
);
3497 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3498 return float64_round_pack_canonical(pr
, status
);
3501 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3503 union_float32 ua
, ur
;
3506 if (unlikely(!can_use_fpu(s
))) {
3510 float32_input_flush1(&ua
.s
, s
);
3511 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3512 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3513 fpclassify(ua
.h
) == FP_ZERO
) ||
3517 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3518 float32_is_neg(ua
.s
))) {
3525 return soft_f32_sqrt(ua
.s
, s
);
3528 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3530 union_float64 ua
, ur
;
3533 if (unlikely(!can_use_fpu(s
))) {
3537 float64_input_flush1(&ua
.s
, s
);
3538 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3539 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3540 fpclassify(ua
.h
) == FP_ZERO
) ||
3544 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3545 float64_is_neg(ua
.s
))) {
3552 return soft_f64_sqrt(ua
.s
, s
);
3555 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3557 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
3558 FloatParts pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3559 return bfloat16_round_pack_canonical(pr
, status
);
3562 /*----------------------------------------------------------------------------
3563 | The pattern for a default generated NaN.
3564 *----------------------------------------------------------------------------*/
3566 float16
float16_default_nan(float_status
*status
)
3568 FloatParts p
= parts_default_nan(status
);
3569 p
.frac
>>= float16_params
.frac_shift
;
3570 return float16_pack_raw(p
);
3573 float32
float32_default_nan(float_status
*status
)
3575 FloatParts p
= parts_default_nan(status
);
3576 p
.frac
>>= float32_params
.frac_shift
;
3577 return float32_pack_raw(p
);
3580 float64
float64_default_nan(float_status
*status
)
3582 FloatParts p
= parts_default_nan(status
);
3583 p
.frac
>>= float64_params
.frac_shift
;
3584 return float64_pack_raw(p
);
3587 float128
float128_default_nan(float_status
*status
)
3589 FloatParts p
= parts_default_nan(status
);
3592 /* Extrapolate from the choices made by parts_default_nan to fill
3593 * in the quad-floating format. If the low bit is set, assume we
3594 * want to set all non-snan bits.
3596 r
.low
= -(p
.frac
& 1);
3597 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3598 r
.high
|= UINT64_C(0x7FFF000000000000);
3599 r
.high
|= (uint64_t)p
.sign
<< 63;
3604 bfloat16
bfloat16_default_nan(float_status
*status
)
3606 FloatParts p
= parts_default_nan(status
);
3607 p
.frac
>>= bfloat16_params
.frac_shift
;
3608 return bfloat16_pack_raw(p
);
3611 /*----------------------------------------------------------------------------
3612 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3613 *----------------------------------------------------------------------------*/
3615 float16
float16_silence_nan(float16 a
, float_status
*status
)
3617 FloatParts p
= float16_unpack_raw(a
);
3618 p
.frac
<<= float16_params
.frac_shift
;
3619 p
= parts_silence_nan(p
, status
);
3620 p
.frac
>>= float16_params
.frac_shift
;
3621 return float16_pack_raw(p
);
3624 float32
float32_silence_nan(float32 a
, float_status
*status
)
3626 FloatParts p
= float32_unpack_raw(a
);
3627 p
.frac
<<= float32_params
.frac_shift
;
3628 p
= parts_silence_nan(p
, status
);
3629 p
.frac
>>= float32_params
.frac_shift
;
3630 return float32_pack_raw(p
);
3633 float64
float64_silence_nan(float64 a
, float_status
*status
)
3635 FloatParts p
= float64_unpack_raw(a
);
3636 p
.frac
<<= float64_params
.frac_shift
;
3637 p
= parts_silence_nan(p
, status
);
3638 p
.frac
>>= float64_params
.frac_shift
;
3639 return float64_pack_raw(p
);
3642 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3644 FloatParts p
= bfloat16_unpack_raw(a
);
3645 p
.frac
<<= bfloat16_params
.frac_shift
;
3646 p
= parts_silence_nan(p
, status
);
3647 p
.frac
>>= bfloat16_params
.frac_shift
;
3648 return bfloat16_pack_raw(p
);
3651 /*----------------------------------------------------------------------------
3652 | If `a' is denormal and we are in flush-to-zero mode then set the
3653 | input-denormal exception and return zero. Otherwise just return the value.
3654 *----------------------------------------------------------------------------*/
3656 static bool parts_squash_denormal(FloatParts p
, float_status
*status
)
3658 if (p
.exp
== 0 && p
.frac
!= 0) {
3659 float_raise(float_flag_input_denormal
, status
);
3666 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3668 if (status
->flush_inputs_to_zero
) {
3669 FloatParts p
= float16_unpack_raw(a
);
3670 if (parts_squash_denormal(p
, status
)) {
3671 return float16_set_sign(float16_zero
, p
.sign
);
3677 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3679 if (status
->flush_inputs_to_zero
) {
3680 FloatParts p
= float32_unpack_raw(a
);
3681 if (parts_squash_denormal(p
, status
)) {
3682 return float32_set_sign(float32_zero
, p
.sign
);
3688 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3690 if (status
->flush_inputs_to_zero
) {
3691 FloatParts p
= float64_unpack_raw(a
);
3692 if (parts_squash_denormal(p
, status
)) {
3693 return float64_set_sign(float64_zero
, p
.sign
);
3699 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3701 if (status
->flush_inputs_to_zero
) {
3702 FloatParts p
= bfloat16_unpack_raw(a
);
3703 if (parts_squash_denormal(p
, status
)) {
3704 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3710 /*----------------------------------------------------------------------------
3711 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3712 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3713 | input. If `zSign' is 1, the input is negated before being converted to an
3714 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3715 | is simply rounded to an integer, with the inexact exception raised if the
3716 | input cannot be represented exactly as an integer. However, if the fixed-
3717 | point input is too large, the invalid exception is raised and the largest
3718 | positive or negative integer is returned.
3719 *----------------------------------------------------------------------------*/
3721 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3722 float_status
*status
)
3724 int8_t roundingMode
;
3725 bool roundNearestEven
;
3726 int8_t roundIncrement
, roundBits
;
3729 roundingMode
= status
->float_rounding_mode
;
3730 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3731 switch (roundingMode
) {
3732 case float_round_nearest_even
:
3733 case float_round_ties_away
:
3734 roundIncrement
= 0x40;
3736 case float_round_to_zero
:
3739 case float_round_up
:
3740 roundIncrement
= zSign
? 0 : 0x7f;
3742 case float_round_down
:
3743 roundIncrement
= zSign
? 0x7f : 0;
3745 case float_round_to_odd
:
3746 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3751 roundBits
= absZ
& 0x7F;
3752 absZ
= ( absZ
+ roundIncrement
)>>7;
3753 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3757 if ( zSign
) z
= - z
;
3758 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3759 float_raise(float_flag_invalid
, status
);
3760 return zSign
? INT32_MIN
: INT32_MAX
;
3763 status
->float_exception_flags
|= float_flag_inexact
;
3769 /*----------------------------------------------------------------------------
3770 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3771 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3772 | and returns the properly rounded 64-bit integer corresponding to the input.
3773 | If `zSign' is 1, the input is negated before being converted to an integer.
3774 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3775 | the inexact exception raised if the input cannot be represented exactly as
3776 | an integer. However, if the fixed-point input is too large, the invalid
3777 | exception is raised and the largest positive or negative integer is
3779 *----------------------------------------------------------------------------*/
3781 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3782 float_status
*status
)
3784 int8_t roundingMode
;
3785 bool roundNearestEven
, increment
;
3788 roundingMode
= status
->float_rounding_mode
;
3789 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3790 switch (roundingMode
) {
3791 case float_round_nearest_even
:
3792 case float_round_ties_away
:
3793 increment
= ((int64_t) absZ1
< 0);
3795 case float_round_to_zero
:
3798 case float_round_up
:
3799 increment
= !zSign
&& absZ1
;
3801 case float_round_down
:
3802 increment
= zSign
&& absZ1
;
3804 case float_round_to_odd
:
3805 increment
= !(absZ0
& 1) && absZ1
;
3812 if ( absZ0
== 0 ) goto overflow
;
3813 if (!(absZ1
<< 1) && roundNearestEven
) {
3818 if ( zSign
) z
= - z
;
3819 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3821 float_raise(float_flag_invalid
, status
);
3822 return zSign
? INT64_MIN
: INT64_MAX
;
3825 status
->float_exception_flags
|= float_flag_inexact
;
3831 /*----------------------------------------------------------------------------
3832 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3833 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3834 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3835 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3836 | with the inexact exception raised if the input cannot be represented exactly
3837 | as an integer. However, if the fixed-point input is too large, the invalid
3838 | exception is raised and the largest unsigned integer is returned.
3839 *----------------------------------------------------------------------------*/
3841 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3842 uint64_t absZ1
, float_status
*status
)
3844 int8_t roundingMode
;
3845 bool roundNearestEven
, increment
;
3847 roundingMode
= status
->float_rounding_mode
;
3848 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3849 switch (roundingMode
) {
3850 case float_round_nearest_even
:
3851 case float_round_ties_away
:
3852 increment
= ((int64_t)absZ1
< 0);
3854 case float_round_to_zero
:
3857 case float_round_up
:
3858 increment
= !zSign
&& absZ1
;
3860 case float_round_down
:
3861 increment
= zSign
&& absZ1
;
3863 case float_round_to_odd
:
3864 increment
= !(absZ0
& 1) && absZ1
;
3872 float_raise(float_flag_invalid
, status
);
3875 if (!(absZ1
<< 1) && roundNearestEven
) {
3880 if (zSign
&& absZ0
) {
3881 float_raise(float_flag_invalid
, status
);
3886 status
->float_exception_flags
|= float_flag_inexact
;
3891 /*----------------------------------------------------------------------------
3892 | Normalizes the subnormal single-precision floating-point value represented
3893 | by the denormalized significand `aSig'. The normalized exponent and
3894 | significand are stored at the locations pointed to by `zExpPtr' and
3895 | `zSigPtr', respectively.
3896 *----------------------------------------------------------------------------*/
3899 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3903 shiftCount
= clz32(aSig
) - 8;
3904 *zSigPtr
= aSig
<<shiftCount
;
3905 *zExpPtr
= 1 - shiftCount
;
3909 /*----------------------------------------------------------------------------
3910 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3911 | and significand `zSig', and returns the proper single-precision floating-
3912 | point value corresponding to the abstract input. Ordinarily, the abstract
3913 | value is simply rounded and packed into the single-precision format, with
3914 | the inexact exception raised if the abstract input cannot be represented
3915 | exactly. However, if the abstract value is too large, the overflow and
3916 | inexact exceptions are raised and an infinity or maximal finite value is
3917 | returned. If the abstract value is too small, the input value is rounded to
3918 | a subnormal number, and the underflow and inexact exceptions are raised if
3919 | the abstract input cannot be represented exactly as a subnormal single-
3920 | precision floating-point number.
3921 | The input significand `zSig' has its binary point between bits 30
3922 | and 29, which is 7 bits to the left of the usual location. This shifted
3923 | significand must be normalized or smaller. If `zSig' is not normalized,
3924 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3925 | and it must not require rounding. In the usual case that `zSig' is
3926 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3927 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3928 | Binary Floating-Point Arithmetic.
3929 *----------------------------------------------------------------------------*/
3931 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3932 float_status
*status
)
3934 int8_t roundingMode
;
3935 bool roundNearestEven
;
3936 int8_t roundIncrement
, roundBits
;
3939 roundingMode
= status
->float_rounding_mode
;
3940 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3941 switch (roundingMode
) {
3942 case float_round_nearest_even
:
3943 case float_round_ties_away
:
3944 roundIncrement
= 0x40;
3946 case float_round_to_zero
:
3949 case float_round_up
:
3950 roundIncrement
= zSign
? 0 : 0x7f;
3952 case float_round_down
:
3953 roundIncrement
= zSign
? 0x7f : 0;
3955 case float_round_to_odd
:
3956 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3962 roundBits
= zSig
& 0x7F;
3963 if ( 0xFD <= (uint16_t) zExp
) {
3964 if ( ( 0xFD < zExp
)
3965 || ( ( zExp
== 0xFD )
3966 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3968 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3969 roundIncrement
!= 0;
3970 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3971 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3974 if (status
->flush_to_zero
) {
3975 float_raise(float_flag_output_denormal
, status
);
3976 return packFloat32(zSign
, 0, 0);
3978 isTiny
= status
->tininess_before_rounding
3980 || (zSig
+ roundIncrement
< 0x80000000);
3981 shift32RightJamming( zSig
, - zExp
, &zSig
);
3983 roundBits
= zSig
& 0x7F;
3984 if (isTiny
&& roundBits
) {
3985 float_raise(float_flag_underflow
, status
);
3987 if (roundingMode
== float_round_to_odd
) {
3989 * For round-to-odd case, the roundIncrement depends on
3990 * zSig which just changed.
3992 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3997 status
->float_exception_flags
|= float_flag_inexact
;
3999 zSig
= ( zSig
+ roundIncrement
)>>7;
4000 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4003 if ( zSig
== 0 ) zExp
= 0;
4004 return packFloat32( zSign
, zExp
, zSig
);
4008 /*----------------------------------------------------------------------------
4009 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4010 | and significand `zSig', and returns the proper single-precision floating-
4011 | point value corresponding to the abstract input. This routine is just like
4012 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4013 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4014 | floating-point exponent.
4015 *----------------------------------------------------------------------------*/
4018 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4019 float_status
*status
)
4023 shiftCount
= clz32(zSig
) - 1;
4024 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4029 /*----------------------------------------------------------------------------
4030 | Normalizes the subnormal double-precision floating-point value represented
4031 | by the denormalized significand `aSig'. The normalized exponent and
4032 | significand are stored at the locations pointed to by `zExpPtr' and
4033 | `zSigPtr', respectively.
4034 *----------------------------------------------------------------------------*/
4037 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4041 shiftCount
= clz64(aSig
) - 11;
4042 *zSigPtr
= aSig
<<shiftCount
;
4043 *zExpPtr
= 1 - shiftCount
;
4047 /*----------------------------------------------------------------------------
4048 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4049 | double-precision floating-point value, returning the result. After being
4050 | shifted into the proper positions, the three fields are simply added
4051 | together to form the result. This means that any integer portion of `zSig'
4052 | will be added into the exponent. Since a properly normalized significand
4053 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4054 | than the desired result exponent whenever `zSig' is a complete, normalized
4056 *----------------------------------------------------------------------------*/
4058 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4061 return make_float64(
4062 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4066 /*----------------------------------------------------------------------------
4067 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4068 | and significand `zSig', and returns the proper double-precision floating-
4069 | point value corresponding to the abstract input. Ordinarily, the abstract
4070 | value is simply rounded and packed into the double-precision format, with
4071 | the inexact exception raised if the abstract input cannot be represented
4072 | exactly. However, if the abstract value is too large, the overflow and
4073 | inexact exceptions are raised and an infinity or maximal finite value is
4074 | returned. If the abstract value is too small, the input value is rounded to
4075 | a subnormal number, and the underflow and inexact exceptions are raised if
4076 | the abstract input cannot be represented exactly as a subnormal double-
4077 | precision floating-point number.
4078 | The input significand `zSig' has its binary point between bits 62
4079 | and 61, which is 10 bits to the left of the usual location. This shifted
4080 | significand must be normalized or smaller. If `zSig' is not normalized,
4081 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4082 | and it must not require rounding. In the usual case that `zSig' is
4083 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4084 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4085 | Binary Floating-Point Arithmetic.
4086 *----------------------------------------------------------------------------*/
4088 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4089 float_status
*status
)
4091 int8_t roundingMode
;
4092 bool roundNearestEven
;
4093 int roundIncrement
, roundBits
;
4096 roundingMode
= status
->float_rounding_mode
;
4097 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4098 switch (roundingMode
) {
4099 case float_round_nearest_even
:
4100 case float_round_ties_away
:
4101 roundIncrement
= 0x200;
4103 case float_round_to_zero
:
4106 case float_round_up
:
4107 roundIncrement
= zSign
? 0 : 0x3ff;
4109 case float_round_down
:
4110 roundIncrement
= zSign
? 0x3ff : 0;
4112 case float_round_to_odd
:
4113 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4118 roundBits
= zSig
& 0x3FF;
4119 if ( 0x7FD <= (uint16_t) zExp
) {
4120 if ( ( 0x7FD < zExp
)
4121 || ( ( zExp
== 0x7FD )
4122 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4124 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4125 roundIncrement
!= 0;
4126 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4127 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4130 if (status
->flush_to_zero
) {
4131 float_raise(float_flag_output_denormal
, status
);
4132 return packFloat64(zSign
, 0, 0);
4134 isTiny
= status
->tininess_before_rounding
4136 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4137 shift64RightJamming( zSig
, - zExp
, &zSig
);
4139 roundBits
= zSig
& 0x3FF;
4140 if (isTiny
&& roundBits
) {
4141 float_raise(float_flag_underflow
, status
);
4143 if (roundingMode
== float_round_to_odd
) {
4145 * For round-to-odd case, the roundIncrement depends on
4146 * zSig which just changed.
4148 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4153 status
->float_exception_flags
|= float_flag_inexact
;
4155 zSig
= ( zSig
+ roundIncrement
)>>10;
4156 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4159 if ( zSig
== 0 ) zExp
= 0;
4160 return packFloat64( zSign
, zExp
, zSig
);
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and significand `zSig', and returns the proper double-precision floating-
4167 | point value corresponding to the abstract input. This routine is just like
4168 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4169 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4170 | floating-point exponent.
4171 *----------------------------------------------------------------------------*/
4174 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4175 float_status
*status
)
4179 shiftCount
= clz64(zSig
) - 1;
4180 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4185 /*----------------------------------------------------------------------------
4186 | Normalizes the subnormal extended double-precision floating-point value
4187 | represented by the denormalized significand `aSig'. The normalized exponent
4188 | and significand are stored at the locations pointed to by `zExpPtr' and
4189 | `zSigPtr', respectively.
4190 *----------------------------------------------------------------------------*/
4192 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4197 shiftCount
= clz64(aSig
);
4198 *zSigPtr
= aSig
<<shiftCount
;
4199 *zExpPtr
= 1 - shiftCount
;
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4205 | and returns the proper extended double-precision floating-point value
4206 | corresponding to the abstract input. Ordinarily, the abstract value is
4207 | rounded and packed into the extended double-precision format, with the
4208 | inexact exception raised if the abstract input cannot be represented
4209 | exactly. However, if the abstract value is too large, the overflow and
4210 | inexact exceptions are raised and an infinity or maximal finite value is
4211 | returned. If the abstract value is too small, the input value is rounded to
4212 | a subnormal number, and the underflow and inexact exceptions are raised if
4213 | the abstract input cannot be represented exactly as a subnormal extended
4214 | double-precision floating-point number.
4215 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4216 | number of bits as single or double precision, respectively. Otherwise, the
4217 | result is rounded to the full precision of the extended double-precision
4219 | The input significand must be normalized or smaller. If the input
4220 | significand is not normalized, `zExp' must be 0; in that case, the result
4221 | returned is a subnormal number, and it must not require rounding. The
4222 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4223 | Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4226 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4227 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4228 float_status
*status
)
4230 int8_t roundingMode
;
4231 bool roundNearestEven
, increment
, isTiny
;
4232 int64_t roundIncrement
, roundMask
, roundBits
;
4234 roundingMode
= status
->float_rounding_mode
;
4235 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4236 if ( roundingPrecision
== 80 ) goto precision80
;
4237 if ( roundingPrecision
== 64 ) {
4238 roundIncrement
= UINT64_C(0x0000000000000400);
4239 roundMask
= UINT64_C(0x00000000000007FF);
4241 else if ( roundingPrecision
== 32 ) {
4242 roundIncrement
= UINT64_C(0x0000008000000000);
4243 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4248 zSig0
|= ( zSig1
!= 0 );
4249 switch (roundingMode
) {
4250 case float_round_nearest_even
:
4251 case float_round_ties_away
:
4253 case float_round_to_zero
:
4256 case float_round_up
:
4257 roundIncrement
= zSign
? 0 : roundMask
;
4259 case float_round_down
:
4260 roundIncrement
= zSign
? roundMask
: 0;
4265 roundBits
= zSig0
& roundMask
;
4266 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4267 if ( ( 0x7FFE < zExp
)
4268 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4273 if (status
->flush_to_zero
) {
4274 float_raise(float_flag_output_denormal
, status
);
4275 return packFloatx80(zSign
, 0, 0);
4277 isTiny
= status
->tininess_before_rounding
4279 || (zSig0
<= zSig0
+ roundIncrement
);
4280 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4282 roundBits
= zSig0
& roundMask
;
4283 if (isTiny
&& roundBits
) {
4284 float_raise(float_flag_underflow
, status
);
4287 status
->float_exception_flags
|= float_flag_inexact
;
4289 zSig0
+= roundIncrement
;
4290 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4291 roundIncrement
= roundMask
+ 1;
4292 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4293 roundMask
|= roundIncrement
;
4295 zSig0
&= ~ roundMask
;
4296 return packFloatx80( zSign
, zExp
, zSig0
);
4300 status
->float_exception_flags
|= float_flag_inexact
;
4302 zSig0
+= roundIncrement
;
4303 if ( zSig0
< roundIncrement
) {
4305 zSig0
= UINT64_C(0x8000000000000000);
4307 roundIncrement
= roundMask
+ 1;
4308 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4309 roundMask
|= roundIncrement
;
4311 zSig0
&= ~ roundMask
;
4312 if ( zSig0
== 0 ) zExp
= 0;
4313 return packFloatx80( zSign
, zExp
, zSig0
);
4315 switch (roundingMode
) {
4316 case float_round_nearest_even
:
4317 case float_round_ties_away
:
4318 increment
= ((int64_t)zSig1
< 0);
4320 case float_round_to_zero
:
4323 case float_round_up
:
4324 increment
= !zSign
&& zSig1
;
4326 case float_round_down
:
4327 increment
= zSign
&& zSig1
;
4332 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4333 if ( ( 0x7FFE < zExp
)
4334 || ( ( zExp
== 0x7FFE )
4335 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4341 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4342 if ( ( roundingMode
== float_round_to_zero
)
4343 || ( zSign
&& ( roundingMode
== float_round_up
) )
4344 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4346 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4348 return packFloatx80(zSign
,
4349 floatx80_infinity_high
,
4350 floatx80_infinity_low
);
4353 isTiny
= status
->tininess_before_rounding
4356 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4357 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4359 if (isTiny
&& zSig1
) {
4360 float_raise(float_flag_underflow
, status
);
4363 status
->float_exception_flags
|= float_flag_inexact
;
4365 switch (roundingMode
) {
4366 case float_round_nearest_even
:
4367 case float_round_ties_away
:
4368 increment
= ((int64_t)zSig1
< 0);
4370 case float_round_to_zero
:
4373 case float_round_up
:
4374 increment
= !zSign
&& zSig1
;
4376 case float_round_down
:
4377 increment
= zSign
&& zSig1
;
4384 if (!(zSig1
<< 1) && roundNearestEven
) {
4387 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4389 return packFloatx80( zSign
, zExp
, zSig0
);
4393 status
->float_exception_flags
|= float_flag_inexact
;
4399 zSig0
= UINT64_C(0x8000000000000000);
4402 if (!(zSig1
<< 1) && roundNearestEven
) {
4408 if ( zSig0
== 0 ) zExp
= 0;
4410 return packFloatx80( zSign
, zExp
, zSig0
);
4414 /*----------------------------------------------------------------------------
4415 | Takes an abstract floating-point value having sign `zSign', exponent
4416 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4417 | and returns the proper extended double-precision floating-point value
4418 | corresponding to the abstract input. This routine is just like
4419 | `roundAndPackFloatx80' except that the input significand does not have to be
4421 *----------------------------------------------------------------------------*/
4423 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4424 bool zSign
, int32_t zExp
,
4425 uint64_t zSig0
, uint64_t zSig1
,
4426 float_status
*status
)
4435 shiftCount
= clz64(zSig0
);
4436 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4438 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4439 zSig0
, zSig1
, status
);
4443 /*----------------------------------------------------------------------------
4444 | Returns the least-significant 64 fraction bits of the quadruple-precision
4445 | floating-point value `a'.
4446 *----------------------------------------------------------------------------*/
4448 static inline uint64_t extractFloat128Frac1( float128 a
)
4455 /*----------------------------------------------------------------------------
4456 | Returns the most-significant 48 fraction bits of the quadruple-precision
4457 | floating-point value `a'.
4458 *----------------------------------------------------------------------------*/
4460 static inline uint64_t extractFloat128Frac0( float128 a
)
4463 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4467 /*----------------------------------------------------------------------------
4468 | Returns the exponent bits of the quadruple-precision floating-point value
4470 *----------------------------------------------------------------------------*/
4472 static inline int32_t extractFloat128Exp( float128 a
)
4475 return ( a
.high
>>48 ) & 0x7FFF;
4479 /*----------------------------------------------------------------------------
4480 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4481 *----------------------------------------------------------------------------*/
4483 static inline bool extractFloat128Sign(float128 a
)
4485 return a
.high
>> 63;
4488 /*----------------------------------------------------------------------------
4489 | Normalizes the subnormal quadruple-precision floating-point value
4490 | represented by the denormalized significand formed by the concatenation of
4491 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4492 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4493 | significand are stored at the location pointed to by `zSig0Ptr', and the
4494 | least significant 64 bits of the normalized significand are stored at the
4495 | location pointed to by `zSig1Ptr'.
4496 *----------------------------------------------------------------------------*/
4499 normalizeFloat128Subnormal(
4510 shiftCount
= clz64(aSig1
) - 15;
4511 if ( shiftCount
< 0 ) {
4512 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4513 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4516 *zSig0Ptr
= aSig1
<<shiftCount
;
4519 *zExpPtr
= - shiftCount
- 63;
4522 shiftCount
= clz64(aSig0
) - 15;
4523 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4524 *zExpPtr
= 1 - shiftCount
;
4529 /*----------------------------------------------------------------------------
4530 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4531 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4532 | floating-point value, returning the result. After being shifted into the
4533 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4534 | added together to form the most significant 32 bits of the result. This
4535 | means that any integer portion of `zSig0' will be added into the exponent.
4536 | Since a properly normalized significand will have an integer portion equal
4537 | to 1, the `zExp' input should be 1 less than the desired result exponent
4538 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4540 *----------------------------------------------------------------------------*/
4542 static inline float128
4543 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4548 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4552 /*----------------------------------------------------------------------------
4553 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4554 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4555 | and `zSig2', and returns the proper quadruple-precision floating-point value
4556 | corresponding to the abstract input. Ordinarily, the abstract value is
4557 | simply rounded and packed into the quadruple-precision format, with the
4558 | inexact exception raised if the abstract input cannot be represented
4559 | exactly. However, if the abstract value is too large, the overflow and
4560 | inexact exceptions are raised and an infinity or maximal finite value is
4561 | returned. If the abstract value is too small, the input value is rounded to
4562 | a subnormal number, and the underflow and inexact exceptions are raised if
4563 | the abstract input cannot be represented exactly as a subnormal quadruple-
4564 | precision floating-point number.
4565 | The input significand must be normalized or smaller. If the input
4566 | significand is not normalized, `zExp' must be 0; in that case, the result
4567 | returned is a subnormal number, and it must not require rounding. In the
4568 | usual case that the input significand is normalized, `zExp' must be 1 less
4569 | than the ``true'' floating-point exponent. The handling of underflow and
4570 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4573 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4574 uint64_t zSig0
, uint64_t zSig1
,
4575 uint64_t zSig2
, float_status
*status
)
4577 int8_t roundingMode
;
4578 bool roundNearestEven
, increment
, isTiny
;
4580 roundingMode
= status
->float_rounding_mode
;
4581 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4582 switch (roundingMode
) {
4583 case float_round_nearest_even
:
4584 case float_round_ties_away
:
4585 increment
= ((int64_t)zSig2
< 0);
4587 case float_round_to_zero
:
4590 case float_round_up
:
4591 increment
= !zSign
&& zSig2
;
4593 case float_round_down
:
4594 increment
= zSign
&& zSig2
;
4596 case float_round_to_odd
:
4597 increment
= !(zSig1
& 0x1) && zSig2
;
4602 if ( 0x7FFD <= (uint32_t) zExp
) {
4603 if ( ( 0x7FFD < zExp
)
4604 || ( ( zExp
== 0x7FFD )
4606 UINT64_C(0x0001FFFFFFFFFFFF),
4607 UINT64_C(0xFFFFFFFFFFFFFFFF),
4614 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4615 if ( ( roundingMode
== float_round_to_zero
)
4616 || ( zSign
&& ( roundingMode
== float_round_up
) )
4617 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4618 || (roundingMode
== float_round_to_odd
)
4624 UINT64_C(0x0000FFFFFFFFFFFF),
4625 UINT64_C(0xFFFFFFFFFFFFFFFF)
4628 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4631 if (status
->flush_to_zero
) {
4632 float_raise(float_flag_output_denormal
, status
);
4633 return packFloat128(zSign
, 0, 0, 0);
4635 isTiny
= status
->tininess_before_rounding
4638 || lt128(zSig0
, zSig1
,
4639 UINT64_C(0x0001FFFFFFFFFFFF),
4640 UINT64_C(0xFFFFFFFFFFFFFFFF));
4641 shift128ExtraRightJamming(
4642 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4644 if (isTiny
&& zSig2
) {
4645 float_raise(float_flag_underflow
, status
);
4647 switch (roundingMode
) {
4648 case float_round_nearest_even
:
4649 case float_round_ties_away
:
4650 increment
= ((int64_t)zSig2
< 0);
4652 case float_round_to_zero
:
4655 case float_round_up
:
4656 increment
= !zSign
&& zSig2
;
4658 case float_round_down
:
4659 increment
= zSign
&& zSig2
;
4661 case float_round_to_odd
:
4662 increment
= !(zSig1
& 0x1) && zSig2
;
4670 status
->float_exception_flags
|= float_flag_inexact
;
4673 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4674 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4679 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4681 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4685 /*----------------------------------------------------------------------------
4686 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4687 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4688 | returns the proper quadruple-precision floating-point value corresponding
4689 | to the abstract input. This routine is just like `roundAndPackFloat128'
4690 | except that the input significand has fewer bits and does not have to be
4691 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4693 *----------------------------------------------------------------------------*/
4695 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4696 uint64_t zSig0
, uint64_t zSig1
,
4697 float_status
*status
)
4707 shiftCount
= clz64(zSig0
) - 15;
4708 if ( 0 <= shiftCount
) {
4710 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4713 shift128ExtraRightJamming(
4714 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4717 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the 32-bit two's complement integer `a'
4724 | to the extended double-precision floating-point format. The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4727 *----------------------------------------------------------------------------*/
4729 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4736 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4738 absA
= zSign
? - a
: a
;
4739 shiftCount
= clz32(absA
) + 32;
4741 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of converting the 32-bit two's complement integer `a' to
4747 | the quadruple-precision floating-point format. The conversion is performed
4748 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4751 float128
int32_to_float128(int32_t a
, float_status
*status
)
4758 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4760 absA
= zSign
? - a
: a
;
4761 shiftCount
= clz32(absA
) + 17;
4763 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4767 /*----------------------------------------------------------------------------
4768 | Returns the result of converting the 64-bit two's complement integer `a'
4769 | to the extended double-precision floating-point format. The conversion
4770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4772 *----------------------------------------------------------------------------*/
4774 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4780 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4782 absA
= zSign
? - a
: a
;
4783 shiftCount
= clz64(absA
);
4784 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the 64-bit two's complement integer `a' to
4790 | the quadruple-precision floating-point format. The conversion is performed
4791 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4794 float128
int64_to_float128(int64_t a
, float_status
*status
)
4800 uint64_t zSig0
, zSig1
;
4802 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4804 absA
= zSign
? - a
: a
;
4805 shiftCount
= clz64(absA
) + 49;
4806 zExp
= 0x406E - shiftCount
;
4807 if ( 64 <= shiftCount
) {
4816 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4817 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4821 /*----------------------------------------------------------------------------
4822 | Returns the result of converting the 64-bit unsigned integer `a'
4823 | to the quadruple-precision floating-point format. The conversion is performed
4824 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4827 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4830 return float128_zero
;
4832 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4835 /*----------------------------------------------------------------------------
4836 | Returns the result of converting the single-precision floating-point value
4837 | `a' to the extended double-precision floating-point format. The conversion
4838 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4840 *----------------------------------------------------------------------------*/
4842 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4848 a
= float32_squash_input_denormal(a
, status
);
4849 aSig
= extractFloat32Frac( a
);
4850 aExp
= extractFloat32Exp( a
);
4851 aSign
= extractFloat32Sign( a
);
4852 if ( aExp
== 0xFF ) {
4854 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4856 return floatx80_silence_nan(res
, status
);
4858 return packFloatx80(aSign
,
4859 floatx80_infinity_high
,
4860 floatx80_infinity_low
);
4863 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4864 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4867 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4871 /*----------------------------------------------------------------------------
4872 | Returns the result of converting the single-precision floating-point value
4873 | `a' to the double-precision floating-point format. The conversion is
4874 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4876 *----------------------------------------------------------------------------*/
4878 float128
float32_to_float128(float32 a
, float_status
*status
)
4884 a
= float32_squash_input_denormal(a
, status
);
4885 aSig
= extractFloat32Frac( a
);
4886 aExp
= extractFloat32Exp( a
);
4887 aSign
= extractFloat32Sign( a
);
4888 if ( aExp
== 0xFF ) {
4890 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4892 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4895 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4896 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4899 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4903 /*----------------------------------------------------------------------------
4904 | Returns the remainder of the single-precision floating-point value `a'
4905 | with respect to the corresponding value `b'. The operation is performed
4906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4907 *----------------------------------------------------------------------------*/
4909 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4912 int aExp
, bExp
, expDiff
;
4913 uint32_t aSig
, bSig
;
4915 uint64_t aSig64
, bSig64
, q64
;
4916 uint32_t alternateASig
;
4918 a
= float32_squash_input_denormal(a
, status
);
4919 b
= float32_squash_input_denormal(b
, status
);
4921 aSig
= extractFloat32Frac( a
);
4922 aExp
= extractFloat32Exp( a
);
4923 aSign
= extractFloat32Sign( a
);
4924 bSig
= extractFloat32Frac( b
);
4925 bExp
= extractFloat32Exp( b
);
4926 if ( aExp
== 0xFF ) {
4927 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4928 return propagateFloat32NaN(a
, b
, status
);
4930 float_raise(float_flag_invalid
, status
);
4931 return float32_default_nan(status
);
4933 if ( bExp
== 0xFF ) {
4935 return propagateFloat32NaN(a
, b
, status
);
4941 float_raise(float_flag_invalid
, status
);
4942 return float32_default_nan(status
);
4944 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4947 if ( aSig
== 0 ) return a
;
4948 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4950 expDiff
= aExp
- bExp
;
4953 if ( expDiff
< 32 ) {
4956 if ( expDiff
< 0 ) {
4957 if ( expDiff
< -1 ) return a
;
4960 q
= ( bSig
<= aSig
);
4961 if ( q
) aSig
-= bSig
;
4962 if ( 0 < expDiff
) {
4963 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4966 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4974 if ( bSig
<= aSig
) aSig
-= bSig
;
4975 aSig64
= ( (uint64_t) aSig
)<<40;
4976 bSig64
= ( (uint64_t) bSig
)<<40;
4978 while ( 0 < expDiff
) {
4979 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4980 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4981 aSig64
= - ( ( bSig
* q64
)<<38 );
4985 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4986 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4987 q
= q64
>>( 64 - expDiff
);
4989 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4992 alternateASig
= aSig
;
4995 } while ( 0 <= (int32_t) aSig
);
4996 sigMean
= aSig
+ alternateASig
;
4997 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4998 aSig
= alternateASig
;
5000 zSign
= ( (int32_t) aSig
< 0 );
5001 if ( zSign
) aSig
= - aSig
;
5002 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5007 /*----------------------------------------------------------------------------
5008 | Returns the binary exponential of the single-precision floating-point value
5009 | `a'. The operation is performed according to the IEC/IEEE Standard for
5010 | Binary Floating-Point Arithmetic.
5012 | Uses the following identities:
5014 | 1. -------------------------------------------------------------------------
5018 | 2. -------------------------------------------------------------------------
5021 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5023 *----------------------------------------------------------------------------*/
5025 static const float64 float32_exp2_coefficients
[15] =
5027 const_float64( 0x3ff0000000000000ll
), /* 1 */
5028 const_float64( 0x3fe0000000000000ll
), /* 2 */
5029 const_float64( 0x3fc5555555555555ll
), /* 3 */
5030 const_float64( 0x3fa5555555555555ll
), /* 4 */
5031 const_float64( 0x3f81111111111111ll
), /* 5 */
5032 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5033 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5034 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5035 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5036 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5037 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5038 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5039 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5040 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5041 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5044 float32
float32_exp2(float32 a
, float_status
*status
)
5051 a
= float32_squash_input_denormal(a
, status
);
5053 aSig
= extractFloat32Frac( a
);
5054 aExp
= extractFloat32Exp( a
);
5055 aSign
= extractFloat32Sign( a
);
5057 if ( aExp
== 0xFF) {
5059 return propagateFloat32NaN(a
, float32_zero
, status
);
5061 return (aSign
) ? float32_zero
: a
;
5064 if (aSig
== 0) return float32_one
;
5067 float_raise(float_flag_inexact
, status
);
5069 /* ******************************* */
5070 /* using float64 for approximation */
5071 /* ******************************* */
5072 x
= float32_to_float64(a
, status
);
5073 x
= float64_mul(x
, float64_ln2
, status
);
5077 for (i
= 0 ; i
< 15 ; i
++) {
5080 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5081 r
= float64_add(r
, f
, status
);
5083 xn
= float64_mul(xn
, x
, status
);
5086 return float64_to_float32(r
, status
);
5089 /*----------------------------------------------------------------------------
5090 | Returns the binary log of the single-precision floating-point value `a'.
5091 | The operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094 float32
float32_log2(float32 a
, float_status
*status
)
5098 uint32_t aSig
, zSig
, i
;
5100 a
= float32_squash_input_denormal(a
, status
);
5101 aSig
= extractFloat32Frac( a
);
5102 aExp
= extractFloat32Exp( a
);
5103 aSign
= extractFloat32Sign( a
);
5106 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5107 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5110 float_raise(float_flag_invalid
, status
);
5111 return float32_default_nan(status
);
5113 if ( aExp
== 0xFF ) {
5115 return propagateFloat32NaN(a
, float32_zero
, status
);
5125 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5126 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5127 if ( aSig
& 0x01000000 ) {
5136 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5139 /*----------------------------------------------------------------------------
5140 | Returns the result of converting the double-precision floating-point value
5141 | `a' to the extended double-precision floating-point format. The conversion
5142 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5144 *----------------------------------------------------------------------------*/
5146 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5152 a
= float64_squash_input_denormal(a
, status
);
5153 aSig
= extractFloat64Frac( a
);
5154 aExp
= extractFloat64Exp( a
);
5155 aSign
= extractFloat64Sign( a
);
5156 if ( aExp
== 0x7FF ) {
5158 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5160 return floatx80_silence_nan(res
, status
);
5162 return packFloatx80(aSign
,
5163 floatx80_infinity_high
,
5164 floatx80_infinity_low
);
5167 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5168 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5172 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5176 /*----------------------------------------------------------------------------
5177 | Returns the result of converting the double-precision floating-point value
5178 | `a' to the quadruple-precision floating-point format. The conversion is
5179 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5181 *----------------------------------------------------------------------------*/
5183 float128
float64_to_float128(float64 a
, float_status
*status
)
5187 uint64_t aSig
, zSig0
, zSig1
;
5189 a
= float64_squash_input_denormal(a
, status
);
5190 aSig
= extractFloat64Frac( a
);
5191 aExp
= extractFloat64Exp( a
);
5192 aSign
= extractFloat64Sign( a
);
5193 if ( aExp
== 0x7FF ) {
5195 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5197 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5200 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5201 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5204 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5205 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5210 /*----------------------------------------------------------------------------
5211 | Returns the remainder of the double-precision floating-point value `a'
5212 | with respect to the corresponding value `b'. The operation is performed
5213 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5214 *----------------------------------------------------------------------------*/
5216 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5219 int aExp
, bExp
, expDiff
;
5220 uint64_t aSig
, bSig
;
5221 uint64_t q
, alternateASig
;
5224 a
= float64_squash_input_denormal(a
, status
);
5225 b
= float64_squash_input_denormal(b
, status
);
5226 aSig
= extractFloat64Frac( a
);
5227 aExp
= extractFloat64Exp( a
);
5228 aSign
= extractFloat64Sign( a
);
5229 bSig
= extractFloat64Frac( b
);
5230 bExp
= extractFloat64Exp( b
);
5231 if ( aExp
== 0x7FF ) {
5232 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5233 return propagateFloat64NaN(a
, b
, status
);
5235 float_raise(float_flag_invalid
, status
);
5236 return float64_default_nan(status
);
5238 if ( bExp
== 0x7FF ) {
5240 return propagateFloat64NaN(a
, b
, status
);
5246 float_raise(float_flag_invalid
, status
);
5247 return float64_default_nan(status
);
5249 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5252 if ( aSig
== 0 ) return a
;
5253 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5255 expDiff
= aExp
- bExp
;
5256 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5257 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5258 if ( expDiff
< 0 ) {
5259 if ( expDiff
< -1 ) return a
;
5262 q
= ( bSig
<= aSig
);
5263 if ( q
) aSig
-= bSig
;
5265 while ( 0 < expDiff
) {
5266 q
= estimateDiv128To64( aSig
, 0, bSig
);
5267 q
= ( 2 < q
) ? q
- 2 : 0;
5268 aSig
= - ( ( bSig
>>2 ) * q
);
5272 if ( 0 < expDiff
) {
5273 q
= estimateDiv128To64( aSig
, 0, bSig
);
5274 q
= ( 2 < q
) ? q
- 2 : 0;
5277 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5284 alternateASig
= aSig
;
5287 } while ( 0 <= (int64_t) aSig
);
5288 sigMean
= aSig
+ alternateASig
;
5289 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5290 aSig
= alternateASig
;
5292 zSign
= ( (int64_t) aSig
< 0 );
5293 if ( zSign
) aSig
= - aSig
;
5294 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5298 /*----------------------------------------------------------------------------
5299 | Returns the binary log of the double-precision floating-point value `a'.
5300 | The operation is performed according to the IEC/IEEE Standard for Binary
5301 | Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 float64
float64_log2(float64 a
, float_status
*status
)
5307 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5308 a
= float64_squash_input_denormal(a
, status
);
5310 aSig
= extractFloat64Frac( a
);
5311 aExp
= extractFloat64Exp( a
);
5312 aSign
= extractFloat64Sign( a
);
5315 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5316 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5319 float_raise(float_flag_invalid
, status
);
5320 return float64_default_nan(status
);
5322 if ( aExp
== 0x7FF ) {
5324 return propagateFloat64NaN(a
, float64_zero
, status
);
5330 aSig
|= UINT64_C(0x0010000000000000);
5332 zSig
= (uint64_t)aExp
<< 52;
5333 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5334 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5335 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5336 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5344 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5347 /*----------------------------------------------------------------------------
5348 | Returns the result of converting the extended double-precision floating-
5349 | point value `a' to the 32-bit two's complement integer format. The
5350 | conversion is performed according to the IEC/IEEE Standard for Binary
5351 | Floating-Point Arithmetic---which means in particular that the conversion
5352 | is rounded according to the current rounding mode. If `a' is a NaN, the
5353 | largest positive integer is returned. Otherwise, if the conversion
5354 | overflows, the largest integer with the same sign as `a' is returned.
5355 *----------------------------------------------------------------------------*/
5357 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5360 int32_t aExp
, shiftCount
;
5363 if (floatx80_invalid_encoding(a
)) {
5364 float_raise(float_flag_invalid
, status
);
5367 aSig
= extractFloatx80Frac( a
);
5368 aExp
= extractFloatx80Exp( a
);
5369 aSign
= extractFloatx80Sign( a
);
5370 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5371 shiftCount
= 0x4037 - aExp
;
5372 if ( shiftCount
<= 0 ) shiftCount
= 1;
5373 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5374 return roundAndPackInt32(aSign
, aSig
, status
);
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format. The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic, except that the conversion is always rounded
5383 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5384 | Otherwise, if the conversion overflows, the largest integer with the same
5385 | sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5388 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5391 int32_t aExp
, shiftCount
;
5392 uint64_t aSig
, savedASig
;
5395 if (floatx80_invalid_encoding(a
)) {
5396 float_raise(float_flag_invalid
, status
);
5399 aSig
= extractFloatx80Frac( a
);
5400 aExp
= extractFloatx80Exp( a
);
5401 aSign
= extractFloatx80Sign( a
);
5402 if ( 0x401E < aExp
) {
5403 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5406 else if ( aExp
< 0x3FFF ) {
5408 status
->float_exception_flags
|= float_flag_inexact
;
5412 shiftCount
= 0x403E - aExp
;
5414 aSig
>>= shiftCount
;
5416 if ( aSign
) z
= - z
;
5417 if ( ( z
< 0 ) ^ aSign
) {
5419 float_raise(float_flag_invalid
, status
);
5420 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5422 if ( ( aSig
<<shiftCount
) != savedASig
) {
5423 status
->float_exception_flags
|= float_flag_inexact
;
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of converting the extended double-precision floating-
5431 | point value `a' to the 64-bit two's complement integer format. The
5432 | conversion is performed according to the IEC/IEEE Standard for Binary
5433 | Floating-Point Arithmetic---which means in particular that the conversion
5434 | is rounded according to the current rounding mode. If `a' is a NaN,
5435 | the largest positive integer is returned. Otherwise, if the conversion
5436 | overflows, the largest integer with the same sign as `a' is returned.
5437 *----------------------------------------------------------------------------*/
5439 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5442 int32_t aExp
, shiftCount
;
5443 uint64_t aSig
, aSigExtra
;
5445 if (floatx80_invalid_encoding(a
)) {
5446 float_raise(float_flag_invalid
, status
);
5449 aSig
= extractFloatx80Frac( a
);
5450 aExp
= extractFloatx80Exp( a
);
5451 aSign
= extractFloatx80Sign( a
);
5452 shiftCount
= 0x403E - aExp
;
5453 if ( shiftCount
<= 0 ) {
5455 float_raise(float_flag_invalid
, status
);
5456 if (!aSign
|| floatx80_is_any_nan(a
)) {
5464 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5466 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5470 /*----------------------------------------------------------------------------
5471 | Returns the result of converting the extended double-precision floating-
5472 | point value `a' to the 64-bit two's complement integer format. The
5473 | conversion is performed according to the IEC/IEEE Standard for Binary
5474 | Floating-Point Arithmetic, except that the conversion is always rounded
5475 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5476 | Otherwise, if the conversion overflows, the largest integer with the same
5477 | sign as `a' is returned.
5478 *----------------------------------------------------------------------------*/
5480 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5483 int32_t aExp
, shiftCount
;
5487 if (floatx80_invalid_encoding(a
)) {
5488 float_raise(float_flag_invalid
, status
);
5491 aSig
= extractFloatx80Frac( a
);
5492 aExp
= extractFloatx80Exp( a
);
5493 aSign
= extractFloatx80Sign( a
);
5494 shiftCount
= aExp
- 0x403E;
5495 if ( 0 <= shiftCount
) {
5496 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5497 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5498 float_raise(float_flag_invalid
, status
);
5499 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5505 else if ( aExp
< 0x3FFF ) {
5507 status
->float_exception_flags
|= float_flag_inexact
;
5511 z
= aSig
>>( - shiftCount
);
5512 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5513 status
->float_exception_flags
|= float_flag_inexact
;
5515 if ( aSign
) z
= - z
;
5520 /*----------------------------------------------------------------------------
5521 | Returns the result of converting the extended double-precision floating-
5522 | point value `a' to the single-precision floating-point format. The
5523 | conversion is performed according to the IEC/IEEE Standard for Binary
5524 | Floating-Point Arithmetic.
5525 *----------------------------------------------------------------------------*/
5527 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5533 if (floatx80_invalid_encoding(a
)) {
5534 float_raise(float_flag_invalid
, status
);
5535 return float32_default_nan(status
);
5537 aSig
= extractFloatx80Frac( a
);
5538 aExp
= extractFloatx80Exp( a
);
5539 aSign
= extractFloatx80Sign( a
);
5540 if ( aExp
== 0x7FFF ) {
5541 if ( (uint64_t) ( aSig
<<1 ) ) {
5542 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5544 return float32_silence_nan(res
, status
);
5546 return packFloat32( aSign
, 0xFF, 0 );
5548 shift64RightJamming( aSig
, 33, &aSig
);
5549 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5550 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5554 /*----------------------------------------------------------------------------
5555 | Returns the result of converting the extended double-precision floating-
5556 | point value `a' to the double-precision floating-point format. The
5557 | conversion is performed according to the IEC/IEEE Standard for Binary
5558 | Floating-Point Arithmetic.
5559 *----------------------------------------------------------------------------*/
5561 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5565 uint64_t aSig
, zSig
;
5567 if (floatx80_invalid_encoding(a
)) {
5568 float_raise(float_flag_invalid
, status
);
5569 return float64_default_nan(status
);
5571 aSig
= extractFloatx80Frac( a
);
5572 aExp
= extractFloatx80Exp( a
);
5573 aSign
= extractFloatx80Sign( a
);
5574 if ( aExp
== 0x7FFF ) {
5575 if ( (uint64_t) ( aSig
<<1 ) ) {
5576 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5578 return float64_silence_nan(res
, status
);
5580 return packFloat64( aSign
, 0x7FF, 0 );
5582 shift64RightJamming( aSig
, 1, &zSig
);
5583 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5584 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the quadruple-precision floating-point format. The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5595 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5599 uint64_t aSig
, zSig0
, zSig1
;
5601 if (floatx80_invalid_encoding(a
)) {
5602 float_raise(float_flag_invalid
, status
);
5603 return float128_default_nan(status
);
5605 aSig
= extractFloatx80Frac( a
);
5606 aExp
= extractFloatx80Exp( a
);
5607 aSign
= extractFloatx80Sign( a
);
5608 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5609 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5611 return float128_silence_nan(res
, status
);
5613 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5614 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5618 /*----------------------------------------------------------------------------
5619 | Rounds the extended double-precision floating-point value `a'
5620 | to the precision provided by floatx80_rounding_precision and returns the
5621 | result as an extended double-precision floating-point value.
5622 | The operation is performed according to the IEC/IEEE Standard for Binary
5623 | Floating-Point Arithmetic.
5624 *----------------------------------------------------------------------------*/
5626 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5628 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5629 extractFloatx80Sign(a
),
5630 extractFloatx80Exp(a
),
5631 extractFloatx80Frac(a
), 0, status
);
5634 /*----------------------------------------------------------------------------
5635 | Rounds the extended double-precision floating-point value `a' to an integer,
5636 | and returns the result as an extended quadruple-precision floating-point
5637 | value. The operation is performed according to the IEC/IEEE Standard for
5638 | Binary Floating-Point Arithmetic.
5639 *----------------------------------------------------------------------------*/
5641 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5645 uint64_t lastBitMask
, roundBitsMask
;
5648 if (floatx80_invalid_encoding(a
)) {
5649 float_raise(float_flag_invalid
, status
);
5650 return floatx80_default_nan(status
);
5652 aExp
= extractFloatx80Exp( a
);
5653 if ( 0x403E <= aExp
) {
5654 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5655 return propagateFloatx80NaN(a
, a
, status
);
5659 if ( aExp
< 0x3FFF ) {
5661 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5664 status
->float_exception_flags
|= float_flag_inexact
;
5665 aSign
= extractFloatx80Sign( a
);
5666 switch (status
->float_rounding_mode
) {
5667 case float_round_nearest_even
:
5668 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5671 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5674 case float_round_ties_away
:
5675 if (aExp
== 0x3FFE) {
5676 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5679 case float_round_down
:
5682 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5683 : packFloatx80( 0, 0, 0 );
5684 case float_round_up
:
5686 aSign
? packFloatx80( 1, 0, 0 )
5687 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5689 case float_round_to_zero
:
5692 g_assert_not_reached();
5694 return packFloatx80( aSign
, 0, 0 );
5697 lastBitMask
<<= 0x403E - aExp
;
5698 roundBitsMask
= lastBitMask
- 1;
5700 switch (status
->float_rounding_mode
) {
5701 case float_round_nearest_even
:
5702 z
.low
+= lastBitMask
>>1;
5703 if ((z
.low
& roundBitsMask
) == 0) {
5704 z
.low
&= ~lastBitMask
;
5707 case float_round_ties_away
:
5708 z
.low
+= lastBitMask
>> 1;
5710 case float_round_to_zero
:
5712 case float_round_up
:
5713 if (!extractFloatx80Sign(z
)) {
5714 z
.low
+= roundBitsMask
;
5717 case float_round_down
:
5718 if (extractFloatx80Sign(z
)) {
5719 z
.low
+= roundBitsMask
;
5725 z
.low
&= ~ roundBitsMask
;
5728 z
.low
= UINT64_C(0x8000000000000000);
5730 if (z
.low
!= a
.low
) {
5731 status
->float_exception_flags
|= float_flag_inexact
;
5737 /*----------------------------------------------------------------------------
5738 | Returns the result of adding the absolute values of the extended double-
5739 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5740 | negated before being returned. `zSign' is ignored if the result is a NaN.
5741 | The addition is performed according to the IEC/IEEE Standard for Binary
5742 | Floating-Point Arithmetic.
5743 *----------------------------------------------------------------------------*/
5745 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5746 float_status
*status
)
5748 int32_t aExp
, bExp
, zExp
;
5749 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5752 aSig
= extractFloatx80Frac( a
);
5753 aExp
= extractFloatx80Exp( a
);
5754 bSig
= extractFloatx80Frac( b
);
5755 bExp
= extractFloatx80Exp( b
);
5756 expDiff
= aExp
- bExp
;
5757 if ( 0 < expDiff
) {
5758 if ( aExp
== 0x7FFF ) {
5759 if ((uint64_t)(aSig
<< 1)) {
5760 return propagateFloatx80NaN(a
, b
, status
);
5764 if ( bExp
== 0 ) --expDiff
;
5765 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5768 else if ( expDiff
< 0 ) {
5769 if ( bExp
== 0x7FFF ) {
5770 if ((uint64_t)(bSig
<< 1)) {
5771 return propagateFloatx80NaN(a
, b
, status
);
5773 return packFloatx80(zSign
,
5774 floatx80_infinity_high
,
5775 floatx80_infinity_low
);
5777 if ( aExp
== 0 ) ++expDiff
;
5778 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5782 if ( aExp
== 0x7FFF ) {
5783 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5784 return propagateFloatx80NaN(a
, b
, status
);
5789 zSig0
= aSig
+ bSig
;
5791 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5792 /* At least one of the values is a pseudo-denormal,
5793 * and there is a carry out of the result. */
5798 return packFloatx80(zSign
, 0, 0);
5800 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5806 zSig0
= aSig
+ bSig
;
5807 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5809 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5810 zSig0
|= UINT64_C(0x8000000000000000);
5813 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5814 zSign
, zExp
, zSig0
, zSig1
, status
);
5817 /*----------------------------------------------------------------------------
5818 | Returns the result of subtracting the absolute values of the extended
5819 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5820 | difference is negated before being returned. `zSign' is ignored if the
5821 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5822 | Standard for Binary Floating-Point Arithmetic.
5823 *----------------------------------------------------------------------------*/
5825 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5826 float_status
*status
)
5828 int32_t aExp
, bExp
, zExp
;
5829 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5832 aSig
= extractFloatx80Frac( a
);
5833 aExp
= extractFloatx80Exp( a
);
5834 bSig
= extractFloatx80Frac( b
);
5835 bExp
= extractFloatx80Exp( b
);
5836 expDiff
= aExp
- bExp
;
5837 if ( 0 < expDiff
) goto aExpBigger
;
5838 if ( expDiff
< 0 ) goto bExpBigger
;
5839 if ( aExp
== 0x7FFF ) {
5840 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5841 return propagateFloatx80NaN(a
, b
, status
);
5843 float_raise(float_flag_invalid
, status
);
5844 return floatx80_default_nan(status
);
5851 if ( bSig
< aSig
) goto aBigger
;
5852 if ( aSig
< bSig
) goto bBigger
;
5853 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5855 if ( bExp
== 0x7FFF ) {
5856 if ((uint64_t)(bSig
<< 1)) {
5857 return propagateFloatx80NaN(a
, b
, status
);
5859 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5860 floatx80_infinity_low
);
5862 if ( aExp
== 0 ) ++expDiff
;
5863 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5865 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5868 goto normalizeRoundAndPack
;
5870 if ( aExp
== 0x7FFF ) {
5871 if ((uint64_t)(aSig
<< 1)) {
5872 return propagateFloatx80NaN(a
, b
, status
);
5876 if ( bExp
== 0 ) --expDiff
;
5877 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5879 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5881 normalizeRoundAndPack
:
5882 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5883 zSign
, zExp
, zSig0
, zSig1
, status
);
5886 /*----------------------------------------------------------------------------
5887 | Returns the result of adding the extended double-precision floating-point
5888 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5889 | Standard for Binary Floating-Point Arithmetic.
5890 *----------------------------------------------------------------------------*/
5892 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5896 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5897 float_raise(float_flag_invalid
, status
);
5898 return floatx80_default_nan(status
);
5900 aSign
= extractFloatx80Sign( a
);
5901 bSign
= extractFloatx80Sign( b
);
5902 if ( aSign
== bSign
) {
5903 return addFloatx80Sigs(a
, b
, aSign
, status
);
5906 return subFloatx80Sigs(a
, b
, aSign
, status
);
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of subtracting the extended double-precision floating-
5913 | point values `a' and `b'. The operation is performed according to the
5914 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5917 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5921 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5922 float_raise(float_flag_invalid
, status
);
5923 return floatx80_default_nan(status
);
5925 aSign
= extractFloatx80Sign( a
);
5926 bSign
= extractFloatx80Sign( b
);
5927 if ( aSign
== bSign
) {
5928 return subFloatx80Sigs(a
, b
, aSign
, status
);
5931 return addFloatx80Sigs(a
, b
, aSign
, status
);
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of multiplying the extended double-precision floating-
5938 | point values `a' and `b'. The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5942 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5944 bool aSign
, bSign
, zSign
;
5945 int32_t aExp
, bExp
, zExp
;
5946 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5948 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5949 float_raise(float_flag_invalid
, status
);
5950 return floatx80_default_nan(status
);
5952 aSig
= extractFloatx80Frac( a
);
5953 aExp
= extractFloatx80Exp( a
);
5954 aSign
= extractFloatx80Sign( a
);
5955 bSig
= extractFloatx80Frac( b
);
5956 bExp
= extractFloatx80Exp( b
);
5957 bSign
= extractFloatx80Sign( b
);
5958 zSign
= aSign
^ bSign
;
5959 if ( aExp
== 0x7FFF ) {
5960 if ( (uint64_t) ( aSig
<<1 )
5961 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5962 return propagateFloatx80NaN(a
, b
, status
);
5964 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5965 return packFloatx80(zSign
, floatx80_infinity_high
,
5966 floatx80_infinity_low
);
5968 if ( bExp
== 0x7FFF ) {
5969 if ((uint64_t)(bSig
<< 1)) {
5970 return propagateFloatx80NaN(a
, b
, status
);
5972 if ( ( aExp
| aSig
) == 0 ) {
5974 float_raise(float_flag_invalid
, status
);
5975 return floatx80_default_nan(status
);
5977 return packFloatx80(zSign
, floatx80_infinity_high
,
5978 floatx80_infinity_low
);
5981 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5982 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5985 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5986 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5988 zExp
= aExp
+ bExp
- 0x3FFE;
5989 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5990 if ( 0 < (int64_t) zSig0
) {
5991 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5994 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5995 zSign
, zExp
, zSig0
, zSig1
, status
);
5998 /*----------------------------------------------------------------------------
5999 | Returns the result of dividing the extended double-precision floating-point
6000 | value `a' by the corresponding value `b'. The operation is performed
6001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6004 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6006 bool aSign
, bSign
, zSign
;
6007 int32_t aExp
, bExp
, zExp
;
6008 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6009 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6011 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6012 float_raise(float_flag_invalid
, status
);
6013 return floatx80_default_nan(status
);
6015 aSig
= extractFloatx80Frac( a
);
6016 aExp
= extractFloatx80Exp( a
);
6017 aSign
= extractFloatx80Sign( a
);
6018 bSig
= extractFloatx80Frac( b
);
6019 bExp
= extractFloatx80Exp( b
);
6020 bSign
= extractFloatx80Sign( b
);
6021 zSign
= aSign
^ bSign
;
6022 if ( aExp
== 0x7FFF ) {
6023 if ((uint64_t)(aSig
<< 1)) {
6024 return propagateFloatx80NaN(a
, b
, status
);
6026 if ( bExp
== 0x7FFF ) {
6027 if ((uint64_t)(bSig
<< 1)) {
6028 return propagateFloatx80NaN(a
, b
, status
);
6032 return packFloatx80(zSign
, floatx80_infinity_high
,
6033 floatx80_infinity_low
);
6035 if ( bExp
== 0x7FFF ) {
6036 if ((uint64_t)(bSig
<< 1)) {
6037 return propagateFloatx80NaN(a
, b
, status
);
6039 return packFloatx80( zSign
, 0, 0 );
6043 if ( ( aExp
| aSig
) == 0 ) {
6045 float_raise(float_flag_invalid
, status
);
6046 return floatx80_default_nan(status
);
6048 float_raise(float_flag_divbyzero
, status
);
6049 return packFloatx80(zSign
, floatx80_infinity_high
,
6050 floatx80_infinity_low
);
6052 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6055 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6056 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6058 zExp
= aExp
- bExp
+ 0x3FFE;
6060 if ( bSig
<= aSig
) {
6061 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6064 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6065 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6066 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6067 while ( (int64_t) rem0
< 0 ) {
6069 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6071 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6072 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6073 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6074 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6075 while ( (int64_t) rem1
< 0 ) {
6077 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6079 zSig1
|= ( ( rem1
| rem2
) != 0 );
6081 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6082 zSign
, zExp
, zSig0
, zSig1
, status
);
6085 /*----------------------------------------------------------------------------
6086 | Returns the remainder of the extended double-precision floating-point value
6087 | `a' with respect to the corresponding value `b'. The operation is performed
6088 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6089 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6090 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6091 | the absolute value of the integer quotient.
6092 *----------------------------------------------------------------------------*/
6094 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6095 float_status
*status
)
6098 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6099 uint64_t aSig0
, aSig1
, bSig
;
6100 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6103 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6104 float_raise(float_flag_invalid
, status
);
6105 return floatx80_default_nan(status
);
6107 aSig0
= extractFloatx80Frac( a
);
6108 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6109 aSign
= extractFloatx80Sign( a
);
6110 bSig
= extractFloatx80Frac( b
);
6111 bExp
= extractFloatx80Exp( b
);
6112 if ( aExp
== 0x7FFF ) {
6113 if ( (uint64_t) ( aSig0
<<1 )
6114 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6115 return propagateFloatx80NaN(a
, b
, status
);
6119 if ( bExp
== 0x7FFF ) {
6120 if ((uint64_t)(bSig
<< 1)) {
6121 return propagateFloatx80NaN(a
, b
, status
);
6123 if (aExp
== 0 && aSig0
>> 63) {
6125 * Pseudo-denormal argument must be returned in normalized
6128 return packFloatx80(aSign
, 1, aSig0
);
6135 float_raise(float_flag_invalid
, status
);
6136 return floatx80_default_nan(status
);
6138 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6141 if ( aSig0
== 0 ) return a
;
6142 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6145 expDiff
= aExp
- bExp
;
6147 if ( expDiff
< 0 ) {
6148 if ( mod
|| expDiff
< -1 ) {
6149 if (aExp
== 1 && aExpOrig
== 0) {
6151 * Pseudo-denormal argument must be returned in
6154 return packFloatx80(aSign
, aExp
, aSig0
);
6158 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6161 *quotient
= q
= ( bSig
<= aSig0
);
6162 if ( q
) aSig0
-= bSig
;
6164 while ( 0 < expDiff
) {
6165 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6166 q
= ( 2 < q
) ? q
- 2 : 0;
6167 mul64To128( bSig
, q
, &term0
, &term1
);
6168 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6169 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6175 if ( 0 < expDiff
) {
6176 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6177 q
= ( 2 < q
) ? q
- 2 : 0;
6179 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6180 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6181 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6182 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6184 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6187 *quotient
<<= expDiff
;
6198 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6199 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6200 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6203 aSig0
= alternateASig0
;
6204 aSig1
= alternateASig1
;
6210 normalizeRoundAndPackFloatx80(
6211 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6215 /*----------------------------------------------------------------------------
6216 | Returns the remainder of the extended double-precision floating-point value
6217 | `a' with respect to the corresponding value `b'. The operation is performed
6218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6221 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6224 return floatx80_modrem(a
, b
, false, "ient
, status
);
6227 /*----------------------------------------------------------------------------
6228 | Returns the remainder of the extended double-precision floating-point value
6229 | `a' with respect to the corresponding value `b', with the quotient truncated
6231 *----------------------------------------------------------------------------*/
6233 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6236 return floatx80_modrem(a
, b
, true, "ient
, status
);
6239 /*----------------------------------------------------------------------------
6240 | Returns the square root of the extended double-precision floating-point
6241 | value `a'. The operation is performed according to the IEC/IEEE Standard
6242 | for Binary Floating-Point Arithmetic.
6243 *----------------------------------------------------------------------------*/
6245 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6249 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6250 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6252 if (floatx80_invalid_encoding(a
)) {
6253 float_raise(float_flag_invalid
, status
);
6254 return floatx80_default_nan(status
);
6256 aSig0
= extractFloatx80Frac( a
);
6257 aExp
= extractFloatx80Exp( a
);
6258 aSign
= extractFloatx80Sign( a
);
6259 if ( aExp
== 0x7FFF ) {
6260 if ((uint64_t)(aSig0
<< 1)) {
6261 return propagateFloatx80NaN(a
, a
, status
);
6263 if ( ! aSign
) return a
;
6267 if ( ( aExp
| aSig0
) == 0 ) return a
;
6269 float_raise(float_flag_invalid
, status
);
6270 return floatx80_default_nan(status
);
6273 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6274 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6276 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6277 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6278 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6279 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6280 doubleZSig0
= zSig0
<<1;
6281 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6282 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6283 while ( (int64_t) rem0
< 0 ) {
6286 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6288 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6289 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6290 if ( zSig1
== 0 ) zSig1
= 1;
6291 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6292 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6293 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6294 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6295 while ( (int64_t) rem1
< 0 ) {
6297 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6299 term2
|= doubleZSig0
;
6300 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6302 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6304 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6305 zSig0
|= doubleZSig0
;
6306 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6307 0, zExp
, zSig0
, zSig1
, status
);
6310 /*----------------------------------------------------------------------------
6311 | Returns the result of converting the quadruple-precision floating-point
6312 | value `a' to the 32-bit two's complement integer format. The conversion
6313 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6314 | Arithmetic---which means in particular that the conversion is rounded
6315 | according to the current rounding mode. If `a' is a NaN, the largest
6316 | positive integer is returned. Otherwise, if the conversion overflows, the
6317 | largest integer with the same sign as `a' is returned.
6318 *----------------------------------------------------------------------------*/
6320 int32_t float128_to_int32(float128 a
, float_status
*status
)
6323 int32_t aExp
, shiftCount
;
6324 uint64_t aSig0
, aSig1
;
6326 aSig1
= extractFloat128Frac1( a
);
6327 aSig0
= extractFloat128Frac0( a
);
6328 aExp
= extractFloat128Exp( a
);
6329 aSign
= extractFloat128Sign( a
);
6330 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6331 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6332 aSig0
|= ( aSig1
!= 0 );
6333 shiftCount
= 0x4028 - aExp
;
6334 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6335 return roundAndPackInt32(aSign
, aSig0
, status
);
6339 /*----------------------------------------------------------------------------
6340 | Returns the result of converting the quadruple-precision floating-point
6341 | value `a' to the 32-bit two's complement integer format. The conversion
6342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6343 | Arithmetic, except that the conversion is always rounded toward zero. If
6344 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6345 | conversion overflows, the largest integer with the same sign as `a' is
6347 *----------------------------------------------------------------------------*/
6349 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6352 int32_t aExp
, shiftCount
;
6353 uint64_t aSig0
, aSig1
, savedASig
;
6356 aSig1
= extractFloat128Frac1( a
);
6357 aSig0
= extractFloat128Frac0( a
);
6358 aExp
= extractFloat128Exp( a
);
6359 aSign
= extractFloat128Sign( a
);
6360 aSig0
|= ( aSig1
!= 0 );
6361 if ( 0x401E < aExp
) {
6362 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6365 else if ( aExp
< 0x3FFF ) {
6366 if (aExp
|| aSig0
) {
6367 status
->float_exception_flags
|= float_flag_inexact
;
6371 aSig0
|= UINT64_C(0x0001000000000000);
6372 shiftCount
= 0x402F - aExp
;
6374 aSig0
>>= shiftCount
;
6376 if ( aSign
) z
= - z
;
6377 if ( ( z
< 0 ) ^ aSign
) {
6379 float_raise(float_flag_invalid
, status
);
6380 return aSign
? INT32_MIN
: INT32_MAX
;
6382 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6383 status
->float_exception_flags
|= float_flag_inexact
;
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of converting the quadruple-precision floating-point
6391 | value `a' to the 64-bit two's complement integer format. The conversion
6392 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6393 | Arithmetic---which means in particular that the conversion is rounded
6394 | according to the current rounding mode. If `a' is a NaN, the largest
6395 | positive integer is returned. Otherwise, if the conversion overflows, the
6396 | largest integer with the same sign as `a' is returned.
6397 *----------------------------------------------------------------------------*/
6399 int64_t float128_to_int64(float128 a
, float_status
*status
)
6402 int32_t aExp
, shiftCount
;
6403 uint64_t aSig0
, aSig1
;
6405 aSig1
= extractFloat128Frac1( a
);
6406 aSig0
= extractFloat128Frac0( a
);
6407 aExp
= extractFloat128Exp( a
);
6408 aSign
= extractFloat128Sign( a
);
6409 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6410 shiftCount
= 0x402F - aExp
;
6411 if ( shiftCount
<= 0 ) {
6412 if ( 0x403E < aExp
) {
6413 float_raise(float_flag_invalid
, status
);
6415 || ( ( aExp
== 0x7FFF )
6416 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6423 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6426 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6428 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6432 /*----------------------------------------------------------------------------
6433 | Returns the result of converting the quadruple-precision floating-point
6434 | value `a' to the 64-bit two's complement integer format. The conversion
6435 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6436 | Arithmetic, except that the conversion is always rounded toward zero.
6437 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6438 | the conversion overflows, the largest integer with the same sign as `a' is
6440 *----------------------------------------------------------------------------*/
6442 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6445 int32_t aExp
, shiftCount
;
6446 uint64_t aSig0
, aSig1
;
6449 aSig1
= extractFloat128Frac1( a
);
6450 aSig0
= extractFloat128Frac0( a
);
6451 aExp
= extractFloat128Exp( a
);
6452 aSign
= extractFloat128Sign( a
);
6453 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6454 shiftCount
= aExp
- 0x402F;
6455 if ( 0 < shiftCount
) {
6456 if ( 0x403E <= aExp
) {
6457 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6458 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6459 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6461 status
->float_exception_flags
|= float_flag_inexact
;
6465 float_raise(float_flag_invalid
, status
);
6466 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6472 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6473 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6474 status
->float_exception_flags
|= float_flag_inexact
;
6478 if ( aExp
< 0x3FFF ) {
6479 if ( aExp
| aSig0
| aSig1
) {
6480 status
->float_exception_flags
|= float_flag_inexact
;
6484 z
= aSig0
>>( - shiftCount
);
6486 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6487 status
->float_exception_flags
|= float_flag_inexact
;
6490 if ( aSign
) z
= - z
;
6495 /*----------------------------------------------------------------------------
6496 | Returns the result of converting the quadruple-precision floating-point value
6497 | `a' to the 64-bit unsigned integer format. The conversion is
6498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6499 | Arithmetic---which means in particular that the conversion is rounded
6500 | according to the current rounding mode. If `a' is a NaN, the largest
6501 | positive integer is returned. If the conversion overflows, the
6502 | largest unsigned integer is returned. If 'a' is negative, the value is
6503 | rounded and zero is returned; negative values that do not round to zero
6504 | will raise the inexact exception.
6505 *----------------------------------------------------------------------------*/
6507 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6512 uint64_t aSig0
, aSig1
;
6514 aSig0
= extractFloat128Frac0(a
);
6515 aSig1
= extractFloat128Frac1(a
);
6516 aExp
= extractFloat128Exp(a
);
6517 aSign
= extractFloat128Sign(a
);
6518 if (aSign
&& (aExp
> 0x3FFE)) {
6519 float_raise(float_flag_invalid
, status
);
6520 if (float128_is_any_nan(a
)) {
6527 aSig0
|= UINT64_C(0x0001000000000000);
6529 shiftCount
= 0x402F - aExp
;
6530 if (shiftCount
<= 0) {
6531 if (0x403E < aExp
) {
6532 float_raise(float_flag_invalid
, status
);
6535 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6537 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6539 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6542 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6545 signed char current_rounding_mode
= status
->float_rounding_mode
;
6547 set_float_rounding_mode(float_round_to_zero
, status
);
6548 v
= float128_to_uint64(a
, status
);
6549 set_float_rounding_mode(current_rounding_mode
, status
);
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit unsigned integer format. The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic except that the conversion is always rounded toward zero.
6559 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6560 | if the conversion overflows, the largest unsigned integer is returned.
6561 | If 'a' is negative, the value is rounded and zero is returned; negative
6562 | values that do not round to zero will raise the inexact exception.
6563 *----------------------------------------------------------------------------*/
6565 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6569 int old_exc_flags
= get_float_exception_flags(status
);
6571 v
= float128_to_uint64_round_to_zero(a
, status
);
6572 if (v
> 0xffffffff) {
6577 set_float_exception_flags(old_exc_flags
, status
);
6578 float_raise(float_flag_invalid
, status
);
6582 /*----------------------------------------------------------------------------
6583 | Returns the result of converting the quadruple-precision floating-point value
6584 | `a' to the 32-bit unsigned integer format. The conversion is
6585 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6586 | Arithmetic---which means in particular that the conversion is rounded
6587 | according to the current rounding mode. If `a' is a NaN, the largest
6588 | positive integer is returned. If the conversion overflows, the
6589 | largest unsigned integer is returned. If 'a' is negative, the value is
6590 | rounded and zero is returned; negative values that do not round to zero
6591 | will raise the inexact exception.
6592 *----------------------------------------------------------------------------*/
6594 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6598 int old_exc_flags
= get_float_exception_flags(status
);
6600 v
= float128_to_uint64(a
, status
);
6601 if (v
> 0xffffffff) {
6606 set_float_exception_flags(old_exc_flags
, status
);
6607 float_raise(float_flag_invalid
, status
);
6611 /*----------------------------------------------------------------------------
6612 | Returns the result of converting the quadruple-precision floating-point
6613 | value `a' to the single-precision floating-point format. The conversion
6614 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6616 *----------------------------------------------------------------------------*/
6618 float32
float128_to_float32(float128 a
, float_status
*status
)
6622 uint64_t aSig0
, aSig1
;
6625 aSig1
= extractFloat128Frac1( a
);
6626 aSig0
= extractFloat128Frac0( a
);
6627 aExp
= extractFloat128Exp( a
);
6628 aSign
= extractFloat128Sign( a
);
6629 if ( aExp
== 0x7FFF ) {
6630 if ( aSig0
| aSig1
) {
6631 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6633 return packFloat32( aSign
, 0xFF, 0 );
6635 aSig0
|= ( aSig1
!= 0 );
6636 shift64RightJamming( aSig0
, 18, &aSig0
);
6638 if ( aExp
|| zSig
) {
6642 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6646 /*----------------------------------------------------------------------------
6647 | Returns the result of converting the quadruple-precision floating-point
6648 | value `a' to the double-precision floating-point format. The conversion
6649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6651 *----------------------------------------------------------------------------*/
6653 float64
float128_to_float64(float128 a
, float_status
*status
)
6657 uint64_t aSig0
, aSig1
;
6659 aSig1
= extractFloat128Frac1( a
);
6660 aSig0
= extractFloat128Frac0( a
);
6661 aExp
= extractFloat128Exp( a
);
6662 aSign
= extractFloat128Sign( a
);
6663 if ( aExp
== 0x7FFF ) {
6664 if ( aSig0
| aSig1
) {
6665 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6667 return packFloat64( aSign
, 0x7FF, 0 );
6669 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6670 aSig0
|= ( aSig1
!= 0 );
6671 if ( aExp
|| aSig0
) {
6672 aSig0
|= UINT64_C(0x4000000000000000);
6675 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6679 /*----------------------------------------------------------------------------
6680 | Returns the result of converting the quadruple-precision floating-point
6681 | value `a' to the extended double-precision floating-point format. The
6682 | conversion is performed according to the IEC/IEEE Standard for Binary
6683 | Floating-Point Arithmetic.
6684 *----------------------------------------------------------------------------*/
6686 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6690 uint64_t aSig0
, aSig1
;
6692 aSig1
= extractFloat128Frac1( a
);
6693 aSig0
= extractFloat128Frac0( a
);
6694 aExp
= extractFloat128Exp( a
);
6695 aSign
= extractFloat128Sign( a
);
6696 if ( aExp
== 0x7FFF ) {
6697 if ( aSig0
| aSig1
) {
6698 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6700 return floatx80_silence_nan(res
, status
);
6702 return packFloatx80(aSign
, floatx80_infinity_high
,
6703 floatx80_infinity_low
);
6706 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6707 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6710 aSig0
|= UINT64_C(0x0001000000000000);
6712 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6713 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6717 /*----------------------------------------------------------------------------
6718 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6719 | returns the result as a quadruple-precision floating-point value. The
6720 | operation is performed according to the IEC/IEEE Standard for Binary
6721 | Floating-Point Arithmetic.
6722 *----------------------------------------------------------------------------*/
6724 float128
float128_round_to_int(float128 a
, float_status
*status
)
6728 uint64_t lastBitMask
, roundBitsMask
;
6731 aExp
= extractFloat128Exp( a
);
6732 if ( 0x402F <= aExp
) {
6733 if ( 0x406F <= aExp
) {
6734 if ( ( aExp
== 0x7FFF )
6735 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6737 return propagateFloat128NaN(a
, a
, status
);
6742 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6743 roundBitsMask
= lastBitMask
- 1;
6745 switch (status
->float_rounding_mode
) {
6746 case float_round_nearest_even
:
6747 if ( lastBitMask
) {
6748 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6749 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6752 if ( (int64_t) z
.low
< 0 ) {
6754 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6758 case float_round_ties_away
:
6760 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6762 if ((int64_t) z
.low
< 0) {
6767 case float_round_to_zero
:
6769 case float_round_up
:
6770 if (!extractFloat128Sign(z
)) {
6771 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6774 case float_round_down
:
6775 if (extractFloat128Sign(z
)) {
6776 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6779 case float_round_to_odd
:
6781 * Note that if lastBitMask == 0, the last bit is the lsb
6782 * of high, and roundBitsMask == -1.
6784 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6785 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6791 z
.low
&= ~ roundBitsMask
;
6794 if ( aExp
< 0x3FFF ) {
6795 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6796 status
->float_exception_flags
|= float_flag_inexact
;
6797 aSign
= extractFloat128Sign( a
);
6798 switch (status
->float_rounding_mode
) {
6799 case float_round_nearest_even
:
6800 if ( ( aExp
== 0x3FFE )
6801 && ( extractFloat128Frac0( a
)
6802 | extractFloat128Frac1( a
) )
6804 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6807 case float_round_ties_away
:
6808 if (aExp
== 0x3FFE) {
6809 return packFloat128(aSign
, 0x3FFF, 0, 0);
6812 case float_round_down
:
6814 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6815 : packFloat128( 0, 0, 0, 0 );
6816 case float_round_up
:
6818 aSign
? packFloat128( 1, 0, 0, 0 )
6819 : packFloat128( 0, 0x3FFF, 0, 0 );
6821 case float_round_to_odd
:
6822 return packFloat128(aSign
, 0x3FFF, 0, 0);
6824 case float_round_to_zero
:
6827 return packFloat128( aSign
, 0, 0, 0 );
6830 lastBitMask
<<= 0x402F - aExp
;
6831 roundBitsMask
= lastBitMask
- 1;
6834 switch (status
->float_rounding_mode
) {
6835 case float_round_nearest_even
:
6836 z
.high
+= lastBitMask
>>1;
6837 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6838 z
.high
&= ~ lastBitMask
;
6841 case float_round_ties_away
:
6842 z
.high
+= lastBitMask
>>1;
6844 case float_round_to_zero
:
6846 case float_round_up
:
6847 if (!extractFloat128Sign(z
)) {
6848 z
.high
|= ( a
.low
!= 0 );
6849 z
.high
+= roundBitsMask
;
6852 case float_round_down
:
6853 if (extractFloat128Sign(z
)) {
6854 z
.high
|= (a
.low
!= 0);
6855 z
.high
+= roundBitsMask
;
6858 case float_round_to_odd
:
6859 if ((z
.high
& lastBitMask
) == 0) {
6860 z
.high
|= (a
.low
!= 0);
6861 z
.high
+= roundBitsMask
;
6867 z
.high
&= ~ roundBitsMask
;
6869 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6870 status
->float_exception_flags
|= float_flag_inexact
;
6876 /*----------------------------------------------------------------------------
6877 | Returns the result of adding the absolute values of the quadruple-precision
6878 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6879 | before being returned. `zSign' is ignored if the result is a NaN.
6880 | The addition is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6884 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6885 float_status
*status
)
6887 int32_t aExp
, bExp
, zExp
;
6888 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6891 aSig1
= extractFloat128Frac1( a
);
6892 aSig0
= extractFloat128Frac0( a
);
6893 aExp
= extractFloat128Exp( a
);
6894 bSig1
= extractFloat128Frac1( b
);
6895 bSig0
= extractFloat128Frac0( b
);
6896 bExp
= extractFloat128Exp( b
);
6897 expDiff
= aExp
- bExp
;
6898 if ( 0 < expDiff
) {
6899 if ( aExp
== 0x7FFF ) {
6900 if (aSig0
| aSig1
) {
6901 return propagateFloat128NaN(a
, b
, status
);
6909 bSig0
|= UINT64_C(0x0001000000000000);
6911 shift128ExtraRightJamming(
6912 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6915 else if ( expDiff
< 0 ) {
6916 if ( bExp
== 0x7FFF ) {
6917 if (bSig0
| bSig1
) {
6918 return propagateFloat128NaN(a
, b
, status
);
6920 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6926 aSig0
|= UINT64_C(0x0001000000000000);
6928 shift128ExtraRightJamming(
6929 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6933 if ( aExp
== 0x7FFF ) {
6934 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6935 return propagateFloat128NaN(a
, b
, status
);
6939 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6941 if (status
->flush_to_zero
) {
6942 if (zSig0
| zSig1
) {
6943 float_raise(float_flag_output_denormal
, status
);
6945 return packFloat128(zSign
, 0, 0, 0);
6947 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6950 zSig0
|= UINT64_C(0x0002000000000000);
6954 aSig0
|= UINT64_C(0x0001000000000000);
6955 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6957 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6960 shift128ExtraRightJamming(
6961 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6963 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6967 /*----------------------------------------------------------------------------
6968 | Returns the result of subtracting the absolute values of the quadruple-
6969 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6970 | difference is negated before being returned. `zSign' is ignored if the
6971 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6972 | Standard for Binary Floating-Point Arithmetic.
6973 *----------------------------------------------------------------------------*/
6975 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6976 float_status
*status
)
6978 int32_t aExp
, bExp
, zExp
;
6979 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6982 aSig1
= extractFloat128Frac1( a
);
6983 aSig0
= extractFloat128Frac0( a
);
6984 aExp
= extractFloat128Exp( a
);
6985 bSig1
= extractFloat128Frac1( b
);
6986 bSig0
= extractFloat128Frac0( b
);
6987 bExp
= extractFloat128Exp( b
);
6988 expDiff
= aExp
- bExp
;
6989 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6990 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6991 if ( 0 < expDiff
) goto aExpBigger
;
6992 if ( expDiff
< 0 ) goto bExpBigger
;
6993 if ( aExp
== 0x7FFF ) {
6994 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6995 return propagateFloat128NaN(a
, b
, status
);
6997 float_raise(float_flag_invalid
, status
);
6998 return float128_default_nan(status
);
7004 if ( bSig0
< aSig0
) goto aBigger
;
7005 if ( aSig0
< bSig0
) goto bBigger
;
7006 if ( bSig1
< aSig1
) goto aBigger
;
7007 if ( aSig1
< bSig1
) goto bBigger
;
7008 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7011 if ( bExp
== 0x7FFF ) {
7012 if (bSig0
| bSig1
) {
7013 return propagateFloat128NaN(a
, b
, status
);
7015 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7021 aSig0
|= UINT64_C(0x4000000000000000);
7023 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7024 bSig0
|= UINT64_C(0x4000000000000000);
7026 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7029 goto normalizeRoundAndPack
;
7031 if ( aExp
== 0x7FFF ) {
7032 if (aSig0
| aSig1
) {
7033 return propagateFloat128NaN(a
, b
, status
);
7041 bSig0
|= UINT64_C(0x4000000000000000);
7043 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7044 aSig0
|= UINT64_C(0x4000000000000000);
7046 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7048 normalizeRoundAndPack
:
7050 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7055 /*----------------------------------------------------------------------------
7056 | Returns the result of adding the quadruple-precision floating-point values
7057 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7058 | for Binary Floating-Point Arithmetic.
7059 *----------------------------------------------------------------------------*/
7061 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7065 aSign
= extractFloat128Sign( a
);
7066 bSign
= extractFloat128Sign( b
);
7067 if ( aSign
== bSign
) {
7068 return addFloat128Sigs(a
, b
, aSign
, status
);
7071 return subFloat128Sigs(a
, b
, aSign
, status
);
7076 /*----------------------------------------------------------------------------
7077 | Returns the result of subtracting the quadruple-precision floating-point
7078 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7079 | Standard for Binary Floating-Point Arithmetic.
7080 *----------------------------------------------------------------------------*/
7082 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7086 aSign
= extractFloat128Sign( a
);
7087 bSign
= extractFloat128Sign( b
);
7088 if ( aSign
== bSign
) {
7089 return subFloat128Sigs(a
, b
, aSign
, status
);
7092 return addFloat128Sigs(a
, b
, aSign
, status
);
7097 /*----------------------------------------------------------------------------
7098 | Returns the result of multiplying the quadruple-precision floating-point
7099 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7100 | Standard for Binary Floating-Point Arithmetic.
7101 *----------------------------------------------------------------------------*/
7103 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7105 bool aSign
, bSign
, zSign
;
7106 int32_t aExp
, bExp
, zExp
;
7107 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7109 aSig1
= extractFloat128Frac1( a
);
7110 aSig0
= extractFloat128Frac0( a
);
7111 aExp
= extractFloat128Exp( a
);
7112 aSign
= extractFloat128Sign( a
);
7113 bSig1
= extractFloat128Frac1( b
);
7114 bSig0
= extractFloat128Frac0( b
);
7115 bExp
= extractFloat128Exp( b
);
7116 bSign
= extractFloat128Sign( b
);
7117 zSign
= aSign
^ bSign
;
7118 if ( aExp
== 0x7FFF ) {
7119 if ( ( aSig0
| aSig1
)
7120 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7121 return propagateFloat128NaN(a
, b
, status
);
7123 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7124 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7126 if ( bExp
== 0x7FFF ) {
7127 if (bSig0
| bSig1
) {
7128 return propagateFloat128NaN(a
, b
, status
);
7130 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7132 float_raise(float_flag_invalid
, status
);
7133 return float128_default_nan(status
);
7135 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7138 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7139 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7142 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7143 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7145 zExp
= aExp
+ bExp
- 0x4000;
7146 aSig0
|= UINT64_C(0x0001000000000000);
7147 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7148 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7149 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7150 zSig2
|= ( zSig3
!= 0 );
7151 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7152 shift128ExtraRightJamming(
7153 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7156 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7160 /*----------------------------------------------------------------------------
7161 | Returns the result of dividing the quadruple-precision floating-point value
7162 | `a' by the corresponding value `b'. The operation is performed according to
7163 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7164 *----------------------------------------------------------------------------*/
7166 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7168 bool aSign
, bSign
, zSign
;
7169 int32_t aExp
, bExp
, zExp
;
7170 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7171 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7173 aSig1
= extractFloat128Frac1( a
);
7174 aSig0
= extractFloat128Frac0( a
);
7175 aExp
= extractFloat128Exp( a
);
7176 aSign
= extractFloat128Sign( a
);
7177 bSig1
= extractFloat128Frac1( b
);
7178 bSig0
= extractFloat128Frac0( b
);
7179 bExp
= extractFloat128Exp( b
);
7180 bSign
= extractFloat128Sign( b
);
7181 zSign
= aSign
^ bSign
;
7182 if ( aExp
== 0x7FFF ) {
7183 if (aSig0
| aSig1
) {
7184 return propagateFloat128NaN(a
, b
, status
);
7186 if ( bExp
== 0x7FFF ) {
7187 if (bSig0
| bSig1
) {
7188 return propagateFloat128NaN(a
, b
, status
);
7192 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7194 if ( bExp
== 0x7FFF ) {
7195 if (bSig0
| bSig1
) {
7196 return propagateFloat128NaN(a
, b
, status
);
7198 return packFloat128( zSign
, 0, 0, 0 );
7201 if ( ( bSig0
| bSig1
) == 0 ) {
7202 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7204 float_raise(float_flag_invalid
, status
);
7205 return float128_default_nan(status
);
7207 float_raise(float_flag_divbyzero
, status
);
7208 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7210 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7213 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7214 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7216 zExp
= aExp
- bExp
+ 0x3FFD;
7218 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7220 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7221 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7222 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7225 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7226 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7227 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7228 while ( (int64_t) rem0
< 0 ) {
7230 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7232 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7233 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7234 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7235 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7236 while ( (int64_t) rem1
< 0 ) {
7238 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7240 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7242 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7243 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7247 /*----------------------------------------------------------------------------
7248 | Returns the remainder of the quadruple-precision floating-point value `a'
7249 | with respect to the corresponding value `b'. The operation is performed
7250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7251 *----------------------------------------------------------------------------*/
7253 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7256 int32_t aExp
, bExp
, expDiff
;
7257 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7258 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7261 aSig1
= extractFloat128Frac1( a
);
7262 aSig0
= extractFloat128Frac0( a
);
7263 aExp
= extractFloat128Exp( a
);
7264 aSign
= extractFloat128Sign( a
);
7265 bSig1
= extractFloat128Frac1( b
);
7266 bSig0
= extractFloat128Frac0( b
);
7267 bExp
= extractFloat128Exp( b
);
7268 if ( aExp
== 0x7FFF ) {
7269 if ( ( aSig0
| aSig1
)
7270 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7271 return propagateFloat128NaN(a
, b
, status
);
7275 if ( bExp
== 0x7FFF ) {
7276 if (bSig0
| bSig1
) {
7277 return propagateFloat128NaN(a
, b
, status
);
7282 if ( ( bSig0
| bSig1
) == 0 ) {
7284 float_raise(float_flag_invalid
, status
);
7285 return float128_default_nan(status
);
7287 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7290 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7291 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7293 expDiff
= aExp
- bExp
;
7294 if ( expDiff
< -1 ) return a
;
7296 aSig0
| UINT64_C(0x0001000000000000),
7298 15 - ( expDiff
< 0 ),
7303 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7304 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7305 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7307 while ( 0 < expDiff
) {
7308 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7309 q
= ( 4 < q
) ? q
- 4 : 0;
7310 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7311 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7312 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7313 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7316 if ( -64 < expDiff
) {
7317 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7318 q
= ( 4 < q
) ? q
- 4 : 0;
7320 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7322 if ( expDiff
< 0 ) {
7323 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7326 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7328 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7329 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7332 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7333 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7336 alternateASig0
= aSig0
;
7337 alternateASig1
= aSig1
;
7339 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7340 } while ( 0 <= (int64_t) aSig0
);
7342 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7343 if ( ( sigMean0
< 0 )
7344 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7345 aSig0
= alternateASig0
;
7346 aSig1
= alternateASig1
;
7348 zSign
= ( (int64_t) aSig0
< 0 );
7349 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7350 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7354 /*----------------------------------------------------------------------------
7355 | Returns the square root of the quadruple-precision floating-point value `a'.
7356 | The operation is performed according to the IEC/IEEE Standard for Binary
7357 | Floating-Point Arithmetic.
7358 *----------------------------------------------------------------------------*/
7360 float128
float128_sqrt(float128 a
, float_status
*status
)
7364 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7365 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7367 aSig1
= extractFloat128Frac1( a
);
7368 aSig0
= extractFloat128Frac0( a
);
7369 aExp
= extractFloat128Exp( a
);
7370 aSign
= extractFloat128Sign( a
);
7371 if ( aExp
== 0x7FFF ) {
7372 if (aSig0
| aSig1
) {
7373 return propagateFloat128NaN(a
, a
, status
);
7375 if ( ! aSign
) return a
;
7379 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7381 float_raise(float_flag_invalid
, status
);
7382 return float128_default_nan(status
);
7385 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7386 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7388 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7389 aSig0
|= UINT64_C(0x0001000000000000);
7390 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7391 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7392 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7393 doubleZSig0
= zSig0
<<1;
7394 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7395 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7396 while ( (int64_t) rem0
< 0 ) {
7399 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7401 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7402 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7403 if ( zSig1
== 0 ) zSig1
= 1;
7404 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7405 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7406 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7407 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7408 while ( (int64_t) rem1
< 0 ) {
7410 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7412 term2
|= doubleZSig0
;
7413 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7415 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7417 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7418 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7422 static inline FloatRelation
7423 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7424 float_status
*status
)
7428 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7429 float_raise(float_flag_invalid
, status
);
7430 return float_relation_unordered
;
7432 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7433 ( extractFloatx80Frac( a
)<<1 ) ) ||
7434 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7435 ( extractFloatx80Frac( b
)<<1 ) )) {
7437 floatx80_is_signaling_nan(a
, status
) ||
7438 floatx80_is_signaling_nan(b
, status
)) {
7439 float_raise(float_flag_invalid
, status
);
7441 return float_relation_unordered
;
7443 aSign
= extractFloatx80Sign( a
);
7444 bSign
= extractFloatx80Sign( b
);
7445 if ( aSign
!= bSign
) {
7447 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7448 ( ( a
.low
| b
.low
) == 0 ) ) {
7450 return float_relation_equal
;
7452 return 1 - (2 * aSign
);
7455 /* Normalize pseudo-denormals before comparison. */
7456 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7459 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7462 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7463 return float_relation_equal
;
7465 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7470 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7472 return floatx80_compare_internal(a
, b
, 0, status
);
7475 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7476 float_status
*status
)
7478 return floatx80_compare_internal(a
, b
, 1, status
);
7481 static inline FloatRelation
7482 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7483 float_status
*status
)
7487 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7488 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7489 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7490 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7492 float128_is_signaling_nan(a
, status
) ||
7493 float128_is_signaling_nan(b
, status
)) {
7494 float_raise(float_flag_invalid
, status
);
7496 return float_relation_unordered
;
7498 aSign
= extractFloat128Sign( a
);
7499 bSign
= extractFloat128Sign( b
);
7500 if ( aSign
!= bSign
) {
7501 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7503 return float_relation_equal
;
7505 return 1 - (2 * aSign
);
7508 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7509 return float_relation_equal
;
7511 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7516 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7518 return float128_compare_internal(a
, b
, 0, status
);
7521 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7522 float_status
*status
)
7524 return float128_compare_internal(a
, b
, 1, status
);
7527 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7533 if (floatx80_invalid_encoding(a
)) {
7534 float_raise(float_flag_invalid
, status
);
7535 return floatx80_default_nan(status
);
7537 aSig
= extractFloatx80Frac( a
);
7538 aExp
= extractFloatx80Exp( a
);
7539 aSign
= extractFloatx80Sign( a
);
7541 if ( aExp
== 0x7FFF ) {
7543 return propagateFloatx80NaN(a
, a
, status
);
7557 } else if (n
< -0x10000) {
7562 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7563 aSign
, aExp
, aSig
, 0, status
);
7566 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7570 uint64_t aSig0
, aSig1
;
7572 aSig1
= extractFloat128Frac1( a
);
7573 aSig0
= extractFloat128Frac0( a
);
7574 aExp
= extractFloat128Exp( a
);
7575 aSign
= extractFloat128Sign( a
);
7576 if ( aExp
== 0x7FFF ) {
7577 if ( aSig0
| aSig1
) {
7578 return propagateFloat128NaN(a
, a
, status
);
7583 aSig0
|= UINT64_C(0x0001000000000000);
7584 } else if (aSig0
== 0 && aSig1
== 0) {
7592 } else if (n
< -0x10000) {
7597 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7602 static void __attribute__((constructor
)) softfloat_init(void)
7604 union_float64 ua
, ub
, uc
, ur
;
7606 if (QEMU_NO_HARDFLOAT
) {
7610 * Test that the host's FMA is not obviously broken. For example,
7611 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7612 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7614 ua
.s
= 0x0020000000000001ULL
;
7615 ub
.s
= 0x3ca0000000000000ULL
;
7616 uc
.s
= 0x0020000000000000ULL
;
7617 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7618 if (ur
.s
!= 0x0020000000000001ULL
) {
7619 force_soft_fma
= true;