4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float. The
504 * exponent is unbiased and the fraction is normalized. All
505 * calculations are done with a 64 bit fraction and then rounded as
506 * appropriate for the final format.
508 * Thanks to the packed FloatClass a decent compiler should be able to
509 * fit the whole structure into registers and avoid using the stack
510 * for parameter passing.
520 #define DECOMPOSED_BINARY_POINT 63
521 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt bfloat16_params
= {
574 static const FloatFmt float32_params
= {
578 static const FloatFmt float64_params
= {
582 /* Unpack a float to parts, but do not canonicalize. */
583 static inline FloatParts64
unpack_raw(FloatFmt fmt
, uint64_t raw
)
585 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
587 return (FloatParts64
) {
588 .cls
= float_class_unclassified
,
589 .sign
= extract64(raw
, sign_pos
, 1),
590 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
591 .frac
= extract64(raw
, 0, fmt
.frac_size
),
595 static inline FloatParts64
float16_unpack_raw(float16 f
)
597 return unpack_raw(float16_params
, f
);
600 static inline FloatParts64
bfloat16_unpack_raw(bfloat16 f
)
602 return unpack_raw(bfloat16_params
, f
);
605 static inline FloatParts64
float32_unpack_raw(float32 f
)
607 return unpack_raw(float32_params
, f
);
610 static inline FloatParts64
float64_unpack_raw(float64 f
)
612 return unpack_raw(float64_params
, f
);
615 /* Pack a float from parts, but do not canonicalize. */
616 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts64 p
)
618 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
619 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
620 return deposit64(ret
, sign_pos
, 1, p
.sign
);
623 static inline float16
float16_pack_raw(FloatParts64 p
)
625 return make_float16(pack_raw(float16_params
, p
));
628 static inline bfloat16
bfloat16_pack_raw(FloatParts64 p
)
630 return pack_raw(bfloat16_params
, p
);
633 static inline float32
float32_pack_raw(FloatParts64 p
)
635 return make_float32(pack_raw(float32_params
, p
));
638 static inline float64
float64_pack_raw(FloatParts64 p
)
640 return make_float64(pack_raw(float64_params
, p
));
643 /*----------------------------------------------------------------------------
644 | Functions and definitions to determine: (1) whether tininess for underflow
645 | is detected before or after rounding by default, (2) what (if anything)
646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
648 | are propagated from function inputs to output. These details are target-
650 *----------------------------------------------------------------------------*/
651 #include "softfloat-specialize.c.inc"
653 #define parts_default_nan parts64_default_nan
655 /* Canonicalize EXP and FRAC, setting CLS. */
656 static FloatParts64
sf_canonicalize(FloatParts64 part
, const FloatFmt
*parm
,
657 float_status
*status
)
659 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
660 if (part
.frac
== 0) {
661 part
.cls
= float_class_inf
;
663 part
.frac
<<= parm
->frac_shift
;
664 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
665 ? float_class_snan
: float_class_qnan
);
667 } else if (part
.exp
== 0) {
668 if (likely(part
.frac
== 0)) {
669 part
.cls
= float_class_zero
;
670 } else if (status
->flush_inputs_to_zero
) {
671 float_raise(float_flag_input_denormal
, status
);
672 part
.cls
= float_class_zero
;
675 int shift
= clz64(part
.frac
);
676 part
.cls
= float_class_normal
;
677 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
681 part
.cls
= float_class_normal
;
682 part
.exp
-= parm
->exp_bias
;
683 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
688 /* Round and uncanonicalize a floating-point number by parts. There
689 * are FRAC_SHIFT bits that may require rounding at the bottom of the
690 * fraction; these bits will be removed. The exponent will be biased
691 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
694 static FloatParts64
round_canonical(FloatParts64 p
, float_status
*s
,
695 const FloatFmt
*parm
)
697 const uint64_t frac_lsb
= parm
->frac_lsb
;
698 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
699 const uint64_t round_mask
= parm
->round_mask
;
700 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
701 const int exp_max
= parm
->exp_max
;
702 const int frac_shift
= parm
->frac_shift
;
711 case float_class_normal
:
712 switch (s
->float_rounding_mode
) {
713 case float_round_nearest_even
:
714 overflow_norm
= false;
715 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
717 case float_round_ties_away
:
718 overflow_norm
= false;
721 case float_round_to_zero
:
722 overflow_norm
= true;
726 inc
= p
.sign
? 0 : round_mask
;
727 overflow_norm
= p
.sign
;
729 case float_round_down
:
730 inc
= p
.sign
? round_mask
: 0;
731 overflow_norm
= !p
.sign
;
733 case float_round_to_odd
:
734 overflow_norm
= true;
735 inc
= frac
& frac_lsb
? 0 : round_mask
;
738 g_assert_not_reached();
741 exp
+= parm
->exp_bias
;
742 if (likely(exp
> 0)) {
743 if (frac
& round_mask
) {
744 flags
|= float_flag_inexact
;
745 if (uadd64_overflow(frac
, inc
, &frac
)) {
746 frac
= (frac
>> 1) | DECOMPOSED_IMPLICIT_BIT
;
752 if (parm
->arm_althp
) {
753 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
754 if (unlikely(exp
> exp_max
)) {
755 /* Overflow. Return the maximum normal. */
756 flags
= float_flag_invalid
;
760 } else if (unlikely(exp
>= exp_max
)) {
761 flags
|= float_flag_overflow
| float_flag_inexact
;
766 p
.cls
= float_class_inf
;
770 } else if (s
->flush_to_zero
) {
771 flags
|= float_flag_output_denormal
;
772 p
.cls
= float_class_zero
;
775 bool is_tiny
= s
->tininess_before_rounding
|| (exp
< 0);
779 is_tiny
= !uadd64_overflow(frac
, inc
, &discard
);
782 shift64RightJamming(frac
, 1 - exp
, &frac
);
783 if (frac
& round_mask
) {
784 /* Need to recompute round-to-even. */
785 switch (s
->float_rounding_mode
) {
786 case float_round_nearest_even
:
787 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
790 case float_round_to_odd
:
791 inc
= frac
& frac_lsb
? 0 : round_mask
;
796 flags
|= float_flag_inexact
;
800 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
803 if (is_tiny
&& (flags
& float_flag_inexact
)) {
804 flags
|= float_flag_underflow
;
806 if (exp
== 0 && frac
== 0) {
807 p
.cls
= float_class_zero
;
812 case float_class_zero
:
818 case float_class_inf
:
820 assert(!parm
->arm_althp
);
825 case float_class_qnan
:
826 case float_class_snan
:
827 assert(!parm
->arm_althp
);
829 frac
>>= parm
->frac_shift
;
833 g_assert_not_reached();
836 float_raise(flags
, s
);
842 static FloatParts64
return_nan(FloatParts64 a
, float_status
*s
)
844 g_assert(is_nan(a
.cls
));
845 if (is_snan(a
.cls
)) {
846 float_raise(float_flag_invalid
, s
);
847 if (!s
->default_nan_mode
) {
848 return parts_silence_nan(a
, s
);
850 } else if (!s
->default_nan_mode
) {
853 parts_default_nan(&a
, s
);
857 static FloatParts64
pick_nan(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
859 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
860 float_raise(float_flag_invalid
, s
);
863 if (s
->default_nan_mode
) {
864 parts_default_nan(&a
, s
);
866 if (pickNaN(a
.cls
, b
.cls
,
868 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
871 if (is_snan(a
.cls
)) {
872 return parts_silence_nan(a
, s
);
878 static FloatParts64
pick_nan_muladd(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
879 bool inf_zero
, float_status
*s
)
883 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
884 float_raise(float_flag_invalid
, s
);
887 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
889 if (s
->default_nan_mode
) {
890 /* Note that this check is after pickNaNMulAdd so that function
891 * has an opportunity to set the Invalid flag.
906 parts_default_nan(&a
, s
);
909 g_assert_not_reached();
912 if (is_snan(a
.cls
)) {
913 return parts_silence_nan(a
, s
);
919 * Pack/unpack routines with a specific FloatFmt.
922 static FloatParts64
float16a_unpack_canonical(float16 f
, float_status
*s
,
923 const FloatFmt
*params
)
925 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
928 static FloatParts64
float16_unpack_canonical(float16 f
, float_status
*s
)
930 return float16a_unpack_canonical(f
, s
, &float16_params
);
933 static FloatParts64
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
935 return sf_canonicalize(bfloat16_unpack_raw(f
), &bfloat16_params
, s
);
938 static float16
float16a_round_pack_canonical(FloatParts64 p
, float_status
*s
,
939 const FloatFmt
*params
)
941 return float16_pack_raw(round_canonical(p
, s
, params
));
944 static float16
float16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
946 return float16a_round_pack_canonical(p
, s
, &float16_params
);
949 static bfloat16
bfloat16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
951 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
954 static FloatParts64
float32_unpack_canonical(float32 f
, float_status
*s
)
956 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
959 static float32
float32_round_pack_canonical(FloatParts64 p
, float_status
*s
)
961 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
964 static FloatParts64
float64_unpack_canonical(float64 f
, float_status
*s
)
966 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
969 static float64
float64_round_pack_canonical(FloatParts64 p
, float_status
*s
)
971 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
975 * Returns the result of adding or subtracting the values of the
976 * floating-point values `a' and `b'. The operation is performed
977 * according to the IEC/IEEE Standard for Binary Floating-Point
981 static FloatParts64
addsub_floats(FloatParts64 a
, FloatParts64 b
, bool subtract
,
984 bool a_sign
= a
.sign
;
985 bool b_sign
= b
.sign
^ subtract
;
987 if (a_sign
!= b_sign
) {
990 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
991 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
992 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
993 a
.frac
= a
.frac
- b
.frac
;
995 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
996 a
.frac
= b
.frac
- a
.frac
;
1002 a
.cls
= float_class_zero
;
1003 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1005 int shift
= clz64(a
.frac
);
1006 a
.frac
= a
.frac
<< shift
;
1007 a
.exp
= a
.exp
- shift
;
1012 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1013 return pick_nan(a
, b
, s
);
1015 if (a
.cls
== float_class_inf
) {
1016 if (b
.cls
== float_class_inf
) {
1017 float_raise(float_flag_invalid
, s
);
1018 parts_default_nan(&a
, s
);
1022 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1023 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1026 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1027 b
.sign
= a_sign
^ 1;
1030 if (b
.cls
== float_class_zero
) {
1035 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1036 if (a
.exp
> b
.exp
) {
1037 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1038 } else if (a
.exp
< b
.exp
) {
1039 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1043 if (uadd64_overflow(a
.frac
, b
.frac
, &a
.frac
)) {
1044 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1045 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
1050 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1051 return pick_nan(a
, b
, s
);
1053 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1056 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1061 g_assert_not_reached();
1065 * Returns the result of adding or subtracting the floating-point
1066 * values `a' and `b'. The operation is performed according to the
1067 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1070 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1072 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1073 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1074 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1076 return float16_round_pack_canonical(pr
, status
);
1079 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1081 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1082 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1083 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1085 return float16_round_pack_canonical(pr
, status
);
1088 static float32 QEMU_SOFTFLOAT_ATTR
1089 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1091 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1092 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1093 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1095 return float32_round_pack_canonical(pr
, status
);
1098 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1100 return soft_f32_addsub(a
, b
, false, status
);
1103 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1105 return soft_f32_addsub(a
, b
, true, status
);
1108 static float64 QEMU_SOFTFLOAT_ATTR
1109 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1111 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1112 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1113 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1115 return float64_round_pack_canonical(pr
, status
);
1118 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1120 return soft_f64_addsub(a
, b
, false, status
);
1123 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1125 return soft_f64_addsub(a
, b
, true, status
);
1128 static float hard_f32_add(float a
, float b
)
1133 static float hard_f32_sub(float a
, float b
)
1138 static double hard_f64_add(double a
, double b
)
1143 static double hard_f64_sub(double a
, double b
)
1148 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1150 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1151 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1153 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1156 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1158 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1159 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1161 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1165 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1166 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1168 return float32_gen2(a
, b
, s
, hard
, soft
,
1169 f32_is_zon2
, f32_addsubmul_post
);
1172 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1173 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1175 return float64_gen2(a
, b
, s
, hard
, soft
,
1176 f64_is_zon2
, f64_addsubmul_post
);
1179 float32 QEMU_FLATTEN
1180 float32_add(float32 a
, float32 b
, float_status
*s
)
1182 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1185 float32 QEMU_FLATTEN
1186 float32_sub(float32 a
, float32 b
, float_status
*s
)
1188 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1191 float64 QEMU_FLATTEN
1192 float64_add(float64 a
, float64 b
, float_status
*s
)
1194 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1197 float64 QEMU_FLATTEN
1198 float64_sub(float64 a
, float64 b
, float_status
*s
)
1200 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1204 * Returns the result of adding or subtracting the bfloat16
1205 * values `a' and `b'.
1207 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1209 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1210 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1211 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1213 return bfloat16_round_pack_canonical(pr
, status
);
1216 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1218 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1219 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1220 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1222 return bfloat16_round_pack_canonical(pr
, status
);
1226 * Returns the result of multiplying the floating-point values `a' and
1227 * `b'. The operation is performed according to the IEC/IEEE Standard
1228 * for Binary Floating-Point Arithmetic.
1231 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1233 bool sign
= a
.sign
^ b
.sign
;
1235 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1237 int exp
= a
.exp
+ b
.exp
;
1239 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1240 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1253 /* handle all the NaN cases */
1254 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1255 return pick_nan(a
, b
, s
);
1257 /* Inf * Zero == NaN */
1258 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1259 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1260 float_raise(float_flag_invalid
, s
);
1261 parts_default_nan(&a
, s
);
1264 /* Multiply by 0 or Inf */
1265 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1269 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1273 g_assert_not_reached();
1276 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1278 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1279 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1280 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1282 return float16_round_pack_canonical(pr
, status
);
1285 static float32 QEMU_SOFTFLOAT_ATTR
1286 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1288 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1289 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1290 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1292 return float32_round_pack_canonical(pr
, status
);
1295 static float64 QEMU_SOFTFLOAT_ATTR
1296 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1298 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1299 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1300 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1302 return float64_round_pack_canonical(pr
, status
);
1305 static float hard_f32_mul(float a
, float b
)
1310 static double hard_f64_mul(double a
, double b
)
1315 float32 QEMU_FLATTEN
1316 float32_mul(float32 a
, float32 b
, float_status
*s
)
1318 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1319 f32_is_zon2
, f32_addsubmul_post
);
1322 float64 QEMU_FLATTEN
1323 float64_mul(float64 a
, float64 b
, float_status
*s
)
1325 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1326 f64_is_zon2
, f64_addsubmul_post
);
1330 * Returns the result of multiplying the bfloat16
1331 * values `a' and `b'.
1334 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1336 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1337 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1338 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1340 return bfloat16_round_pack_canonical(pr
, status
);
1344 * Returns the result of multiplying the floating-point values `a' and
1345 * `b' then adding 'c', with no intermediate rounding step after the
1346 * multiplication. The operation is performed according to the
1347 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1348 * The flags argument allows the caller to select negation of the
1349 * addend, the intermediate product, or the final result. (The
1350 * difference between this and having the caller do a separate
1351 * negation is that negating externally will flip the sign bit on
1355 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1356 int flags
, float_status
*s
)
1358 bool inf_zero
, p_sign
;
1359 bool sign_flip
= flags
& float_muladd_negate_result
;
1363 int ab_mask
, abc_mask
;
1365 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1366 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1367 inf_zero
= ab_mask
== float_cmask_infzero
;
1369 /* It is implementation-defined whether the cases of (0,inf,qnan)
1370 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1371 * they return if they do), so we have to hand this information
1372 * off to the target-specific pick-a-NaN routine.
1374 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1375 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1379 float_raise(float_flag_invalid
, s
);
1380 parts_default_nan(&a
, s
);
1384 if (flags
& float_muladd_negate_c
) {
1388 p_sign
= a
.sign
^ b
.sign
;
1390 if (flags
& float_muladd_negate_product
) {
1394 if (ab_mask
& float_cmask_inf
) {
1395 p_class
= float_class_inf
;
1396 } else if (ab_mask
& float_cmask_zero
) {
1397 p_class
= float_class_zero
;
1399 p_class
= float_class_normal
;
1402 if (c
.cls
== float_class_inf
) {
1403 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1404 float_raise(float_flag_invalid
, s
);
1405 parts_default_nan(&c
, s
);
1407 c
.sign
^= sign_flip
;
1412 if (p_class
== float_class_inf
) {
1413 a
.cls
= float_class_inf
;
1414 a
.sign
= p_sign
^ sign_flip
;
1418 if (p_class
== float_class_zero
) {
1419 if (c
.cls
== float_class_zero
) {
1420 if (p_sign
!= c
.sign
) {
1421 p_sign
= s
->float_rounding_mode
== float_round_down
;
1424 } else if (flags
& float_muladd_halve_result
) {
1427 c
.sign
^= sign_flip
;
1431 /* a & b should be normals now... */
1432 assert(a
.cls
== float_class_normal
&&
1433 b
.cls
== float_class_normal
);
1435 p_exp
= a
.exp
+ b
.exp
;
1437 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1439 /* Renormalize to the msb. */
1440 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1443 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1447 if (c
.cls
!= float_class_zero
) {
1448 int exp_diff
= p_exp
- c
.exp
;
1449 if (p_sign
== c
.sign
) {
1451 if (exp_diff
<= 0) {
1452 shift64RightJamming(hi
, -exp_diff
, &hi
);
1454 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1455 shift64RightJamming(hi
, 1, &hi
);
1456 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1460 uint64_t c_hi
, c_lo
, over
;
1461 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1462 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1464 shift64RightJamming(hi
, 1, &hi
);
1465 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1471 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1473 if (exp_diff
<= 0) {
1474 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1477 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1478 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1480 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1485 shift128RightJamming(c_hi
, c_lo
,
1488 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1491 if (hi
== 0 && lo
== 0) {
1492 a
.cls
= float_class_zero
;
1493 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1494 a
.sign
^= sign_flip
;
1501 shift
= clz64(lo
) + 64;
1503 /* Normalizing to a binary point of 124 is the
1504 correct adjust for the exponent. However since we're
1505 shifting, we might as well put the binary point back
1506 at 63 where we really want it. Therefore shift as
1507 if we're leaving 1 bit at the top of the word, but
1508 adjust the exponent as if we're leaving 3 bits. */
1509 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1516 if (flags
& float_muladd_halve_result
) {
1520 /* finally prepare our result */
1521 a
.cls
= float_class_normal
;
1522 a
.sign
= p_sign
^ sign_flip
;
1529 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1530 int flags
, float_status
*status
)
1532 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1533 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1534 FloatParts64 pc
= float16_unpack_canonical(c
, status
);
1535 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1537 return float16_round_pack_canonical(pr
, status
);
1540 static float32 QEMU_SOFTFLOAT_ATTR
1541 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1542 float_status
*status
)
1544 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1545 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1546 FloatParts64 pc
= float32_unpack_canonical(c
, status
);
1547 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1549 return float32_round_pack_canonical(pr
, status
);
1552 static float64 QEMU_SOFTFLOAT_ATTR
1553 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1554 float_status
*status
)
1556 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1557 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1558 FloatParts64 pc
= float64_unpack_canonical(c
, status
);
1559 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1561 return float64_round_pack_canonical(pr
, status
);
1564 static bool force_soft_fma
;
1566 float32 QEMU_FLATTEN
1567 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1569 union_float32 ua
, ub
, uc
, ur
;
1575 if (unlikely(!can_use_fpu(s
))) {
1578 if (unlikely(flags
& float_muladd_halve_result
)) {
1582 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1583 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1587 if (unlikely(force_soft_fma
)) {
1592 * When (a || b) == 0, there's no need to check for under/over flow,
1593 * since we know the addend is (normal || 0) and the product is 0.
1595 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1599 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1600 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1601 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1603 if (flags
& float_muladd_negate_c
) {
1608 union_float32 ua_orig
= ua
;
1609 union_float32 uc_orig
= uc
;
1611 if (flags
& float_muladd_negate_product
) {
1614 if (flags
& float_muladd_negate_c
) {
1618 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1620 if (unlikely(f32_is_inf(ur
))) {
1621 float_raise(float_flag_overflow
, s
);
1622 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1628 if (flags
& float_muladd_negate_result
) {
1629 return float32_chs(ur
.s
);
1634 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1637 float64 QEMU_FLATTEN
1638 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1640 union_float64 ua
, ub
, uc
, ur
;
1646 if (unlikely(!can_use_fpu(s
))) {
1649 if (unlikely(flags
& float_muladd_halve_result
)) {
1653 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1654 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1658 if (unlikely(force_soft_fma
)) {
1663 * When (a || b) == 0, there's no need to check for under/over flow,
1664 * since we know the addend is (normal || 0) and the product is 0.
1666 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1670 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1671 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1672 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1674 if (flags
& float_muladd_negate_c
) {
1679 union_float64 ua_orig
= ua
;
1680 union_float64 uc_orig
= uc
;
1682 if (flags
& float_muladd_negate_product
) {
1685 if (flags
& float_muladd_negate_c
) {
1689 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1691 if (unlikely(f64_is_inf(ur
))) {
1692 float_raise(float_flag_overflow
, s
);
1693 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1699 if (flags
& float_muladd_negate_result
) {
1700 return float64_chs(ur
.s
);
1705 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1709 * Returns the result of multiplying the bfloat16 values `a'
1710 * and `b' then adding 'c', with no intermediate rounding step after the
1714 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1715 int flags
, float_status
*status
)
1717 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1718 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1719 FloatParts64 pc
= bfloat16_unpack_canonical(c
, status
);
1720 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1722 return bfloat16_round_pack_canonical(pr
, status
);
1726 * Returns the result of dividing the floating-point value `a' by the
1727 * corresponding value `b'. The operation is performed according to
1728 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1731 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1733 bool sign
= a
.sign
^ b
.sign
;
1735 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1736 uint64_t n0
, n1
, q
, r
;
1737 int exp
= a
.exp
- b
.exp
;
1740 * We want a 2*N / N-bit division to produce exactly an N-bit
1741 * result, so that we do not lose any precision and so that we
1742 * do not have to renormalize afterward. If A.frac < B.frac,
1743 * then division would produce an (N-1)-bit result; shift A left
1744 * by one to produce the an N-bit result, and decrement the
1745 * exponent to match.
1747 * The udiv_qrnnd algorithm that we're using requires normalization,
1748 * i.e. the msb of the denominator must be set, which is already true.
1750 if (a
.frac
< b
.frac
) {
1752 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1754 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1756 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1758 /* Set lsb if there is a remainder, to set inexact. */
1759 a
.frac
= q
| (r
!= 0);
1764 /* handle all the NaN cases */
1765 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1766 return pick_nan(a
, b
, s
);
1768 /* 0/0 or Inf/Inf */
1771 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1772 float_raise(float_flag_invalid
, s
);
1773 parts_default_nan(&a
, s
);
1776 /* Inf / x or 0 / x */
1777 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1782 if (b
.cls
== float_class_zero
) {
1783 float_raise(float_flag_divbyzero
, s
);
1784 a
.cls
= float_class_inf
;
1789 if (b
.cls
== float_class_inf
) {
1790 a
.cls
= float_class_zero
;
1794 g_assert_not_reached();
1797 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1799 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1800 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1801 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1803 return float16_round_pack_canonical(pr
, status
);
1806 static float32 QEMU_SOFTFLOAT_ATTR
1807 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1809 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1810 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1811 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1813 return float32_round_pack_canonical(pr
, status
);
1816 static float64 QEMU_SOFTFLOAT_ATTR
1817 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1819 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1820 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1821 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1823 return float64_round_pack_canonical(pr
, status
);
1826 static float hard_f32_div(float a
, float b
)
1831 static double hard_f64_div(double a
, double b
)
1836 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1838 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1839 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1840 fpclassify(b
.h
) == FP_NORMAL
;
1842 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1845 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1847 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1848 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1849 fpclassify(b
.h
) == FP_NORMAL
;
1851 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1854 static bool f32_div_post(union_float32 a
, union_float32 b
)
1856 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1857 return fpclassify(a
.h
) != FP_ZERO
;
1859 return !float32_is_zero(a
.s
);
1862 static bool f64_div_post(union_float64 a
, union_float64 b
)
1864 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1865 return fpclassify(a
.h
) != FP_ZERO
;
1867 return !float64_is_zero(a
.s
);
1870 float32 QEMU_FLATTEN
1871 float32_div(float32 a
, float32 b
, float_status
*s
)
1873 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1874 f32_div_pre
, f32_div_post
);
1877 float64 QEMU_FLATTEN
1878 float64_div(float64 a
, float64 b
, float_status
*s
)
1880 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1881 f64_div_pre
, f64_div_post
);
1885 * Returns the result of dividing the bfloat16
1886 * value `a' by the corresponding value `b'.
1889 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1891 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1892 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1893 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1895 return bfloat16_round_pack_canonical(pr
, status
);
1899 * Float to Float conversions
1901 * Returns the result of converting one float format to another. The
1902 * conversion is performed according to the IEC/IEEE Standard for
1903 * Binary Floating-Point Arithmetic.
1905 * The float_to_float helper only needs to take care of raising
1906 * invalid exceptions and handling the conversion on NaNs.
1909 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1912 if (dstf
->arm_althp
) {
1914 case float_class_qnan
:
1915 case float_class_snan
:
1916 /* There is no NaN in the destination format. Raise Invalid
1917 * and return a zero with the sign of the input NaN.
1919 float_raise(float_flag_invalid
, s
);
1920 a
.cls
= float_class_zero
;
1925 case float_class_inf
:
1926 /* There is no Inf in the destination format. Raise Invalid
1927 * and return the maximum normal with the correct sign.
1929 float_raise(float_flag_invalid
, s
);
1930 a
.cls
= float_class_normal
;
1931 a
.exp
= dstf
->exp_max
;
1932 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1938 } else if (is_nan(a
.cls
)) {
1939 return return_nan(a
, s
);
1944 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1946 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1947 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1948 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
1949 return float32_round_pack_canonical(pr
, s
);
1952 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1954 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1955 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1956 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1957 return float64_round_pack_canonical(pr
, s
);
1960 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1962 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1963 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1964 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1965 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1968 static float64 QEMU_SOFTFLOAT_ATTR
1969 soft_float32_to_float64(float32 a
, float_status
*s
)
1971 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1972 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1973 return float64_round_pack_canonical(pr
, s
);
1976 float64
float32_to_float64(float32 a
, float_status
*s
)
1978 if (likely(float32_is_normal(a
))) {
1979 /* Widening conversion can never produce inexact results. */
1985 } else if (float32_is_zero(a
)) {
1986 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1988 return soft_float32_to_float64(a
, s
);
1992 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1994 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1995 FloatParts64 p
= float64_unpack_canonical(a
, s
);
1996 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1997 return float16a_round_pack_canonical(pr
, s
, fmt16
);
2000 float32
float64_to_float32(float64 a
, float_status
*s
)
2002 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2003 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2004 return float32_round_pack_canonical(pr
, s
);
2007 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2009 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2010 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2011 return float32_round_pack_canonical(pr
, s
);
2014 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2016 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2017 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
2018 return float64_round_pack_canonical(pr
, s
);
2021 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2023 FloatParts64 p
= float32_unpack_canonical(a
, s
);
2024 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2025 return bfloat16_round_pack_canonical(pr
, s
);
2028 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2030 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2031 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2032 return bfloat16_round_pack_canonical(pr
, s
);
2036 * Rounds the floating-point value `a' to an integer, and returns the
2037 * result as a floating-point value. The operation is performed
2038 * according to the IEC/IEEE Standard for Binary Floating-Point
2042 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2043 int scale
, float_status
*s
)
2046 case float_class_qnan
:
2047 case float_class_snan
:
2048 return return_nan(a
, s
);
2050 case float_class_zero
:
2051 case float_class_inf
:
2052 /* already "integral" */
2055 case float_class_normal
:
2056 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2059 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2060 /* already integral */
2065 /* all fractional */
2066 float_raise(float_flag_inexact
, s
);
2068 case float_round_nearest_even
:
2069 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2071 case float_round_ties_away
:
2072 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2074 case float_round_to_zero
:
2077 case float_round_up
:
2080 case float_round_down
:
2083 case float_round_to_odd
:
2087 g_assert_not_reached();
2091 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2094 a
.cls
= float_class_zero
;
2097 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2098 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2099 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2100 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2104 case float_round_nearest_even
:
2105 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2107 case float_round_ties_away
:
2110 case float_round_to_zero
:
2113 case float_round_up
:
2114 inc
= a
.sign
? 0 : rnd_mask
;
2116 case float_round_down
:
2117 inc
= a
.sign
? rnd_mask
: 0;
2119 case float_round_to_odd
:
2120 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2123 g_assert_not_reached();
2126 if (a
.frac
& rnd_mask
) {
2127 float_raise(float_flag_inexact
, s
);
2128 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2130 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2133 a
.frac
&= ~rnd_mask
;
2138 g_assert_not_reached();
2143 float16
float16_round_to_int(float16 a
, float_status
*s
)
2145 FloatParts64 pa
= float16_unpack_canonical(a
, s
);
2146 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2147 return float16_round_pack_canonical(pr
, s
);
2150 float32
float32_round_to_int(float32 a
, float_status
*s
)
2152 FloatParts64 pa
= float32_unpack_canonical(a
, s
);
2153 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2154 return float32_round_pack_canonical(pr
, s
);
2157 float64
float64_round_to_int(float64 a
, float_status
*s
)
2159 FloatParts64 pa
= float64_unpack_canonical(a
, s
);
2160 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2161 return float64_round_pack_canonical(pr
, s
);
2165 * Rounds the bfloat16 value `a' to an integer, and returns the
2166 * result as a bfloat16 value.
2169 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2171 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
2172 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2173 return bfloat16_round_pack_canonical(pr
, s
);
2177 * Returns the result of converting the floating-point value `a' to
2178 * the two's complement integer format. The conversion is performed
2179 * according to the IEC/IEEE Standard for Binary Floating-Point
2180 * Arithmetic---which means in particular that the conversion is
2181 * rounded according to the current rounding mode. If `a' is a NaN,
2182 * the largest positive integer is returned. Otherwise, if the
2183 * conversion overflows, the largest integer with the same sign as `a'
2187 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2188 int scale
, int64_t min
, int64_t max
,
2192 int orig_flags
= get_float_exception_flags(s
);
2193 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2196 case float_class_snan
:
2197 case float_class_qnan
:
2198 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2200 case float_class_inf
:
2201 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2202 return p
.sign
? min
: max
;
2203 case float_class_zero
:
2205 case float_class_normal
:
2206 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2207 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2212 if (r
<= -(uint64_t) min
) {
2215 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2222 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2227 g_assert_not_reached();
2231 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2234 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2235 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2238 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2241 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2242 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2245 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2248 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2249 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2252 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2255 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2256 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2259 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2262 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2263 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2266 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2269 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2270 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2273 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2276 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2277 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2280 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2283 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2284 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2287 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2290 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2291 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2294 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2297 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2298 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2301 int8_t float16_to_int8(float16 a
, float_status
*s
)
2303 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2306 int16_t float16_to_int16(float16 a
, float_status
*s
)
2308 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2311 int32_t float16_to_int32(float16 a
, float_status
*s
)
2313 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2316 int64_t float16_to_int64(float16 a
, float_status
*s
)
2318 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2321 int16_t float32_to_int16(float32 a
, float_status
*s
)
2323 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2326 int32_t float32_to_int32(float32 a
, float_status
*s
)
2328 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2331 int64_t float32_to_int64(float32 a
, float_status
*s
)
2333 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2336 int16_t float64_to_int16(float64 a
, float_status
*s
)
2338 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2341 int32_t float64_to_int32(float64 a
, float_status
*s
)
2343 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2346 int64_t float64_to_int64(float64 a
, float_status
*s
)
2348 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Signed conversions with an explicit round-toward-zero (truncation)
 * rounding mode, independent of the mode stored in the float_status.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}
/* bfloat16 signed conversions: current rounding mode, then truncating. */

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Snapshot the flags before rounding so that invalid-operation
     * outcomes can replace whatever round_to_int raised with a clean
     * "original flags | invalid" result.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* A negative value that survived rounding as a nonzero normal
         * cannot be represented in an unsigned type.
         */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
/*
 * Unsigned conversions with explicit rounding mode and scale; each
 * unpacks to the canonical FloatParts64 form and defers to
 * round_to_uint_and_pack with the destination type's maximum.
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}
/* Unsigned conversions using the status's current rounding mode. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* Unsigned conversions with explicit round-toward-zero semantics. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        /* Take the magnitude, remembering the sign for repacking.
         * The unsigned negation is well-defined even for INT64_MIN.
         */
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        /* Clamp the scale so the exponent arithmetic below cannot
         * overflow the int32_t backing FloatParts64.exp.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
/*
 * Signed integer to float16/float32/float64 conversions.  The 64-bit
 * _scalbn variant does the real work; narrower widths widen to
 * int64_t (always exact) and forward to it.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale so the exponent arithmetic below cannot
         * overflow the int32_t backing FloatParts64.exp.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        int shift = clz64(a);

        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
/*
 * Unsigned integer to float conversions.  As with the signed set, the
 * 64-bit _scalbn variant does the work and narrower widths widen
 * (exactly) to uint64_t before forwarding.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and minNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Flatten the class into an effective exponent so zeroes and
         * infinities order correctly against normal numbers.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison ignores the signs unless the magnitudes
         * are equal, in which case we fall through to the signed path.
         */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs: the negative operand is the smaller. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/* Expand the public float16/32/64 min/max family from minmax_floats.
 * Note: the argument spelling "isiee" is historical and matches the
 * instantiation comments below.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);    \
                                                                        \
    return float ## sz ## _round_pack_canonical(pr, s);                 \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
/* Same expansion for the bfloat16 min/max family. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);                  \
    FloatParts64 pb = bfloat16_unpack_canonical(b, s);                  \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);    \
                                                                        \
    return bfloat16_round_pack_canonical(pr, s);                        \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
/* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* Quiet compares only signal invalid for signalling NaNs. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponents, then fractions; a negative sign
     * inverts the magnitude ordering.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
/* Expand the softfloat comparison helpers for each binary width. */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                               \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
/* Hardfloat fast path for float32 comparison: use the host FPU's
 * type-generic comparison macros when possible, falling back to the
 * softfloat implementation for the unordered case (to set flags).
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
/* Hardfloat fast path for float64 comparison; mirrors f32_compare. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
/* bfloat16 comparison has no hardfloat fast path; always softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);
    FloatParts64 pb = bfloat16_unpack_canonical(b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
/* Multiply A by 2 raised to the power N.  */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    /* Zeroes and infinities pass through unchanged. */
    return a;
}
/* Public scalbn entry points: unpack, scale, repack with rounding. */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa = float16_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa = float32_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa = float64_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(pr, status);
}
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative number: invalid operation, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
/* Softfloat sqrt entry points built on sqrt_float. */

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa = float16_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa = float32_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa = float64_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(pr, status);
}
/* Hardfloat sqrt fast paths: use the host sqrtf()/sqrt() for inputs
 * that are zero or normal and non-negative, otherwise fall back to
 * the softfloat implementation for full flag/NaN semantics.
 */

float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
/* bfloat16 sqrt: no hardfloat fast path, always softfloat. */
bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(pr, status);
}
3578 /*----------------------------------------------------------------------------
3579 | The pattern for a default generated NaN.
3580 *----------------------------------------------------------------------------*/
3582 float16
float16_default_nan(float_status
*status
)
3586 parts_default_nan(&p
, status
);
3587 p
.frac
>>= float16_params
.frac_shift
;
3588 return float16_pack_raw(p
);
3591 float32
float32_default_nan(float_status
*status
)
3595 parts_default_nan(&p
, status
);
3596 p
.frac
>>= float32_params
.frac_shift
;
3597 return float32_pack_raw(p
);
3600 float64
float64_default_nan(float_status
*status
)
3604 parts_default_nan(&p
, status
);
3605 p
.frac
>>= float64_params
.frac_shift
;
3606 return float64_pack_raw(p
);
3609 float128
float128_default_nan(float_status
*status
)
3614 parts_default_nan(&p
, status
);
3615 /* Extrapolate from the choices made by parts_default_nan to fill
3616 * in the quad-floating format. If the low bit is set, assume we
3617 * want to set all non-snan bits.
3619 r
.low
= -(p
.frac
& 1);
3620 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3621 r
.high
|= UINT64_C(0x7FFF000000000000);
3622 r
.high
|= (uint64_t)p
.sign
<< 63;
3627 bfloat16
bfloat16_default_nan(float_status
*status
)
3631 parts_default_nan(&p
, status
);
3632 p
.frac
>>= bfloat16_params
.frac_shift
;
3633 return bfloat16_pack_raw(p
);
3636 /*----------------------------------------------------------------------------
3637 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3638 *----------------------------------------------------------------------------*/
3640 float16
float16_silence_nan(float16 a
, float_status
*status
)
3642 FloatParts64 p
= float16_unpack_raw(a
);
3643 p
.frac
<<= float16_params
.frac_shift
;
3644 p
= parts_silence_nan(p
, status
);
3645 p
.frac
>>= float16_params
.frac_shift
;
3646 return float16_pack_raw(p
);
3649 float32
float32_silence_nan(float32 a
, float_status
*status
)
3651 FloatParts64 p
= float32_unpack_raw(a
);
3652 p
.frac
<<= float32_params
.frac_shift
;
3653 p
= parts_silence_nan(p
, status
);
3654 p
.frac
>>= float32_params
.frac_shift
;
3655 return float32_pack_raw(p
);
3658 float64
float64_silence_nan(float64 a
, float_status
*status
)
3660 FloatParts64 p
= float64_unpack_raw(a
);
3661 p
.frac
<<= float64_params
.frac_shift
;
3662 p
= parts_silence_nan(p
, status
);
3663 p
.frac
>>= float64_params
.frac_shift
;
3664 return float64_pack_raw(p
);
3667 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3669 FloatParts64 p
= bfloat16_unpack_raw(a
);
3670 p
.frac
<<= bfloat16_params
.frac_shift
;
3671 p
= parts_silence_nan(p
, status
);
3672 p
.frac
>>= bfloat16_params
.frac_shift
;
3673 return bfloat16_pack_raw(p
);
3676 /*----------------------------------------------------------------------------
3677 | If `a' is denormal and we are in flush-to-zero mode then set the
3678 | input-denormal exception and return zero. Otherwise just return the value.
3679 *----------------------------------------------------------------------------*/
3681 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3683 if (p
.exp
== 0 && p
.frac
!= 0) {
3684 float_raise(float_flag_input_denormal
, status
);
3691 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3693 if (status
->flush_inputs_to_zero
) {
3694 FloatParts64 p
= float16_unpack_raw(a
);
3695 if (parts_squash_denormal(p
, status
)) {
3696 return float16_set_sign(float16_zero
, p
.sign
);
3702 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3704 if (status
->flush_inputs_to_zero
) {
3705 FloatParts64 p
= float32_unpack_raw(a
);
3706 if (parts_squash_denormal(p
, status
)) {
3707 return float32_set_sign(float32_zero
, p
.sign
);
3713 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3715 if (status
->flush_inputs_to_zero
) {
3716 FloatParts64 p
= float64_unpack_raw(a
);
3717 if (parts_squash_denormal(p
, status
)) {
3718 return float64_set_sign(float64_zero
, p
.sign
);
3724 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3726 if (status
->flush_inputs_to_zero
) {
3727 FloatParts64 p
= bfloat16_unpack_raw(a
);
3728 if (parts_squash_denormal(p
, status
)) {
3729 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3735 /*----------------------------------------------------------------------------
3736 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3737 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3738 | input. If `zSign' is 1, the input is negated before being converted to an
3739 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3740 | is simply rounded to an integer, with the inexact exception raised if the
3741 | input cannot be represented exactly as an integer. However, if the fixed-
3742 | point input is too large, the invalid exception is raised and the largest
3743 | positive or negative integer is returned.
3744 *----------------------------------------------------------------------------*/
3746 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3747 float_status
*status
)
3749 int8_t roundingMode
;
3750 bool roundNearestEven
;
3751 int8_t roundIncrement
, roundBits
;
3754 roundingMode
= status
->float_rounding_mode
;
3755 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3756 switch (roundingMode
) {
3757 case float_round_nearest_even
:
3758 case float_round_ties_away
:
3759 roundIncrement
= 0x40;
3761 case float_round_to_zero
:
3764 case float_round_up
:
3765 roundIncrement
= zSign
? 0 : 0x7f;
3767 case float_round_down
:
3768 roundIncrement
= zSign
? 0x7f : 0;
3770 case float_round_to_odd
:
3771 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3776 roundBits
= absZ
& 0x7F;
3777 absZ
= ( absZ
+ roundIncrement
)>>7;
3778 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3782 if ( zSign
) z
= - z
;
3783 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3784 float_raise(float_flag_invalid
, status
);
3785 return zSign
? INT32_MIN
: INT32_MAX
;
3788 float_raise(float_flag_inexact
, status
);
3794 /*----------------------------------------------------------------------------
3795 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3796 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3797 | and returns the properly rounded 64-bit integer corresponding to the input.
3798 | If `zSign' is 1, the input is negated before being converted to an integer.
3799 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3800 | the inexact exception raised if the input cannot be represented exactly as
3801 | an integer. However, if the fixed-point input is too large, the invalid
3802 | exception is raised and the largest positive or negative integer is
3804 *----------------------------------------------------------------------------*/
3806 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3807 float_status
*status
)
3809 int8_t roundingMode
;
3810 bool roundNearestEven
, increment
;
3813 roundingMode
= status
->float_rounding_mode
;
3814 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3815 switch (roundingMode
) {
3816 case float_round_nearest_even
:
3817 case float_round_ties_away
:
3818 increment
= ((int64_t) absZ1
< 0);
3820 case float_round_to_zero
:
3823 case float_round_up
:
3824 increment
= !zSign
&& absZ1
;
3826 case float_round_down
:
3827 increment
= zSign
&& absZ1
;
3829 case float_round_to_odd
:
3830 increment
= !(absZ0
& 1) && absZ1
;
3837 if ( absZ0
== 0 ) goto overflow
;
3838 if (!(absZ1
<< 1) && roundNearestEven
) {
3843 if ( zSign
) z
= - z
;
3844 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3846 float_raise(float_flag_invalid
, status
);
3847 return zSign
? INT64_MIN
: INT64_MAX
;
3850 float_raise(float_flag_inexact
, status
);
3856 /*----------------------------------------------------------------------------
3857 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3858 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3859 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3860 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3861 | with the inexact exception raised if the input cannot be represented exactly
3862 | as an integer. However, if the fixed-point input is too large, the invalid
3863 | exception is raised and the largest unsigned integer is returned.
3864 *----------------------------------------------------------------------------*/
3866 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3867 uint64_t absZ1
, float_status
*status
)
3869 int8_t roundingMode
;
3870 bool roundNearestEven
, increment
;
3872 roundingMode
= status
->float_rounding_mode
;
3873 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3874 switch (roundingMode
) {
3875 case float_round_nearest_even
:
3876 case float_round_ties_away
:
3877 increment
= ((int64_t)absZ1
< 0);
3879 case float_round_to_zero
:
3882 case float_round_up
:
3883 increment
= !zSign
&& absZ1
;
3885 case float_round_down
:
3886 increment
= zSign
&& absZ1
;
3888 case float_round_to_odd
:
3889 increment
= !(absZ0
& 1) && absZ1
;
3897 float_raise(float_flag_invalid
, status
);
3900 if (!(absZ1
<< 1) && roundNearestEven
) {
3905 if (zSign
&& absZ0
) {
3906 float_raise(float_flag_invalid
, status
);
3911 float_raise(float_flag_inexact
, status
);
3916 /*----------------------------------------------------------------------------
3917 | Normalizes the subnormal single-precision floating-point value represented
3918 | by the denormalized significand `aSig'. The normalized exponent and
3919 | significand are stored at the locations pointed to by `zExpPtr' and
3920 | `zSigPtr', respectively.
3921 *----------------------------------------------------------------------------*/
3924 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3928 shiftCount
= clz32(aSig
) - 8;
3929 *zSigPtr
= aSig
<<shiftCount
;
3930 *zExpPtr
= 1 - shiftCount
;
3934 /*----------------------------------------------------------------------------
3935 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3936 | and significand `zSig', and returns the proper single-precision floating-
3937 | point value corresponding to the abstract input. Ordinarily, the abstract
3938 | value is simply rounded and packed into the single-precision format, with
3939 | the inexact exception raised if the abstract input cannot be represented
3940 | exactly. However, if the abstract value is too large, the overflow and
3941 | inexact exceptions are raised and an infinity or maximal finite value is
3942 | returned. If the abstract value is too small, the input value is rounded to
3943 | a subnormal number, and the underflow and inexact exceptions are raised if
3944 | the abstract input cannot be represented exactly as a subnormal single-
3945 | precision floating-point number.
3946 | The input significand `zSig' has its binary point between bits 30
3947 | and 29, which is 7 bits to the left of the usual location. This shifted
3948 | significand must be normalized or smaller. If `zSig' is not normalized,
3949 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3950 | and it must not require rounding. In the usual case that `zSig' is
3951 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3952 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3956 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3957 float_status
*status
)
3959 int8_t roundingMode
;
3960 bool roundNearestEven
;
3961 int8_t roundIncrement
, roundBits
;
3964 roundingMode
= status
->float_rounding_mode
;
3965 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3966 switch (roundingMode
) {
3967 case float_round_nearest_even
:
3968 case float_round_ties_away
:
3969 roundIncrement
= 0x40;
3971 case float_round_to_zero
:
3974 case float_round_up
:
3975 roundIncrement
= zSign
? 0 : 0x7f;
3977 case float_round_down
:
3978 roundIncrement
= zSign
? 0x7f : 0;
3980 case float_round_to_odd
:
3981 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3987 roundBits
= zSig
& 0x7F;
3988 if ( 0xFD <= (uint16_t) zExp
) {
3989 if ( ( 0xFD < zExp
)
3990 || ( ( zExp
== 0xFD )
3991 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3993 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3994 roundIncrement
!= 0;
3995 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3996 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3999 if (status
->flush_to_zero
) {
4000 float_raise(float_flag_output_denormal
, status
);
4001 return packFloat32(zSign
, 0, 0);
4003 isTiny
= status
->tininess_before_rounding
4005 || (zSig
+ roundIncrement
< 0x80000000);
4006 shift32RightJamming( zSig
, - zExp
, &zSig
);
4008 roundBits
= zSig
& 0x7F;
4009 if (isTiny
&& roundBits
) {
4010 float_raise(float_flag_underflow
, status
);
4012 if (roundingMode
== float_round_to_odd
) {
4014 * For round-to-odd case, the roundIncrement depends on
4015 * zSig which just changed.
4017 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4022 float_raise(float_flag_inexact
, status
);
4024 zSig
= ( zSig
+ roundIncrement
)>>7;
4025 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4028 if ( zSig
== 0 ) zExp
= 0;
4029 return packFloat32( zSign
, zExp
, zSig
);
4033 /*----------------------------------------------------------------------------
4034 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4035 | and significand `zSig', and returns the proper single-precision floating-
4036 | point value corresponding to the abstract input. This routine is just like
4037 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4038 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4039 | floating-point exponent.
4040 *----------------------------------------------------------------------------*/
4043 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4044 float_status
*status
)
4048 shiftCount
= clz32(zSig
) - 1;
4049 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4054 /*----------------------------------------------------------------------------
4055 | Normalizes the subnormal double-precision floating-point value represented
4056 | by the denormalized significand `aSig'. The normalized exponent and
4057 | significand are stored at the locations pointed to by `zExpPtr' and
4058 | `zSigPtr', respectively.
4059 *----------------------------------------------------------------------------*/
4062 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4066 shiftCount
= clz64(aSig
) - 11;
4067 *zSigPtr
= aSig
<<shiftCount
;
4068 *zExpPtr
= 1 - shiftCount
;
4072 /*----------------------------------------------------------------------------
4073 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4074 | double-precision floating-point value, returning the result. After being
4075 | shifted into the proper positions, the three fields are simply added
4076 | together to form the result. This means that any integer portion of `zSig'
4077 | will be added into the exponent. Since a properly normalized significand
4078 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4079 | than the desired result exponent whenever `zSig' is a complete, normalized
4081 *----------------------------------------------------------------------------*/
4083 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4086 return make_float64(
4087 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4091 /*----------------------------------------------------------------------------
4092 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4093 | and significand `zSig', and returns the proper double-precision floating-
4094 | point value corresponding to the abstract input. Ordinarily, the abstract
4095 | value is simply rounded and packed into the double-precision format, with
4096 | the inexact exception raised if the abstract input cannot be represented
4097 | exactly. However, if the abstract value is too large, the overflow and
4098 | inexact exceptions are raised and an infinity or maximal finite value is
4099 | returned. If the abstract value is too small, the input value is rounded to
4100 | a subnormal number, and the underflow and inexact exceptions are raised if
4101 | the abstract input cannot be represented exactly as a subnormal double-
4102 | precision floating-point number.
4103 | The input significand `zSig' has its binary point between bits 62
4104 | and 61, which is 10 bits to the left of the usual location. This shifted
4105 | significand must be normalized or smaller. If `zSig' is not normalized,
4106 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4107 | and it must not require rounding. In the usual case that `zSig' is
4108 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4109 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4110 | Binary Floating-Point Arithmetic.
4111 *----------------------------------------------------------------------------*/
4113 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4114 float_status
*status
)
4116 int8_t roundingMode
;
4117 bool roundNearestEven
;
4118 int roundIncrement
, roundBits
;
4121 roundingMode
= status
->float_rounding_mode
;
4122 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4123 switch (roundingMode
) {
4124 case float_round_nearest_even
:
4125 case float_round_ties_away
:
4126 roundIncrement
= 0x200;
4128 case float_round_to_zero
:
4131 case float_round_up
:
4132 roundIncrement
= zSign
? 0 : 0x3ff;
4134 case float_round_down
:
4135 roundIncrement
= zSign
? 0x3ff : 0;
4137 case float_round_to_odd
:
4138 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4143 roundBits
= zSig
& 0x3FF;
4144 if ( 0x7FD <= (uint16_t) zExp
) {
4145 if ( ( 0x7FD < zExp
)
4146 || ( ( zExp
== 0x7FD )
4147 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4149 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4150 roundIncrement
!= 0;
4151 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4152 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4155 if (status
->flush_to_zero
) {
4156 float_raise(float_flag_output_denormal
, status
);
4157 return packFloat64(zSign
, 0, 0);
4159 isTiny
= status
->tininess_before_rounding
4161 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4162 shift64RightJamming( zSig
, - zExp
, &zSig
);
4164 roundBits
= zSig
& 0x3FF;
4165 if (isTiny
&& roundBits
) {
4166 float_raise(float_flag_underflow
, status
);
4168 if (roundingMode
== float_round_to_odd
) {
4170 * For round-to-odd case, the roundIncrement depends on
4171 * zSig which just changed.
4173 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4178 float_raise(float_flag_inexact
, status
);
4180 zSig
= ( zSig
+ roundIncrement
)>>10;
4181 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4184 if ( zSig
== 0 ) zExp
= 0;
4185 return packFloat64( zSign
, zExp
, zSig
);
4189 /*----------------------------------------------------------------------------
4190 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4191 | and significand `zSig', and returns the proper double-precision floating-
4192 | point value corresponding to the abstract input. This routine is just like
4193 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4194 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4195 | floating-point exponent.
4196 *----------------------------------------------------------------------------*/
4199 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4200 float_status
*status
)
4204 shiftCount
= clz64(zSig
) - 1;
4205 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4210 /*----------------------------------------------------------------------------
4211 | Normalizes the subnormal extended double-precision floating-point value
4212 | represented by the denormalized significand `aSig'. The normalized exponent
4213 | and significand are stored at the locations pointed to by `zExpPtr' and
4214 | `zSigPtr', respectively.
4215 *----------------------------------------------------------------------------*/
4217 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4222 shiftCount
= clz64(aSig
);
4223 *zSigPtr
= aSig
<<shiftCount
;
4224 *zExpPtr
= 1 - shiftCount
;
4227 /*----------------------------------------------------------------------------
4228 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4229 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4230 | and returns the proper extended double-precision floating-point value
4231 | corresponding to the abstract input. Ordinarily, the abstract value is
4232 | rounded and packed into the extended double-precision format, with the
4233 | inexact exception raised if the abstract input cannot be represented
4234 | exactly. However, if the abstract value is too large, the overflow and
4235 | inexact exceptions are raised and an infinity or maximal finite value is
4236 | returned. If the abstract value is too small, the input value is rounded to
4237 | a subnormal number, and the underflow and inexact exceptions are raised if
4238 | the abstract input cannot be represented exactly as a subnormal extended
4239 | double-precision floating-point number.
4240 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4241 | number of bits as single or double precision, respectively. Otherwise, the
4242 | result is rounded to the full precision of the extended double-precision
4244 | The input significand must be normalized or smaller. If the input
4245 | significand is not normalized, `zExp' must be 0; in that case, the result
4246 | returned is a subnormal number, and it must not require rounding. The
4247 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4251 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4252 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4253 float_status
*status
)
4255 int8_t roundingMode
;
4256 bool roundNearestEven
, increment
, isTiny
;
4257 int64_t roundIncrement
, roundMask
, roundBits
;
4259 roundingMode
= status
->float_rounding_mode
;
4260 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4261 if ( roundingPrecision
== 80 ) goto precision80
;
4262 if ( roundingPrecision
== 64 ) {
4263 roundIncrement
= UINT64_C(0x0000000000000400);
4264 roundMask
= UINT64_C(0x00000000000007FF);
4266 else if ( roundingPrecision
== 32 ) {
4267 roundIncrement
= UINT64_C(0x0000008000000000);
4268 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4273 zSig0
|= ( zSig1
!= 0 );
4274 switch (roundingMode
) {
4275 case float_round_nearest_even
:
4276 case float_round_ties_away
:
4278 case float_round_to_zero
:
4281 case float_round_up
:
4282 roundIncrement
= zSign
? 0 : roundMask
;
4284 case float_round_down
:
4285 roundIncrement
= zSign
? roundMask
: 0;
4290 roundBits
= zSig0
& roundMask
;
4291 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4292 if ( ( 0x7FFE < zExp
)
4293 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4298 if (status
->flush_to_zero
) {
4299 float_raise(float_flag_output_denormal
, status
);
4300 return packFloatx80(zSign
, 0, 0);
4302 isTiny
= status
->tininess_before_rounding
4304 || (zSig0
<= zSig0
+ roundIncrement
);
4305 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4307 roundBits
= zSig0
& roundMask
;
4308 if (isTiny
&& roundBits
) {
4309 float_raise(float_flag_underflow
, status
);
4312 float_raise(float_flag_inexact
, status
);
4314 zSig0
+= roundIncrement
;
4315 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4316 roundIncrement
= roundMask
+ 1;
4317 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4318 roundMask
|= roundIncrement
;
4320 zSig0
&= ~ roundMask
;
4321 return packFloatx80( zSign
, zExp
, zSig0
);
4325 float_raise(float_flag_inexact
, status
);
4327 zSig0
+= roundIncrement
;
4328 if ( zSig0
< roundIncrement
) {
4330 zSig0
= UINT64_C(0x8000000000000000);
4332 roundIncrement
= roundMask
+ 1;
4333 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4334 roundMask
|= roundIncrement
;
4336 zSig0
&= ~ roundMask
;
4337 if ( zSig0
== 0 ) zExp
= 0;
4338 return packFloatx80( zSign
, zExp
, zSig0
);
4340 switch (roundingMode
) {
4341 case float_round_nearest_even
:
4342 case float_round_ties_away
:
4343 increment
= ((int64_t)zSig1
< 0);
4345 case float_round_to_zero
:
4348 case float_round_up
:
4349 increment
= !zSign
&& zSig1
;
4351 case float_round_down
:
4352 increment
= zSign
&& zSig1
;
4357 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4358 if ( ( 0x7FFE < zExp
)
4359 || ( ( zExp
== 0x7FFE )
4360 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4366 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4367 if ( ( roundingMode
== float_round_to_zero
)
4368 || ( zSign
&& ( roundingMode
== float_round_up
) )
4369 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4371 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4373 return packFloatx80(zSign
,
4374 floatx80_infinity_high
,
4375 floatx80_infinity_low
);
4378 isTiny
= status
->tininess_before_rounding
4381 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4382 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4384 if (isTiny
&& zSig1
) {
4385 float_raise(float_flag_underflow
, status
);
4388 float_raise(float_flag_inexact
, status
);
4390 switch (roundingMode
) {
4391 case float_round_nearest_even
:
4392 case float_round_ties_away
:
4393 increment
= ((int64_t)zSig1
< 0);
4395 case float_round_to_zero
:
4398 case float_round_up
:
4399 increment
= !zSign
&& zSig1
;
4401 case float_round_down
:
4402 increment
= zSign
&& zSig1
;
4409 if (!(zSig1
<< 1) && roundNearestEven
) {
4412 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4414 return packFloatx80( zSign
, zExp
, zSig0
);
4418 float_raise(float_flag_inexact
, status
);
4424 zSig0
= UINT64_C(0x8000000000000000);
4427 if (!(zSig1
<< 1) && roundNearestEven
) {
4433 if ( zSig0
== 0 ) zExp
= 0;
4435 return packFloatx80( zSign
, zExp
, zSig0
);
4439 /*----------------------------------------------------------------------------
4440 | Takes an abstract floating-point value having sign `zSign', exponent
4441 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4442 | and returns the proper extended double-precision floating-point value
4443 | corresponding to the abstract input. This routine is just like
4444 | `roundAndPackFloatx80' except that the input significand does not have to be
4446 *----------------------------------------------------------------------------*/
4448 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4449 bool zSign
, int32_t zExp
,
4450 uint64_t zSig0
, uint64_t zSig1
,
4451 float_status
*status
)
4460 shiftCount
= clz64(zSig0
);
4461 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4463 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4464 zSig0
, zSig1
, status
);
4468 /*----------------------------------------------------------------------------
4469 | Returns the least-significant 64 fraction bits of the quadruple-precision
4470 | floating-point value `a'.
4471 *----------------------------------------------------------------------------*/
4473 static inline uint64_t extractFloat128Frac1( float128 a
)
4480 /*----------------------------------------------------------------------------
4481 | Returns the most-significant 48 fraction bits of the quadruple-precision
4482 | floating-point value `a'.
4483 *----------------------------------------------------------------------------*/
4485 static inline uint64_t extractFloat128Frac0( float128 a
)
4488 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4492 /*----------------------------------------------------------------------------
4493 | Returns the exponent bits of the quadruple-precision floating-point value
4495 *----------------------------------------------------------------------------*/
4497 static inline int32_t extractFloat128Exp( float128 a
)
4500 return ( a
.high
>>48 ) & 0x7FFF;
4504 /*----------------------------------------------------------------------------
4505 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4506 *----------------------------------------------------------------------------*/
4508 static inline bool extractFloat128Sign(float128 a
)
4510 return a
.high
>> 63;
4513 /*----------------------------------------------------------------------------
4514 | Normalizes the subnormal quadruple-precision floating-point value
4515 | represented by the denormalized significand formed by the concatenation of
4516 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4517 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4518 | significand are stored at the location pointed to by `zSig0Ptr', and the
4519 | least significant 64 bits of the normalized significand are stored at the
4520 | location pointed to by `zSig1Ptr'.
4521 *----------------------------------------------------------------------------*/
4524 normalizeFloat128Subnormal(
4535 shiftCount
= clz64(aSig1
) - 15;
4536 if ( shiftCount
< 0 ) {
4537 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4538 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4541 *zSig0Ptr
= aSig1
<<shiftCount
;
4544 *zExpPtr
= - shiftCount
- 63;
4547 shiftCount
= clz64(aSig0
) - 15;
4548 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4549 *zExpPtr
= 1 - shiftCount
;
4554 /*----------------------------------------------------------------------------
4555 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4556 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4557 | floating-point value, returning the result. After being shifted into the
4558 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4559 | added together to form the most significant 32 bits of the result. This
4560 | means that any integer portion of `zSig0' will be added into the exponent.
4561 | Since a properly normalized significand will have an integer portion equal
4562 | to 1, the `zExp' input should be 1 less than the desired result exponent
4563 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4565 *----------------------------------------------------------------------------*/
4567 static inline float128
4568 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4573 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4577 /*----------------------------------------------------------------------------
4578 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4579 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4580 | and `zSig2', and returns the proper quadruple-precision floating-point value
4581 | corresponding to the abstract input. Ordinarily, the abstract value is
4582 | simply rounded and packed into the quadruple-precision format, with the
4583 | inexact exception raised if the abstract input cannot be represented
4584 | exactly. However, if the abstract value is too large, the overflow and
4585 | inexact exceptions are raised and an infinity or maximal finite value is
4586 | returned. If the abstract value is too small, the input value is rounded to
4587 | a subnormal number, and the underflow and inexact exceptions are raised if
4588 | the abstract input cannot be represented exactly as a subnormal quadruple-
4589 | precision floating-point number.
4590 | The input significand must be normalized or smaller. If the input
4591 | significand is not normalized, `zExp' must be 0; in that case, the result
4592 | returned is a subnormal number, and it must not require rounding. In the
4593 | usual case that the input significand is normalized, `zExp' must be 1 less
4594 | than the ``true'' floating-point exponent. The handling of underflow and
4595 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4598 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4599 uint64_t zSig0
, uint64_t zSig1
,
4600 uint64_t zSig2
, float_status
*status
)
4602 int8_t roundingMode
;
4603 bool roundNearestEven
, increment
, isTiny
;
4605 roundingMode
= status
->float_rounding_mode
;
4606 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4607 switch (roundingMode
) {
4608 case float_round_nearest_even
:
4609 case float_round_ties_away
:
4610 increment
= ((int64_t)zSig2
< 0);
4612 case float_round_to_zero
:
4615 case float_round_up
:
4616 increment
= !zSign
&& zSig2
;
4618 case float_round_down
:
4619 increment
= zSign
&& zSig2
;
4621 case float_round_to_odd
:
4622 increment
= !(zSig1
& 0x1) && zSig2
;
4627 if ( 0x7FFD <= (uint32_t) zExp
) {
4628 if ( ( 0x7FFD < zExp
)
4629 || ( ( zExp
== 0x7FFD )
4631 UINT64_C(0x0001FFFFFFFFFFFF),
4632 UINT64_C(0xFFFFFFFFFFFFFFFF),
4639 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4640 if ( ( roundingMode
== float_round_to_zero
)
4641 || ( zSign
&& ( roundingMode
== float_round_up
) )
4642 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4643 || (roundingMode
== float_round_to_odd
)
4649 UINT64_C(0x0000FFFFFFFFFFFF),
4650 UINT64_C(0xFFFFFFFFFFFFFFFF)
4653 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4656 if (status
->flush_to_zero
) {
4657 float_raise(float_flag_output_denormal
, status
);
4658 return packFloat128(zSign
, 0, 0, 0);
4660 isTiny
= status
->tininess_before_rounding
4663 || lt128(zSig0
, zSig1
,
4664 UINT64_C(0x0001FFFFFFFFFFFF),
4665 UINT64_C(0xFFFFFFFFFFFFFFFF));
4666 shift128ExtraRightJamming(
4667 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4669 if (isTiny
&& zSig2
) {
4670 float_raise(float_flag_underflow
, status
);
4672 switch (roundingMode
) {
4673 case float_round_nearest_even
:
4674 case float_round_ties_away
:
4675 increment
= ((int64_t)zSig2
< 0);
4677 case float_round_to_zero
:
4680 case float_round_up
:
4681 increment
= !zSign
&& zSig2
;
4683 case float_round_down
:
4684 increment
= zSign
&& zSig2
;
4686 case float_round_to_odd
:
4687 increment
= !(zSig1
& 0x1) && zSig2
;
4695 float_raise(float_flag_inexact
, status
);
4698 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4699 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4704 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4706 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4710 /*----------------------------------------------------------------------------
4711 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4712 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4713 | returns the proper quadruple-precision floating-point value corresponding
4714 | to the abstract input. This routine is just like `roundAndPackFloat128'
4715 | except that the input significand has fewer bits and does not have to be
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent. The handling of underflow and overflow follows the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
4720 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4721 uint64_t zSig0
, uint64_t zSig1
,
4722 float_status
*status
)
4732 shiftCount
= clz64(zSig0
) - 15;
4733 if ( 0 <= shiftCount
) {
4735 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4738 shift128ExtraRightJamming(
4739 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4742 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of converting the 32-bit two's complement integer `a'
4749 | to the extended double-precision floating-point format. The conversion
4750 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4752 *----------------------------------------------------------------------------*/
4754 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4761 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4763 absA
= zSign
? - a
: a
;
4764 shiftCount
= clz32(absA
) + 32;
4766 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the 32-bit two's complement integer `a' to
4772 | the quadruple-precision floating-point format. The conversion is performed
4773 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4774 *----------------------------------------------------------------------------*/
4776 float128
int32_to_float128(int32_t a
, float_status
*status
)
4783 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4785 absA
= zSign
? - a
: a
;
4786 shiftCount
= clz32(absA
) + 17;
4788 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of converting the 64-bit two's complement integer `a'
4794 | to the extended double-precision floating-point format. The conversion
4795 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4797 *----------------------------------------------------------------------------*/
4799 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4805 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4807 absA
= zSign
? - a
: a
;
4808 shiftCount
= clz64(absA
);
4809 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the 64-bit two's complement integer `a' to
4815 | the quadruple-precision floating-point format. The conversion is performed
4816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4819 float128
int64_to_float128(int64_t a
, float_status
*status
)
4825 uint64_t zSig0
, zSig1
;
4827 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4829 absA
= zSign
? - a
: a
;
4830 shiftCount
= clz64(absA
) + 49;
4831 zExp
= 0x406E - shiftCount
;
4832 if ( 64 <= shiftCount
) {
4841 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4842 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4846 /*----------------------------------------------------------------------------
4847 | Returns the result of converting the 64-bit unsigned integer `a'
4848 | to the quadruple-precision floating-point format. The conversion is performed
4849 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4850 *----------------------------------------------------------------------------*/
4852 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4855 return float128_zero
;
4857 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4860 /*----------------------------------------------------------------------------
4861 | Returns the result of converting the single-precision floating-point value
4862 | `a' to the extended double-precision floating-point format. The conversion
4863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4865 *----------------------------------------------------------------------------*/
4867 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4873 a
= float32_squash_input_denormal(a
, status
);
4874 aSig
= extractFloat32Frac( a
);
4875 aExp
= extractFloat32Exp( a
);
4876 aSign
= extractFloat32Sign( a
);
4877 if ( aExp
== 0xFF ) {
4879 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4881 return floatx80_silence_nan(res
, status
);
4883 return packFloatx80(aSign
,
4884 floatx80_infinity_high
,
4885 floatx80_infinity_low
);
4888 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4889 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4892 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the single-precision floating-point value
4898 | `a' to the double-precision floating-point format. The conversion is
4899 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4901 *----------------------------------------------------------------------------*/
4903 float128
float32_to_float128(float32 a
, float_status
*status
)
4909 a
= float32_squash_input_denormal(a
, status
);
4910 aSig
= extractFloat32Frac( a
);
4911 aExp
= extractFloat32Exp( a
);
4912 aSign
= extractFloat32Sign( a
);
4913 if ( aExp
== 0xFF ) {
4915 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4917 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4920 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4921 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4924 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4928 /*----------------------------------------------------------------------------
4929 | Returns the remainder of the single-precision floating-point value `a'
4930 | with respect to the corresponding value `b'. The operation is performed
4931 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4934 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4937 int aExp
, bExp
, expDiff
;
4938 uint32_t aSig
, bSig
;
4940 uint64_t aSig64
, bSig64
, q64
;
4941 uint32_t alternateASig
;
4943 a
= float32_squash_input_denormal(a
, status
);
4944 b
= float32_squash_input_denormal(b
, status
);
4946 aSig
= extractFloat32Frac( a
);
4947 aExp
= extractFloat32Exp( a
);
4948 aSign
= extractFloat32Sign( a
);
4949 bSig
= extractFloat32Frac( b
);
4950 bExp
= extractFloat32Exp( b
);
4951 if ( aExp
== 0xFF ) {
4952 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4953 return propagateFloat32NaN(a
, b
, status
);
4955 float_raise(float_flag_invalid
, status
);
4956 return float32_default_nan(status
);
4958 if ( bExp
== 0xFF ) {
4960 return propagateFloat32NaN(a
, b
, status
);
4966 float_raise(float_flag_invalid
, status
);
4967 return float32_default_nan(status
);
4969 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4972 if ( aSig
== 0 ) return a
;
4973 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4975 expDiff
= aExp
- bExp
;
4978 if ( expDiff
< 32 ) {
4981 if ( expDiff
< 0 ) {
4982 if ( expDiff
< -1 ) return a
;
4985 q
= ( bSig
<= aSig
);
4986 if ( q
) aSig
-= bSig
;
4987 if ( 0 < expDiff
) {
4988 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4991 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4999 if ( bSig
<= aSig
) aSig
-= bSig
;
5000 aSig64
= ( (uint64_t) aSig
)<<40;
5001 bSig64
= ( (uint64_t) bSig
)<<40;
5003 while ( 0 < expDiff
) {
5004 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5005 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5006 aSig64
= - ( ( bSig
* q64
)<<38 );
5010 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5011 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5012 q
= q64
>>( 64 - expDiff
);
5014 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5017 alternateASig
= aSig
;
5020 } while ( 0 <= (int32_t) aSig
);
5021 sigMean
= aSig
+ alternateASig
;
5022 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5023 aSig
= alternateASig
;
5025 zSign
= ( (int32_t) aSig
< 0 );
5026 if ( zSign
) aSig
= - aSig
;
5027 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5032 /*----------------------------------------------------------------------------
5033 | Returns the binary exponential of the single-precision floating-point value
5034 | `a'. The operation is performed according to the IEC/IEEE Standard for
5035 | Binary Floating-Point Arithmetic.
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
5050 static const float64 float32_exp2_coefficients
[15] =
5052 const_float64( 0x3ff0000000000000ll
), /* 1 */
5053 const_float64( 0x3fe0000000000000ll
), /* 2 */
5054 const_float64( 0x3fc5555555555555ll
), /* 3 */
5055 const_float64( 0x3fa5555555555555ll
), /* 4 */
5056 const_float64( 0x3f81111111111111ll
), /* 5 */
5057 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5058 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5059 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5060 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5061 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5062 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5063 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5064 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5065 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5066 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5069 float32
float32_exp2(float32 a
, float_status
*status
)
5076 a
= float32_squash_input_denormal(a
, status
);
5078 aSig
= extractFloat32Frac( a
);
5079 aExp
= extractFloat32Exp( a
);
5080 aSign
= extractFloat32Sign( a
);
5082 if ( aExp
== 0xFF) {
5084 return propagateFloat32NaN(a
, float32_zero
, status
);
5086 return (aSign
) ? float32_zero
: a
;
5089 if (aSig
== 0) return float32_one
;
5092 float_raise(float_flag_inexact
, status
);
5094 /* ******************************* */
5095 /* using float64 for approximation */
5096 /* ******************************* */
5097 x
= float32_to_float64(a
, status
);
5098 x
= float64_mul(x
, float64_ln2
, status
);
5102 for (i
= 0 ; i
< 15 ; i
++) {
5105 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5106 r
= float64_add(r
, f
, status
);
5108 xn
= float64_mul(xn
, x
, status
);
5111 return float64_to_float32(r
, status
);
5114 /*----------------------------------------------------------------------------
5115 | Returns the binary log of the single-precision floating-point value `a'.
5116 | The operation is performed according to the IEC/IEEE Standard for Binary
5117 | Floating-Point Arithmetic.
5118 *----------------------------------------------------------------------------*/
5119 float32
float32_log2(float32 a
, float_status
*status
)
5123 uint32_t aSig
, zSig
, i
;
5125 a
= float32_squash_input_denormal(a
, status
);
5126 aSig
= extractFloat32Frac( a
);
5127 aExp
= extractFloat32Exp( a
);
5128 aSign
= extractFloat32Sign( a
);
5131 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5132 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5135 float_raise(float_flag_invalid
, status
);
5136 return float32_default_nan(status
);
5138 if ( aExp
== 0xFF ) {
5140 return propagateFloat32NaN(a
, float32_zero
, status
);
5150 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5151 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5152 if ( aSig
& 0x01000000 ) {
5161 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5164 /*----------------------------------------------------------------------------
5165 | Returns the result of converting the double-precision floating-point value
5166 | `a' to the extended double-precision floating-point format. The conversion
5167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5169 *----------------------------------------------------------------------------*/
5171 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5177 a
= float64_squash_input_denormal(a
, status
);
5178 aSig
= extractFloat64Frac( a
);
5179 aExp
= extractFloat64Exp( a
);
5180 aSign
= extractFloat64Sign( a
);
5181 if ( aExp
== 0x7FF ) {
5183 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5185 return floatx80_silence_nan(res
, status
);
5187 return packFloatx80(aSign
,
5188 floatx80_infinity_high
,
5189 floatx80_infinity_low
);
5192 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5193 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5197 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5201 /*----------------------------------------------------------------------------
5202 | Returns the result of converting the double-precision floating-point value
5203 | `a' to the quadruple-precision floating-point format. The conversion is
5204 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5206 *----------------------------------------------------------------------------*/
5208 float128
float64_to_float128(float64 a
, float_status
*status
)
5212 uint64_t aSig
, zSig0
, zSig1
;
5214 a
= float64_squash_input_denormal(a
, status
);
5215 aSig
= extractFloat64Frac( a
);
5216 aExp
= extractFloat64Exp( a
);
5217 aSign
= extractFloat64Sign( a
);
5218 if ( aExp
== 0x7FF ) {
5220 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5222 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5225 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5226 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5229 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5230 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5235 /*----------------------------------------------------------------------------
5236 | Returns the remainder of the double-precision floating-point value `a'
5237 | with respect to the corresponding value `b'. The operation is performed
5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 *----------------------------------------------------------------------------*/
5241 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5244 int aExp
, bExp
, expDiff
;
5245 uint64_t aSig
, bSig
;
5246 uint64_t q
, alternateASig
;
5249 a
= float64_squash_input_denormal(a
, status
);
5250 b
= float64_squash_input_denormal(b
, status
);
5251 aSig
= extractFloat64Frac( a
);
5252 aExp
= extractFloat64Exp( a
);
5253 aSign
= extractFloat64Sign( a
);
5254 bSig
= extractFloat64Frac( b
);
5255 bExp
= extractFloat64Exp( b
);
5256 if ( aExp
== 0x7FF ) {
5257 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5258 return propagateFloat64NaN(a
, b
, status
);
5260 float_raise(float_flag_invalid
, status
);
5261 return float64_default_nan(status
);
5263 if ( bExp
== 0x7FF ) {
5265 return propagateFloat64NaN(a
, b
, status
);
5271 float_raise(float_flag_invalid
, status
);
5272 return float64_default_nan(status
);
5274 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5277 if ( aSig
== 0 ) return a
;
5278 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5280 expDiff
= aExp
- bExp
;
5281 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5282 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5283 if ( expDiff
< 0 ) {
5284 if ( expDiff
< -1 ) return a
;
5287 q
= ( bSig
<= aSig
);
5288 if ( q
) aSig
-= bSig
;
5290 while ( 0 < expDiff
) {
5291 q
= estimateDiv128To64( aSig
, 0, bSig
);
5292 q
= ( 2 < q
) ? q
- 2 : 0;
5293 aSig
= - ( ( bSig
>>2 ) * q
);
5297 if ( 0 < expDiff
) {
5298 q
= estimateDiv128To64( aSig
, 0, bSig
);
5299 q
= ( 2 < q
) ? q
- 2 : 0;
5302 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5309 alternateASig
= aSig
;
5312 } while ( 0 <= (int64_t) aSig
);
5313 sigMean
= aSig
+ alternateASig
;
5314 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5315 aSig
= alternateASig
;
5317 zSign
= ( (int64_t) aSig
< 0 );
5318 if ( zSign
) aSig
= - aSig
;
5319 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5323 /*----------------------------------------------------------------------------
5324 | Returns the binary log of the double-precision floating-point value `a'.
5325 | The operation is performed according to the IEC/IEEE Standard for Binary
5326 | Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328 float64
float64_log2(float64 a
, float_status
*status
)
5332 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5333 a
= float64_squash_input_denormal(a
, status
);
5335 aSig
= extractFloat64Frac( a
);
5336 aExp
= extractFloat64Exp( a
);
5337 aSign
= extractFloat64Sign( a
);
5340 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5341 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5344 float_raise(float_flag_invalid
, status
);
5345 return float64_default_nan(status
);
5347 if ( aExp
== 0x7FF ) {
5349 return propagateFloat64NaN(a
, float64_zero
, status
);
5355 aSig
|= UINT64_C(0x0010000000000000);
5357 zSig
= (uint64_t)aExp
<< 52;
5358 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5359 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5360 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5361 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5369 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5372 /*----------------------------------------------------------------------------
5373 | Returns the result of converting the extended double-precision floating-
5374 | point value `a' to the 32-bit two's complement integer format. The
5375 | conversion is performed according to the IEC/IEEE Standard for Binary
5376 | Floating-Point Arithmetic---which means in particular that the conversion
5377 | is rounded according to the current rounding mode. If `a' is a NaN, the
5378 | largest positive integer is returned. Otherwise, if the conversion
5379 | overflows, the largest integer with the same sign as `a' is returned.
5380 *----------------------------------------------------------------------------*/
5382 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5385 int32_t aExp
, shiftCount
;
5388 if (floatx80_invalid_encoding(a
)) {
5389 float_raise(float_flag_invalid
, status
);
5392 aSig
= extractFloatx80Frac( a
);
5393 aExp
= extractFloatx80Exp( a
);
5394 aSign
= extractFloatx80Sign( a
);
5395 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5396 shiftCount
= 0x4037 - aExp
;
5397 if ( shiftCount
<= 0 ) shiftCount
= 1;
5398 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5399 return roundAndPackInt32(aSign
, aSig
, status
);
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of converting the extended double-precision floating-
5405 | point value `a' to the 32-bit two's complement integer format. The
5406 | conversion is performed according to the IEC/IEEE Standard for Binary
5407 | Floating-Point Arithmetic, except that the conversion is always rounded
5408 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5409 | Otherwise, if the conversion overflows, the largest integer with the same
5410 | sign as `a' is returned.
5411 *----------------------------------------------------------------------------*/
5413 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5416 int32_t aExp
, shiftCount
;
5417 uint64_t aSig
, savedASig
;
5420 if (floatx80_invalid_encoding(a
)) {
5421 float_raise(float_flag_invalid
, status
);
5424 aSig
= extractFloatx80Frac( a
);
5425 aExp
= extractFloatx80Exp( a
);
5426 aSign
= extractFloatx80Sign( a
);
5427 if ( 0x401E < aExp
) {
5428 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5431 else if ( aExp
< 0x3FFF ) {
5433 float_raise(float_flag_inexact
, status
);
5437 shiftCount
= 0x403E - aExp
;
5439 aSig
>>= shiftCount
;
5441 if ( aSign
) z
= - z
;
5442 if ( ( z
< 0 ) ^ aSign
) {
5444 float_raise(float_flag_invalid
, status
);
5445 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5447 if ( ( aSig
<<shiftCount
) != savedASig
) {
5448 float_raise(float_flag_inexact
, status
);
5454 /*----------------------------------------------------------------------------
5455 | Returns the result of converting the extended double-precision floating-
5456 | point value `a' to the 64-bit two's complement integer format. The
5457 | conversion is performed according to the IEC/IEEE Standard for Binary
5458 | Floating-Point Arithmetic---which means in particular that the conversion
5459 | is rounded according to the current rounding mode. If `a' is a NaN,
5460 | the largest positive integer is returned. Otherwise, if the conversion
5461 | overflows, the largest integer with the same sign as `a' is returned.
5462 *----------------------------------------------------------------------------*/
5464 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5467 int32_t aExp
, shiftCount
;
5468 uint64_t aSig
, aSigExtra
;
5470 if (floatx80_invalid_encoding(a
)) {
5471 float_raise(float_flag_invalid
, status
);
5474 aSig
= extractFloatx80Frac( a
);
5475 aExp
= extractFloatx80Exp( a
);
5476 aSign
= extractFloatx80Sign( a
);
5477 shiftCount
= 0x403E - aExp
;
5478 if ( shiftCount
<= 0 ) {
5480 float_raise(float_flag_invalid
, status
);
5481 if (!aSign
|| floatx80_is_any_nan(a
)) {
5489 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5491 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of converting the extended double-precision floating-
5497 | point value `a' to the 64-bit two's complement integer format. The
5498 | conversion is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic, except that the conversion is always rounded
5500 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5501 | Otherwise, if the conversion overflows, the largest integer with the same
5502 | sign as `a' is returned.
5503 *----------------------------------------------------------------------------*/
5505 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5508 int32_t aExp
, shiftCount
;
5512 if (floatx80_invalid_encoding(a
)) {
5513 float_raise(float_flag_invalid
, status
);
5516 aSig
= extractFloatx80Frac( a
);
5517 aExp
= extractFloatx80Exp( a
);
5518 aSign
= extractFloatx80Sign( a
);
5519 shiftCount
= aExp
- 0x403E;
5520 if ( 0 <= shiftCount
) {
5521 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5522 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5523 float_raise(float_flag_invalid
, status
);
5524 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5530 else if ( aExp
< 0x3FFF ) {
5532 float_raise(float_flag_inexact
, status
);
5536 z
= aSig
>>( - shiftCount
);
5537 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5538 float_raise(float_flag_inexact
, status
);
5540 if ( aSign
) z
= - z
;
5545 /*----------------------------------------------------------------------------
5546 | Returns the result of converting the extended double-precision floating-
5547 | point value `a' to the single-precision floating-point format. The
5548 | conversion is performed according to the IEC/IEEE Standard for Binary
5549 | Floating-Point Arithmetic.
5550 *----------------------------------------------------------------------------*/
5552 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5558 if (floatx80_invalid_encoding(a
)) {
5559 float_raise(float_flag_invalid
, status
);
5560 return float32_default_nan(status
);
5562 aSig
= extractFloatx80Frac( a
);
5563 aExp
= extractFloatx80Exp( a
);
5564 aSign
= extractFloatx80Sign( a
);
5565 if ( aExp
== 0x7FFF ) {
5566 if ( (uint64_t) ( aSig
<<1 ) ) {
5567 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5569 return float32_silence_nan(res
, status
);
5571 return packFloat32( aSign
, 0xFF, 0 );
5573 shift64RightJamming( aSig
, 33, &aSig
);
5574 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5575 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5579 /*----------------------------------------------------------------------------
5580 | Returns the result of converting the extended double-precision floating-
5581 | point value `a' to the double-precision floating-point format. The
5582 | conversion is performed according to the IEC/IEEE Standard for Binary
5583 | Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5586 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5590 uint64_t aSig
, zSig
;
5592 if (floatx80_invalid_encoding(a
)) {
5593 float_raise(float_flag_invalid
, status
);
5594 return float64_default_nan(status
);
5596 aSig
= extractFloatx80Frac( a
);
5597 aExp
= extractFloatx80Exp( a
);
5598 aSign
= extractFloatx80Sign( a
);
5599 if ( aExp
== 0x7FFF ) {
5600 if ( (uint64_t) ( aSig
<<1 ) ) {
5601 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5603 return float64_silence_nan(res
, status
);
5605 return packFloat64( aSign
, 0x7FF, 0 );
5607 shift64RightJamming( aSig
, 1, &zSig
);
5608 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5609 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the extended double-precision floating-
5615 | point value `a' to the quadruple-precision floating-point format. The
5616 | conversion is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5620 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5624 uint64_t aSig
, zSig0
, zSig1
;
5626 if (floatx80_invalid_encoding(a
)) {
5627 float_raise(float_flag_invalid
, status
);
5628 return float128_default_nan(status
);
5630 aSig
= extractFloatx80Frac( a
);
5631 aExp
= extractFloatx80Exp( a
);
5632 aSign
= extractFloatx80Sign( a
);
5633 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5634 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5636 return float128_silence_nan(res
, status
);
5638 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5639 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5651 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5653 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5654 extractFloatx80Sign(a
),
5655 extractFloatx80Exp(a
),
5656 extractFloatx80Frac(a
), 0, status
);
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended quadruple-precision floating-point
5662 | value. The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5666 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5670 uint64_t lastBitMask
, roundBitsMask
;
5673 if (floatx80_invalid_encoding(a
)) {
5674 float_raise(float_flag_invalid
, status
);
5675 return floatx80_default_nan(status
);
5677 aExp
= extractFloatx80Exp( a
);
5678 if ( 0x403E <= aExp
) {
5679 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5680 return propagateFloatx80NaN(a
, a
, status
);
5684 if ( aExp
< 0x3FFF ) {
5686 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5689 float_raise(float_flag_inexact
, status
);
5690 aSign
= extractFloatx80Sign( a
);
5691 switch (status
->float_rounding_mode
) {
5692 case float_round_nearest_even
:
5693 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5696 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5699 case float_round_ties_away
:
5700 if (aExp
== 0x3FFE) {
5701 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5704 case float_round_down
:
5707 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5708 : packFloatx80( 0, 0, 0 );
5709 case float_round_up
:
5711 aSign
? packFloatx80( 1, 0, 0 )
5712 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5714 case float_round_to_zero
:
5717 g_assert_not_reached();
5719 return packFloatx80( aSign
, 0, 0 );
5722 lastBitMask
<<= 0x403E - aExp
;
5723 roundBitsMask
= lastBitMask
- 1;
5725 switch (status
->float_rounding_mode
) {
5726 case float_round_nearest_even
:
5727 z
.low
+= lastBitMask
>>1;
5728 if ((z
.low
& roundBitsMask
) == 0) {
5729 z
.low
&= ~lastBitMask
;
5732 case float_round_ties_away
:
5733 z
.low
+= lastBitMask
>> 1;
5735 case float_round_to_zero
:
5737 case float_round_up
:
5738 if (!extractFloatx80Sign(z
)) {
5739 z
.low
+= roundBitsMask
;
5742 case float_round_down
:
5743 if (extractFloatx80Sign(z
)) {
5744 z
.low
+= roundBitsMask
;
5750 z
.low
&= ~ roundBitsMask
;
5753 z
.low
= UINT64_C(0x8000000000000000);
5755 if (z
.low
!= a
.low
) {
5756 float_raise(float_flag_inexact
, status
);
5762 /*----------------------------------------------------------------------------
5763 | Returns the result of adding the absolute values of the extended double-
5764 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5765 | negated before being returned. `zSign' is ignored if the result is a NaN.
5766 | The addition is performed according to the IEC/IEEE Standard for Binary
5767 | Floating-Point Arithmetic.
5768 *----------------------------------------------------------------------------*/
5770 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5771 float_status
*status
)
5773 int32_t aExp
, bExp
, zExp
;
5774 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5777 aSig
= extractFloatx80Frac( a
);
5778 aExp
= extractFloatx80Exp( a
);
5779 bSig
= extractFloatx80Frac( b
);
5780 bExp
= extractFloatx80Exp( b
);
5781 expDiff
= aExp
- bExp
;
5782 if ( 0 < expDiff
) {
5783 if ( aExp
== 0x7FFF ) {
5784 if ((uint64_t)(aSig
<< 1)) {
5785 return propagateFloatx80NaN(a
, b
, status
);
5789 if ( bExp
== 0 ) --expDiff
;
5790 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5793 else if ( expDiff
< 0 ) {
5794 if ( bExp
== 0x7FFF ) {
5795 if ((uint64_t)(bSig
<< 1)) {
5796 return propagateFloatx80NaN(a
, b
, status
);
5798 return packFloatx80(zSign
,
5799 floatx80_infinity_high
,
5800 floatx80_infinity_low
);
5802 if ( aExp
== 0 ) ++expDiff
;
5803 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5807 if ( aExp
== 0x7FFF ) {
5808 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5809 return propagateFloatx80NaN(a
, b
, status
);
5814 zSig0
= aSig
+ bSig
;
5816 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5817 /* At least one of the values is a pseudo-denormal,
5818 * and there is a carry out of the result. */
5823 return packFloatx80(zSign
, 0, 0);
5825 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5831 zSig0
= aSig
+ bSig
;
5832 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5834 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5835 zSig0
|= UINT64_C(0x8000000000000000);
5838 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5839 zSign
, zExp
, zSig0
, zSig1
, status
);
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of subtracting the absolute values of the extended
5844 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5845 | difference is negated before being returned. `zSign' is ignored if the
5846 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5847 | Standard for Binary Floating-Point Arithmetic.
5848 *----------------------------------------------------------------------------*/
5850 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5851 float_status
*status
)
5853 int32_t aExp
, bExp
, zExp
;
5854 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5857 aSig
= extractFloatx80Frac( a
);
5858 aExp
= extractFloatx80Exp( a
);
5859 bSig
= extractFloatx80Frac( b
);
5860 bExp
= extractFloatx80Exp( b
);
5861 expDiff
= aExp
- bExp
;
5862 if ( 0 < expDiff
) goto aExpBigger
;
5863 if ( expDiff
< 0 ) goto bExpBigger
;
5864 if ( aExp
== 0x7FFF ) {
5865 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5866 return propagateFloatx80NaN(a
, b
, status
);
5868 float_raise(float_flag_invalid
, status
);
5869 return floatx80_default_nan(status
);
5876 if ( bSig
< aSig
) goto aBigger
;
5877 if ( aSig
< bSig
) goto bBigger
;
5878 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5880 if ( bExp
== 0x7FFF ) {
5881 if ((uint64_t)(bSig
<< 1)) {
5882 return propagateFloatx80NaN(a
, b
, status
);
5884 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5885 floatx80_infinity_low
);
5887 if ( aExp
== 0 ) ++expDiff
;
5888 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5890 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5893 goto normalizeRoundAndPack
;
5895 if ( aExp
== 0x7FFF ) {
5896 if ((uint64_t)(aSig
<< 1)) {
5897 return propagateFloatx80NaN(a
, b
, status
);
5901 if ( bExp
== 0 ) --expDiff
;
5902 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5904 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5906 normalizeRoundAndPack
:
5907 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5908 zSign
, zExp
, zSig0
, zSig1
, status
);
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of adding the extended double-precision floating-point
5913 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5914 | Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5917 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5921 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5922 float_raise(float_flag_invalid
, status
);
5923 return floatx80_default_nan(status
);
5925 aSign
= extractFloatx80Sign( a
);
5926 bSign
= extractFloatx80Sign( b
);
5927 if ( aSign
== bSign
) {
5928 return addFloatx80Sigs(a
, b
, aSign
, status
);
5931 return subFloatx80Sigs(a
, b
, aSign
, status
);
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of subtracting the extended double-precision floating-
5938 | point values `a' and `b'. The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5942 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5946 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5947 float_raise(float_flag_invalid
, status
);
5948 return floatx80_default_nan(status
);
5950 aSign
= extractFloatx80Sign( a
);
5951 bSign
= extractFloatx80Sign( b
);
5952 if ( aSign
== bSign
) {
5953 return subFloatx80Sigs(a
, b
, aSign
, status
);
5956 return addFloatx80Sigs(a
, b
, aSign
, status
);
5961 /*----------------------------------------------------------------------------
5962 | Returns the result of multiplying the extended double-precision floating-
5963 | point values `a' and `b'. The operation is performed according to the
5964 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5965 *----------------------------------------------------------------------------*/
5967 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5969 bool aSign
, bSign
, zSign
;
5970 int32_t aExp
, bExp
, zExp
;
5971 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5973 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5974 float_raise(float_flag_invalid
, status
);
5975 return floatx80_default_nan(status
);
5977 aSig
= extractFloatx80Frac( a
);
5978 aExp
= extractFloatx80Exp( a
);
5979 aSign
= extractFloatx80Sign( a
);
5980 bSig
= extractFloatx80Frac( b
);
5981 bExp
= extractFloatx80Exp( b
);
5982 bSign
= extractFloatx80Sign( b
);
5983 zSign
= aSign
^ bSign
;
5984 if ( aExp
== 0x7FFF ) {
5985 if ( (uint64_t) ( aSig
<<1 )
5986 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5987 return propagateFloatx80NaN(a
, b
, status
);
5989 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5990 return packFloatx80(zSign
, floatx80_infinity_high
,
5991 floatx80_infinity_low
);
5993 if ( bExp
== 0x7FFF ) {
5994 if ((uint64_t)(bSig
<< 1)) {
5995 return propagateFloatx80NaN(a
, b
, status
);
5997 if ( ( aExp
| aSig
) == 0 ) {
5999 float_raise(float_flag_invalid
, status
);
6000 return floatx80_default_nan(status
);
6002 return packFloatx80(zSign
, floatx80_infinity_high
,
6003 floatx80_infinity_low
);
6006 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6007 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6010 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6011 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6013 zExp
= aExp
+ bExp
- 0x3FFE;
6014 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6015 if ( 0 < (int64_t) zSig0
) {
6016 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6019 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6020 zSign
, zExp
, zSig0
, zSig1
, status
);
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of dividing the extended double-precision floating-point
6025 | value `a' by the corresponding value `b'. The operation is performed
6026 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6027 *----------------------------------------------------------------------------*/
6029 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6031 bool aSign
, bSign
, zSign
;
6032 int32_t aExp
, bExp
, zExp
;
6033 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6034 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6036 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6037 float_raise(float_flag_invalid
, status
);
6038 return floatx80_default_nan(status
);
6040 aSig
= extractFloatx80Frac( a
);
6041 aExp
= extractFloatx80Exp( a
);
6042 aSign
= extractFloatx80Sign( a
);
6043 bSig
= extractFloatx80Frac( b
);
6044 bExp
= extractFloatx80Exp( b
);
6045 bSign
= extractFloatx80Sign( b
);
6046 zSign
= aSign
^ bSign
;
6047 if ( aExp
== 0x7FFF ) {
6048 if ((uint64_t)(aSig
<< 1)) {
6049 return propagateFloatx80NaN(a
, b
, status
);
6051 if ( bExp
== 0x7FFF ) {
6052 if ((uint64_t)(bSig
<< 1)) {
6053 return propagateFloatx80NaN(a
, b
, status
);
6057 return packFloatx80(zSign
, floatx80_infinity_high
,
6058 floatx80_infinity_low
);
6060 if ( bExp
== 0x7FFF ) {
6061 if ((uint64_t)(bSig
<< 1)) {
6062 return propagateFloatx80NaN(a
, b
, status
);
6064 return packFloatx80( zSign
, 0, 0 );
6068 if ( ( aExp
| aSig
) == 0 ) {
6070 float_raise(float_flag_invalid
, status
);
6071 return floatx80_default_nan(status
);
6073 float_raise(float_flag_divbyzero
, status
);
6074 return packFloatx80(zSign
, floatx80_infinity_high
,
6075 floatx80_infinity_low
);
6077 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6080 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6081 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6083 zExp
= aExp
- bExp
+ 0x3FFE;
6085 if ( bSig
<= aSig
) {
6086 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6089 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6090 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6091 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6092 while ( (int64_t) rem0
< 0 ) {
6094 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6096 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6097 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6098 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6099 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6100 while ( (int64_t) rem1
< 0 ) {
6102 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6104 zSig1
|= ( ( rem1
| rem2
) != 0 );
6106 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6107 zSign
, zExp
, zSig0
, zSig1
, status
);
6110 /*----------------------------------------------------------------------------
6111 | Returns the remainder of the extended double-precision floating-point value
6112 | `a' with respect to the corresponding value `b'. The operation is performed
6113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6114 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6115 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6116 | the absolute value of the integer quotient.
6117 *----------------------------------------------------------------------------*/
6119 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6120 float_status
*status
)
6123 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6124 uint64_t aSig0
, aSig1
, bSig
;
6125 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6128 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6129 float_raise(float_flag_invalid
, status
);
6130 return floatx80_default_nan(status
);
6132 aSig0
= extractFloatx80Frac( a
);
6133 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6134 aSign
= extractFloatx80Sign( a
);
6135 bSig
= extractFloatx80Frac( b
);
6136 bExp
= extractFloatx80Exp( b
);
6137 if ( aExp
== 0x7FFF ) {
6138 if ( (uint64_t) ( aSig0
<<1 )
6139 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6140 return propagateFloatx80NaN(a
, b
, status
);
6144 if ( bExp
== 0x7FFF ) {
6145 if ((uint64_t)(bSig
<< 1)) {
6146 return propagateFloatx80NaN(a
, b
, status
);
6148 if (aExp
== 0 && aSig0
>> 63) {
6150 * Pseudo-denormal argument must be returned in normalized
6153 return packFloatx80(aSign
, 1, aSig0
);
6160 float_raise(float_flag_invalid
, status
);
6161 return floatx80_default_nan(status
);
6163 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6166 if ( aSig0
== 0 ) return a
;
6167 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6170 expDiff
= aExp
- bExp
;
6172 if ( expDiff
< 0 ) {
6173 if ( mod
|| expDiff
< -1 ) {
6174 if (aExp
== 1 && aExpOrig
== 0) {
6176 * Pseudo-denormal argument must be returned in
6179 return packFloatx80(aSign
, aExp
, aSig0
);
6183 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6186 *quotient
= q
= ( bSig
<= aSig0
);
6187 if ( q
) aSig0
-= bSig
;
6189 while ( 0 < expDiff
) {
6190 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6191 q
= ( 2 < q
) ? q
- 2 : 0;
6192 mul64To128( bSig
, q
, &term0
, &term1
);
6193 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6194 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6200 if ( 0 < expDiff
) {
6201 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6202 q
= ( 2 < q
) ? q
- 2 : 0;
6204 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6205 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6206 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6207 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6209 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6212 *quotient
<<= expDiff
;
6223 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6224 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6225 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6228 aSig0
= alternateASig0
;
6229 aSig1
= alternateASig1
;
6235 normalizeRoundAndPackFloatx80(
6236 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6240 /*----------------------------------------------------------------------------
6241 | Returns the remainder of the extended double-precision floating-point value
6242 | `a' with respect to the corresponding value `b'. The operation is performed
6243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6244 *----------------------------------------------------------------------------*/
6246 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6249 return floatx80_modrem(a
, b
, false, "ient
, status
);
6252 /*----------------------------------------------------------------------------
6253 | Returns the remainder of the extended double-precision floating-point value
6254 | `a' with respect to the corresponding value `b', with the quotient truncated
6256 *----------------------------------------------------------------------------*/
6258 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6261 return floatx80_modrem(a
, b
, true, "ient
, status
);
6264 /*----------------------------------------------------------------------------
6265 | Returns the square root of the extended double-precision floating-point
6266 | value `a'. The operation is performed according to the IEC/IEEE Standard
6267 | for Binary Floating-Point Arithmetic.
6268 *----------------------------------------------------------------------------*/
6270 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6274 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6275 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6277 if (floatx80_invalid_encoding(a
)) {
6278 float_raise(float_flag_invalid
, status
);
6279 return floatx80_default_nan(status
);
6281 aSig0
= extractFloatx80Frac( a
);
6282 aExp
= extractFloatx80Exp( a
);
6283 aSign
= extractFloatx80Sign( a
);
6284 if ( aExp
== 0x7FFF ) {
6285 if ((uint64_t)(aSig0
<< 1)) {
6286 return propagateFloatx80NaN(a
, a
, status
);
6288 if ( ! aSign
) return a
;
6292 if ( ( aExp
| aSig0
) == 0 ) return a
;
6294 float_raise(float_flag_invalid
, status
);
6295 return floatx80_default_nan(status
);
6298 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6299 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6301 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6302 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6303 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6304 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6305 doubleZSig0
= zSig0
<<1;
6306 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6307 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6308 while ( (int64_t) rem0
< 0 ) {
6311 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6313 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6314 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6315 if ( zSig1
== 0 ) zSig1
= 1;
6316 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6317 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6318 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6319 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6320 while ( (int64_t) rem1
< 0 ) {
6322 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6324 term2
|= doubleZSig0
;
6325 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6327 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6329 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6330 zSig0
|= doubleZSig0
;
6331 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6332 0, zExp
, zSig0
, zSig1
, status
);
6335 /*----------------------------------------------------------------------------
6336 | Returns the result of converting the quadruple-precision floating-point
6337 | value `a' to the 32-bit two's complement integer format. The conversion
6338 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6339 | Arithmetic---which means in particular that the conversion is rounded
6340 | according to the current rounding mode. If `a' is a NaN, the largest
6341 | positive integer is returned. Otherwise, if the conversion overflows, the
6342 | largest integer with the same sign as `a' is returned.
6343 *----------------------------------------------------------------------------*/
6345 int32_t float128_to_int32(float128 a
, float_status
*status
)
6348 int32_t aExp
, shiftCount
;
6349 uint64_t aSig0
, aSig1
;
6351 aSig1
= extractFloat128Frac1( a
);
6352 aSig0
= extractFloat128Frac0( a
);
6353 aExp
= extractFloat128Exp( a
);
6354 aSign
= extractFloat128Sign( a
);
6355 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6356 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6357 aSig0
|= ( aSig1
!= 0 );
6358 shiftCount
= 0x4028 - aExp
;
6359 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6360 return roundAndPackInt32(aSign
, aSig0
, status
);
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of converting the quadruple-precision floating-point
6366 | value `a' to the 32-bit two's complement integer format. The conversion
6367 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6368 | Arithmetic, except that the conversion is always rounded toward zero. If
6369 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6370 | conversion overflows, the largest integer with the same sign as `a' is
6372 *----------------------------------------------------------------------------*/
6374 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6377 int32_t aExp
, shiftCount
;
6378 uint64_t aSig0
, aSig1
, savedASig
;
6381 aSig1
= extractFloat128Frac1( a
);
6382 aSig0
= extractFloat128Frac0( a
);
6383 aExp
= extractFloat128Exp( a
);
6384 aSign
= extractFloat128Sign( a
);
6385 aSig0
|= ( aSig1
!= 0 );
6386 if ( 0x401E < aExp
) {
6387 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6390 else if ( aExp
< 0x3FFF ) {
6391 if (aExp
|| aSig0
) {
6392 float_raise(float_flag_inexact
, status
);
6396 aSig0
|= UINT64_C(0x0001000000000000);
6397 shiftCount
= 0x402F - aExp
;
6399 aSig0
>>= shiftCount
;
6401 if ( aSign
) z
= - z
;
6402 if ( ( z
< 0 ) ^ aSign
) {
6404 float_raise(float_flag_invalid
, status
);
6405 return aSign
? INT32_MIN
: INT32_MAX
;
6407 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6408 float_raise(float_flag_inexact
, status
);
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of converting the quadruple-precision floating-point
6416 | value `a' to the 64-bit two's complement integer format. The conversion
6417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6418 | Arithmetic---which means in particular that the conversion is rounded
6419 | according to the current rounding mode. If `a' is a NaN, the largest
6420 | positive integer is returned. Otherwise, if the conversion overflows, the
6421 | largest integer with the same sign as `a' is returned.
6422 *----------------------------------------------------------------------------*/
6424 int64_t float128_to_int64(float128 a
, float_status
*status
)
6427 int32_t aExp
, shiftCount
;
6428 uint64_t aSig0
, aSig1
;
6430 aSig1
= extractFloat128Frac1( a
);
6431 aSig0
= extractFloat128Frac0( a
);
6432 aExp
= extractFloat128Exp( a
);
6433 aSign
= extractFloat128Sign( a
);
6434 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6435 shiftCount
= 0x402F - aExp
;
6436 if ( shiftCount
<= 0 ) {
6437 if ( 0x403E < aExp
) {
6438 float_raise(float_flag_invalid
, status
);
6440 || ( ( aExp
== 0x7FFF )
6441 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6448 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6451 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6453 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6457 /*----------------------------------------------------------------------------
6458 | Returns the result of converting the quadruple-precision floating-point
6459 | value `a' to the 64-bit two's complement integer format. The conversion
6460 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6461 | Arithmetic, except that the conversion is always rounded toward zero.
6462 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6463 | the conversion overflows, the largest integer with the same sign as `a' is
6465 *----------------------------------------------------------------------------*/
6467 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6470 int32_t aExp
, shiftCount
;
6471 uint64_t aSig0
, aSig1
;
6474 aSig1
= extractFloat128Frac1( a
);
6475 aSig0
= extractFloat128Frac0( a
);
6476 aExp
= extractFloat128Exp( a
);
6477 aSign
= extractFloat128Sign( a
);
6478 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6479 shiftCount
= aExp
- 0x402F;
6480 if ( 0 < shiftCount
) {
6481 if ( 0x403E <= aExp
) {
6482 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6483 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6484 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6486 float_raise(float_flag_inexact
, status
);
6490 float_raise(float_flag_invalid
, status
);
6491 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6497 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6498 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6499 float_raise(float_flag_inexact
, status
);
6503 if ( aExp
< 0x3FFF ) {
6504 if ( aExp
| aSig0
| aSig1
) {
6505 float_raise(float_flag_inexact
, status
);
6509 z
= aSig0
>>( - shiftCount
);
6511 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6512 float_raise(float_flag_inexact
, status
);
6515 if ( aSign
) z
= - z
;
6520 /*----------------------------------------------------------------------------
6521 | Returns the result of converting the quadruple-precision floating-point value
6522 | `a' to the 64-bit unsigned integer format. The conversion is
6523 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6524 | Arithmetic---which means in particular that the conversion is rounded
6525 | according to the current rounding mode. If `a' is a NaN, the largest
6526 | positive integer is returned. If the conversion overflows, the
6527 | largest unsigned integer is returned. If 'a' is negative, the value is
6528 | rounded and zero is returned; negative values that do not round to zero
6529 | will raise the inexact exception.
6530 *----------------------------------------------------------------------------*/
6532 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6537 uint64_t aSig0
, aSig1
;
6539 aSig0
= extractFloat128Frac0(a
);
6540 aSig1
= extractFloat128Frac1(a
);
6541 aExp
= extractFloat128Exp(a
);
6542 aSign
= extractFloat128Sign(a
);
6543 if (aSign
&& (aExp
> 0x3FFE)) {
6544 float_raise(float_flag_invalid
, status
);
6545 if (float128_is_any_nan(a
)) {
6552 aSig0
|= UINT64_C(0x0001000000000000);
6554 shiftCount
= 0x402F - aExp
;
6555 if (shiftCount
<= 0) {
6556 if (0x403E < aExp
) {
6557 float_raise(float_flag_invalid
, status
);
6560 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6562 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6564 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6567 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6570 signed char current_rounding_mode
= status
->float_rounding_mode
;
6572 set_float_rounding_mode(float_round_to_zero
, status
);
6573 v
= float128_to_uint64(a
, status
);
6574 set_float_rounding_mode(current_rounding_mode
, status
);
6579 /*----------------------------------------------------------------------------
6580 | Returns the result of converting the quadruple-precision floating-point
6581 | value `a' to the 32-bit unsigned integer format. The conversion
6582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6583 | Arithmetic except that the conversion is always rounded toward zero.
6584 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6585 | if the conversion overflows, the largest unsigned integer is returned.
6586 | If 'a' is negative, the value is rounded and zero is returned; negative
6587 | values that do not round to zero will raise the inexact exception.
6588 *----------------------------------------------------------------------------*/
6590 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6594 int old_exc_flags
= get_float_exception_flags(status
);
6596 v
= float128_to_uint64_round_to_zero(a
, status
);
6597 if (v
> 0xffffffff) {
6602 set_float_exception_flags(old_exc_flags
, status
);
6603 float_raise(float_flag_invalid
, status
);
6607 /*----------------------------------------------------------------------------
6608 | Returns the result of converting the quadruple-precision floating-point value
6609 | `a' to the 32-bit unsigned integer format. The conversion is
6610 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6611 | Arithmetic---which means in particular that the conversion is rounded
6612 | according to the current rounding mode. If `a' is a NaN, the largest
6613 | positive integer is returned. If the conversion overflows, the
6614 | largest unsigned integer is returned. If 'a' is negative, the value is
6615 | rounded and zero is returned; negative values that do not round to zero
6616 | will raise the inexact exception.
6617 *----------------------------------------------------------------------------*/
6619 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6623 int old_exc_flags
= get_float_exception_flags(status
);
6625 v
= float128_to_uint64(a
, status
);
6626 if (v
> 0xffffffff) {
6631 set_float_exception_flags(old_exc_flags
, status
);
6632 float_raise(float_flag_invalid
, status
);
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of converting the quadruple-precision floating-point
6638 | value `a' to the single-precision floating-point format. The conversion
6639 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6641 *----------------------------------------------------------------------------*/
6643 float32
float128_to_float32(float128 a
, float_status
*status
)
6647 uint64_t aSig0
, aSig1
;
6650 aSig1
= extractFloat128Frac1( a
);
6651 aSig0
= extractFloat128Frac0( a
);
6652 aExp
= extractFloat128Exp( a
);
6653 aSign
= extractFloat128Sign( a
);
6654 if ( aExp
== 0x7FFF ) {
6655 if ( aSig0
| aSig1
) {
6656 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6658 return packFloat32( aSign
, 0xFF, 0 );
6660 aSig0
|= ( aSig1
!= 0 );
6661 shift64RightJamming( aSig0
, 18, &aSig0
);
6663 if ( aExp
|| zSig
) {
6667 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6671 /*----------------------------------------------------------------------------
6672 | Returns the result of converting the quadruple-precision floating-point
6673 | value `a' to the double-precision floating-point format. The conversion
6674 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6676 *----------------------------------------------------------------------------*/
6678 float64
float128_to_float64(float128 a
, float_status
*status
)
6682 uint64_t aSig0
, aSig1
;
6684 aSig1
= extractFloat128Frac1( a
);
6685 aSig0
= extractFloat128Frac0( a
);
6686 aExp
= extractFloat128Exp( a
);
6687 aSign
= extractFloat128Sign( a
);
6688 if ( aExp
== 0x7FFF ) {
6689 if ( aSig0
| aSig1
) {
6690 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6692 return packFloat64( aSign
, 0x7FF, 0 );
6694 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6695 aSig0
|= ( aSig1
!= 0 );
6696 if ( aExp
|| aSig0
) {
6697 aSig0
|= UINT64_C(0x4000000000000000);
6700 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6704 /*----------------------------------------------------------------------------
6705 | Returns the result of converting the quadruple-precision floating-point
6706 | value `a' to the extended double-precision floating-point format. The
6707 | conversion is performed according to the IEC/IEEE Standard for Binary
6708 | Floating-Point Arithmetic.
6709 *----------------------------------------------------------------------------*/
6711 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6715 uint64_t aSig0
, aSig1
;
6717 aSig1
= extractFloat128Frac1( a
);
6718 aSig0
= extractFloat128Frac0( a
);
6719 aExp
= extractFloat128Exp( a
);
6720 aSign
= extractFloat128Sign( a
);
6721 if ( aExp
== 0x7FFF ) {
6722 if ( aSig0
| aSig1
) {
6723 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6725 return floatx80_silence_nan(res
, status
);
6727 return packFloatx80(aSign
, floatx80_infinity_high
,
6728 floatx80_infinity_low
);
6731 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6732 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6735 aSig0
|= UINT64_C(0x0001000000000000);
6737 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6738 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6742 /*----------------------------------------------------------------------------
6743 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6744 | returns the result as a quadruple-precision floating-point value. The
6745 | operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6749 float128
float128_round_to_int(float128 a
, float_status
*status
)
6753 uint64_t lastBitMask
, roundBitsMask
;
6756 aExp
= extractFloat128Exp( a
);
6757 if ( 0x402F <= aExp
) {
6758 if ( 0x406F <= aExp
) {
6759 if ( ( aExp
== 0x7FFF )
6760 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6762 return propagateFloat128NaN(a
, a
, status
);
6767 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6768 roundBitsMask
= lastBitMask
- 1;
6770 switch (status
->float_rounding_mode
) {
6771 case float_round_nearest_even
:
6772 if ( lastBitMask
) {
6773 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6774 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6777 if ( (int64_t) z
.low
< 0 ) {
6779 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6783 case float_round_ties_away
:
6785 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6787 if ((int64_t) z
.low
< 0) {
6792 case float_round_to_zero
:
6794 case float_round_up
:
6795 if (!extractFloat128Sign(z
)) {
6796 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6799 case float_round_down
:
6800 if (extractFloat128Sign(z
)) {
6801 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6804 case float_round_to_odd
:
6806 * Note that if lastBitMask == 0, the last bit is the lsb
6807 * of high, and roundBitsMask == -1.
6809 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6810 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6816 z
.low
&= ~ roundBitsMask
;
6819 if ( aExp
< 0x3FFF ) {
6820 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6821 float_raise(float_flag_inexact
, status
);
6822 aSign
= extractFloat128Sign( a
);
6823 switch (status
->float_rounding_mode
) {
6824 case float_round_nearest_even
:
6825 if ( ( aExp
== 0x3FFE )
6826 && ( extractFloat128Frac0( a
)
6827 | extractFloat128Frac1( a
) )
6829 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6832 case float_round_ties_away
:
6833 if (aExp
== 0x3FFE) {
6834 return packFloat128(aSign
, 0x3FFF, 0, 0);
6837 case float_round_down
:
6839 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6840 : packFloat128( 0, 0, 0, 0 );
6841 case float_round_up
:
6843 aSign
? packFloat128( 1, 0, 0, 0 )
6844 : packFloat128( 0, 0x3FFF, 0, 0 );
6846 case float_round_to_odd
:
6847 return packFloat128(aSign
, 0x3FFF, 0, 0);
6849 case float_round_to_zero
:
6852 return packFloat128( aSign
, 0, 0, 0 );
6855 lastBitMask
<<= 0x402F - aExp
;
6856 roundBitsMask
= lastBitMask
- 1;
6859 switch (status
->float_rounding_mode
) {
6860 case float_round_nearest_even
:
6861 z
.high
+= lastBitMask
>>1;
6862 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6863 z
.high
&= ~ lastBitMask
;
6866 case float_round_ties_away
:
6867 z
.high
+= lastBitMask
>>1;
6869 case float_round_to_zero
:
6871 case float_round_up
:
6872 if (!extractFloat128Sign(z
)) {
6873 z
.high
|= ( a
.low
!= 0 );
6874 z
.high
+= roundBitsMask
;
6877 case float_round_down
:
6878 if (extractFloat128Sign(z
)) {
6879 z
.high
|= (a
.low
!= 0);
6880 z
.high
+= roundBitsMask
;
6883 case float_round_to_odd
:
6884 if ((z
.high
& lastBitMask
) == 0) {
6885 z
.high
|= (a
.low
!= 0);
6886 z
.high
+= roundBitsMask
;
6892 z
.high
&= ~ roundBitsMask
;
6894 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6895 float_raise(float_flag_inexact
, status
);
6901 /*----------------------------------------------------------------------------
6902 | Returns the result of adding the absolute values of the quadruple-precision
6903 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6904 | before being returned. `zSign' is ignored if the result is a NaN.
6905 | The addition is performed according to the IEC/IEEE Standard for Binary
6906 | Floating-Point Arithmetic.
6907 *----------------------------------------------------------------------------*/
6909 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6910 float_status
*status
)
6912 int32_t aExp
, bExp
, zExp
;
6913 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6916 aSig1
= extractFloat128Frac1( a
);
6917 aSig0
= extractFloat128Frac0( a
);
6918 aExp
= extractFloat128Exp( a
);
6919 bSig1
= extractFloat128Frac1( b
);
6920 bSig0
= extractFloat128Frac0( b
);
6921 bExp
= extractFloat128Exp( b
);
6922 expDiff
= aExp
- bExp
;
6923 if ( 0 < expDiff
) {
6924 if ( aExp
== 0x7FFF ) {
6925 if (aSig0
| aSig1
) {
6926 return propagateFloat128NaN(a
, b
, status
);
6934 bSig0
|= UINT64_C(0x0001000000000000);
6936 shift128ExtraRightJamming(
6937 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6940 else if ( expDiff
< 0 ) {
6941 if ( bExp
== 0x7FFF ) {
6942 if (bSig0
| bSig1
) {
6943 return propagateFloat128NaN(a
, b
, status
);
6945 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6951 aSig0
|= UINT64_C(0x0001000000000000);
6953 shift128ExtraRightJamming(
6954 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6958 if ( aExp
== 0x7FFF ) {
6959 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6960 return propagateFloat128NaN(a
, b
, status
);
6964 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6966 if (status
->flush_to_zero
) {
6967 if (zSig0
| zSig1
) {
6968 float_raise(float_flag_output_denormal
, status
);
6970 return packFloat128(zSign
, 0, 0, 0);
6972 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6975 zSig0
|= UINT64_C(0x0002000000000000);
6979 aSig0
|= UINT64_C(0x0001000000000000);
6980 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6982 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6985 shift128ExtraRightJamming(
6986 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6988 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6992 /*----------------------------------------------------------------------------
6993 | Returns the result of subtracting the absolute values of the quadruple-
6994 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6995 | difference is negated before being returned. `zSign' is ignored if the
6996 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6997 | Standard for Binary Floating-Point Arithmetic.
6998 *----------------------------------------------------------------------------*/
7000 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7001 float_status
*status
)
7003 int32_t aExp
, bExp
, zExp
;
7004 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7007 aSig1
= extractFloat128Frac1( a
);
7008 aSig0
= extractFloat128Frac0( a
);
7009 aExp
= extractFloat128Exp( a
);
7010 bSig1
= extractFloat128Frac1( b
);
7011 bSig0
= extractFloat128Frac0( b
);
7012 bExp
= extractFloat128Exp( b
);
7013 expDiff
= aExp
- bExp
;
7014 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7015 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7016 if ( 0 < expDiff
) goto aExpBigger
;
7017 if ( expDiff
< 0 ) goto bExpBigger
;
7018 if ( aExp
== 0x7FFF ) {
7019 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7020 return propagateFloat128NaN(a
, b
, status
);
7022 float_raise(float_flag_invalid
, status
);
7023 return float128_default_nan(status
);
7029 if ( bSig0
< aSig0
) goto aBigger
;
7030 if ( aSig0
< bSig0
) goto bBigger
;
7031 if ( bSig1
< aSig1
) goto aBigger
;
7032 if ( aSig1
< bSig1
) goto bBigger
;
7033 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7036 if ( bExp
== 0x7FFF ) {
7037 if (bSig0
| bSig1
) {
7038 return propagateFloat128NaN(a
, b
, status
);
7040 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7046 aSig0
|= UINT64_C(0x4000000000000000);
7048 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7049 bSig0
|= UINT64_C(0x4000000000000000);
7051 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7054 goto normalizeRoundAndPack
;
7056 if ( aExp
== 0x7FFF ) {
7057 if (aSig0
| aSig1
) {
7058 return propagateFloat128NaN(a
, b
, status
);
7066 bSig0
|= UINT64_C(0x4000000000000000);
7068 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7069 aSig0
|= UINT64_C(0x4000000000000000);
7071 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7073 normalizeRoundAndPack
:
7075 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7080 /*----------------------------------------------------------------------------
7081 | Returns the result of adding the quadruple-precision floating-point values
7082 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7083 | for Binary Floating-Point Arithmetic.
7084 *----------------------------------------------------------------------------*/
7086 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7090 aSign
= extractFloat128Sign( a
);
7091 bSign
= extractFloat128Sign( b
);
7092 if ( aSign
== bSign
) {
7093 return addFloat128Sigs(a
, b
, aSign
, status
);
7096 return subFloat128Sigs(a
, b
, aSign
, status
);
7101 /*----------------------------------------------------------------------------
7102 | Returns the result of subtracting the quadruple-precision floating-point
7103 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7104 | Standard for Binary Floating-Point Arithmetic.
7105 *----------------------------------------------------------------------------*/
7107 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7111 aSign
= extractFloat128Sign( a
);
7112 bSign
= extractFloat128Sign( b
);
7113 if ( aSign
== bSign
) {
7114 return subFloat128Sigs(a
, b
, aSign
, status
);
7117 return addFloat128Sigs(a
, b
, aSign
, status
);
7122 /*----------------------------------------------------------------------------
7123 | Returns the result of multiplying the quadruple-precision floating-point
7124 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7125 | Standard for Binary Floating-Point Arithmetic.
7126 *----------------------------------------------------------------------------*/
7128 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7130 bool aSign
, bSign
, zSign
;
7131 int32_t aExp
, bExp
, zExp
;
7132 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7134 aSig1
= extractFloat128Frac1( a
);
7135 aSig0
= extractFloat128Frac0( a
);
7136 aExp
= extractFloat128Exp( a
);
7137 aSign
= extractFloat128Sign( a
);
7138 bSig1
= extractFloat128Frac1( b
);
7139 bSig0
= extractFloat128Frac0( b
);
7140 bExp
= extractFloat128Exp( b
);
7141 bSign
= extractFloat128Sign( b
);
7142 zSign
= aSign
^ bSign
;
7143 if ( aExp
== 0x7FFF ) {
7144 if ( ( aSig0
| aSig1
)
7145 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7146 return propagateFloat128NaN(a
, b
, status
);
7148 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7149 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7151 if ( bExp
== 0x7FFF ) {
7152 if (bSig0
| bSig1
) {
7153 return propagateFloat128NaN(a
, b
, status
);
7155 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7157 float_raise(float_flag_invalid
, status
);
7158 return float128_default_nan(status
);
7160 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7163 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7164 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7167 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7168 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7170 zExp
= aExp
+ bExp
- 0x4000;
7171 aSig0
|= UINT64_C(0x0001000000000000);
7172 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7173 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7174 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7175 zSig2
|= ( zSig3
!= 0 );
7176 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7177 shift128ExtraRightJamming(
7178 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7181 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7185 /*----------------------------------------------------------------------------
7186 | Returns the result of dividing the quadruple-precision floating-point value
7187 | `a' by the corresponding value `b'. The operation is performed according to
7188 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7189 *----------------------------------------------------------------------------*/
7191 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7193 bool aSign
, bSign
, zSign
;
7194 int32_t aExp
, bExp
, zExp
;
7195 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7196 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7198 aSig1
= extractFloat128Frac1( a
);
7199 aSig0
= extractFloat128Frac0( a
);
7200 aExp
= extractFloat128Exp( a
);
7201 aSign
= extractFloat128Sign( a
);
7202 bSig1
= extractFloat128Frac1( b
);
7203 bSig0
= extractFloat128Frac0( b
);
7204 bExp
= extractFloat128Exp( b
);
7205 bSign
= extractFloat128Sign( b
);
7206 zSign
= aSign
^ bSign
;
7207 if ( aExp
== 0x7FFF ) {
7208 if (aSig0
| aSig1
) {
7209 return propagateFloat128NaN(a
, b
, status
);
7211 if ( bExp
== 0x7FFF ) {
7212 if (bSig0
| bSig1
) {
7213 return propagateFloat128NaN(a
, b
, status
);
7217 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7219 if ( bExp
== 0x7FFF ) {
7220 if (bSig0
| bSig1
) {
7221 return propagateFloat128NaN(a
, b
, status
);
7223 return packFloat128( zSign
, 0, 0, 0 );
7226 if ( ( bSig0
| bSig1
) == 0 ) {
7227 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7229 float_raise(float_flag_invalid
, status
);
7230 return float128_default_nan(status
);
7232 float_raise(float_flag_divbyzero
, status
);
7233 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7235 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7238 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7239 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7241 zExp
= aExp
- bExp
+ 0x3FFD;
7243 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7245 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7246 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7247 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7250 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7251 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7252 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7253 while ( (int64_t) rem0
< 0 ) {
7255 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7257 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7258 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7259 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7260 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7261 while ( (int64_t) rem1
< 0 ) {
7263 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7265 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7267 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7268 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7272 /*----------------------------------------------------------------------------
7273 | Returns the remainder of the quadruple-precision floating-point value `a'
7274 | with respect to the corresponding value `b'. The operation is performed
7275 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7276 *----------------------------------------------------------------------------*/
7278 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7281 int32_t aExp
, bExp
, expDiff
;
7282 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7283 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7286 aSig1
= extractFloat128Frac1( a
);
7287 aSig0
= extractFloat128Frac0( a
);
7288 aExp
= extractFloat128Exp( a
);
7289 aSign
= extractFloat128Sign( a
);
7290 bSig1
= extractFloat128Frac1( b
);
7291 bSig0
= extractFloat128Frac0( b
);
7292 bExp
= extractFloat128Exp( b
);
7293 if ( aExp
== 0x7FFF ) {
7294 if ( ( aSig0
| aSig1
)
7295 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7296 return propagateFloat128NaN(a
, b
, status
);
7300 if ( bExp
== 0x7FFF ) {
7301 if (bSig0
| bSig1
) {
7302 return propagateFloat128NaN(a
, b
, status
);
7307 if ( ( bSig0
| bSig1
) == 0 ) {
7309 float_raise(float_flag_invalid
, status
);
7310 return float128_default_nan(status
);
7312 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7315 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7316 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7318 expDiff
= aExp
- bExp
;
7319 if ( expDiff
< -1 ) return a
;
7321 aSig0
| UINT64_C(0x0001000000000000),
7323 15 - ( expDiff
< 0 ),
7328 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7329 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7330 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7332 while ( 0 < expDiff
) {
7333 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7334 q
= ( 4 < q
) ? q
- 4 : 0;
7335 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7336 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7337 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7338 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7341 if ( -64 < expDiff
) {
7342 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7343 q
= ( 4 < q
) ? q
- 4 : 0;
7345 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7347 if ( expDiff
< 0 ) {
7348 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7351 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7353 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7354 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7357 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7358 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7361 alternateASig0
= aSig0
;
7362 alternateASig1
= aSig1
;
7364 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7365 } while ( 0 <= (int64_t) aSig0
);
7367 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7368 if ( ( sigMean0
< 0 )
7369 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7370 aSig0
= alternateASig0
;
7371 aSig1
= alternateASig1
;
7373 zSign
= ( (int64_t) aSig0
< 0 );
7374 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7375 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7379 /*----------------------------------------------------------------------------
7380 | Returns the square root of the quadruple-precision floating-point value `a'.
7381 | The operation is performed according to the IEC/IEEE Standard for Binary
7382 | Floating-Point Arithmetic.
7383 *----------------------------------------------------------------------------*/
7385 float128
float128_sqrt(float128 a
, float_status
*status
)
7389 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7390 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7392 aSig1
= extractFloat128Frac1( a
);
7393 aSig0
= extractFloat128Frac0( a
);
7394 aExp
= extractFloat128Exp( a
);
7395 aSign
= extractFloat128Sign( a
);
7396 if ( aExp
== 0x7FFF ) {
7397 if (aSig0
| aSig1
) {
7398 return propagateFloat128NaN(a
, a
, status
);
7400 if ( ! aSign
) return a
;
7404 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7406 float_raise(float_flag_invalid
, status
);
7407 return float128_default_nan(status
);
7410 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7411 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7413 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7414 aSig0
|= UINT64_C(0x0001000000000000);
7415 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7416 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7417 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7418 doubleZSig0
= zSig0
<<1;
7419 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7420 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7421 while ( (int64_t) rem0
< 0 ) {
7424 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7426 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7427 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7428 if ( zSig1
== 0 ) zSig1
= 1;
7429 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7430 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7431 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7432 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7433 while ( (int64_t) rem1
< 0 ) {
7435 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7437 term2
|= doubleZSig0
;
7438 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7440 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7442 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7443 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7447 static inline FloatRelation
7448 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7449 float_status
*status
)
7453 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7454 float_raise(float_flag_invalid
, status
);
7455 return float_relation_unordered
;
7457 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7458 ( extractFloatx80Frac( a
)<<1 ) ) ||
7459 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7460 ( extractFloatx80Frac( b
)<<1 ) )) {
7462 floatx80_is_signaling_nan(a
, status
) ||
7463 floatx80_is_signaling_nan(b
, status
)) {
7464 float_raise(float_flag_invalid
, status
);
7466 return float_relation_unordered
;
7468 aSign
= extractFloatx80Sign( a
);
7469 bSign
= extractFloatx80Sign( b
);
7470 if ( aSign
!= bSign
) {
7472 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7473 ( ( a
.low
| b
.low
) == 0 ) ) {
7475 return float_relation_equal
;
7477 return 1 - (2 * aSign
);
7480 /* Normalize pseudo-denormals before comparison. */
7481 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7484 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7487 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7488 return float_relation_equal
;
7490 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7495 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7497 return floatx80_compare_internal(a
, b
, 0, status
);
7500 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7501 float_status
*status
)
7503 return floatx80_compare_internal(a
, b
, 1, status
);
7506 static inline FloatRelation
7507 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7508 float_status
*status
)
7512 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7513 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7514 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7515 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7517 float128_is_signaling_nan(a
, status
) ||
7518 float128_is_signaling_nan(b
, status
)) {
7519 float_raise(float_flag_invalid
, status
);
7521 return float_relation_unordered
;
7523 aSign
= extractFloat128Sign( a
);
7524 bSign
= extractFloat128Sign( b
);
7525 if ( aSign
!= bSign
) {
7526 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7528 return float_relation_equal
;
7530 return 1 - (2 * aSign
);
7533 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7534 return float_relation_equal
;
7536 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7541 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7543 return float128_compare_internal(a
, b
, 0, status
);
7546 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7547 float_status
*status
)
7549 return float128_compare_internal(a
, b
, 1, status
);
7552 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7558 if (floatx80_invalid_encoding(a
)) {
7559 float_raise(float_flag_invalid
, status
);
7560 return floatx80_default_nan(status
);
7562 aSig
= extractFloatx80Frac( a
);
7563 aExp
= extractFloatx80Exp( a
);
7564 aSign
= extractFloatx80Sign( a
);
7566 if ( aExp
== 0x7FFF ) {
7568 return propagateFloatx80NaN(a
, a
, status
);
7582 } else if (n
< -0x10000) {
7587 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7588 aSign
, aExp
, aSig
, 0, status
);
7591 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7595 uint64_t aSig0
, aSig1
;
7597 aSig1
= extractFloat128Frac1( a
);
7598 aSig0
= extractFloat128Frac0( a
);
7599 aExp
= extractFloat128Exp( a
);
7600 aSign
= extractFloat128Sign( a
);
7601 if ( aExp
== 0x7FFF ) {
7602 if ( aSig0
| aSig1
) {
7603 return propagateFloat128NaN(a
, a
, status
);
7608 aSig0
|= UINT64_C(0x0001000000000000);
7609 } else if (aSig0
== 0 && aSig1
== 0) {
7617 } else if (n
< -0x10000) {
7622 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7627 static void __attribute__((constructor
)) softfloat_init(void)
7629 union_float64 ua
, ub
, uc
, ur
;
7631 if (QEMU_NO_HARDFLOAT
) {
7635 * Test that the host's FMA is not obviously broken. For example,
7636 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7637 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7639 ua
.s
= 0x0020000000000001ULL
;
7640 ub
.s
= 0x3ca0000000000000ULL
;
7641 uc
.s
= 0x0020000000000000ULL
;
7642 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7643 if (ur
.s
!= 0x0020000000000001ULL
) {
7644 force_soft_fma
= true;