4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float. The
504 * exponent is unbiased and the fraction is normalized. All
505 * calculations are done with a 64 bit fraction and then rounded as
506 * appropriate for the final format.
508 * Thanks to the packed FloatClass a decent compiler should be able to
509 * fit the whole structure into registers and avoid using the stack
510 * for parameter passing.
520 #define DECOMPOSED_BINARY_POINT 63
521 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F) \
552 .exp_bias = ((1 << E) - 1) >> 1, \
553 .exp_max = (1 << E) - 1, \
555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt bfloat16_params
= {
574 static const FloatFmt float32_params
= {
578 static const FloatFmt float64_params
= {
582 /* Unpack a float to parts, but do not canonicalize. */
583 static inline FloatParts64
unpack_raw(FloatFmt fmt
, uint64_t raw
)
585 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
587 return (FloatParts64
) {
588 .cls
= float_class_unclassified
,
589 .sign
= extract64(raw
, sign_pos
, 1),
590 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
591 .frac
= extract64(raw
, 0, fmt
.frac_size
),
595 static inline FloatParts64
float16_unpack_raw(float16 f
)
597 return unpack_raw(float16_params
, f
);
600 static inline FloatParts64
bfloat16_unpack_raw(bfloat16 f
)
602 return unpack_raw(bfloat16_params
, f
);
605 static inline FloatParts64
float32_unpack_raw(float32 f
)
607 return unpack_raw(float32_params
, f
);
610 static inline FloatParts64
float64_unpack_raw(float64 f
)
612 return unpack_raw(float64_params
, f
);
615 /* Pack a float from parts, but do not canonicalize. */
616 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts64 p
)
618 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
619 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
620 return deposit64(ret
, sign_pos
, 1, p
.sign
);
623 static inline float16
float16_pack_raw(FloatParts64 p
)
625 return make_float16(pack_raw(float16_params
, p
));
628 static inline bfloat16
bfloat16_pack_raw(FloatParts64 p
)
630 return pack_raw(bfloat16_params
, p
);
633 static inline float32
float32_pack_raw(FloatParts64 p
)
635 return make_float32(pack_raw(float32_params
, p
));
638 static inline float64
float64_pack_raw(FloatParts64 p
)
640 return make_float64(pack_raw(float64_params
, p
));
643 /*----------------------------------------------------------------------------
644 | Functions and definitions to determine: (1) whether tininess for underflow
645 | is detected before or after rounding by default, (2) what (if anything)
646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
648 | are propagated from function inputs to output. These details are target-
650 *----------------------------------------------------------------------------*/
651 #include "softfloat-specialize.c.inc"
653 /* Canonicalize EXP and FRAC, setting CLS. */
654 static FloatParts64
sf_canonicalize(FloatParts64 part
, const FloatFmt
*parm
,
655 float_status
*status
)
657 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
658 if (part
.frac
== 0) {
659 part
.cls
= float_class_inf
;
661 part
.frac
<<= parm
->frac_shift
;
662 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
663 ? float_class_snan
: float_class_qnan
);
665 } else if (part
.exp
== 0) {
666 if (likely(part
.frac
== 0)) {
667 part
.cls
= float_class_zero
;
668 } else if (status
->flush_inputs_to_zero
) {
669 float_raise(float_flag_input_denormal
, status
);
670 part
.cls
= float_class_zero
;
673 int shift
= clz64(part
.frac
);
674 part
.cls
= float_class_normal
;
675 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
679 part
.cls
= float_class_normal
;
680 part
.exp
-= parm
->exp_bias
;
681 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
686 /* Round and uncanonicalize a floating-point number by parts. There
687 * are FRAC_SHIFT bits that may require rounding at the bottom of the
688 * fraction; these bits will be removed. The exponent will be biased
689 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
692 static FloatParts64
round_canonical(FloatParts64 p
, float_status
*s
,
693 const FloatFmt
*parm
)
695 const uint64_t frac_lsb
= parm
->frac_lsb
;
696 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
697 const uint64_t round_mask
= parm
->round_mask
;
698 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
699 const int exp_max
= parm
->exp_max
;
700 const int frac_shift
= parm
->frac_shift
;
709 case float_class_normal
:
710 switch (s
->float_rounding_mode
) {
711 case float_round_nearest_even
:
712 overflow_norm
= false;
713 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
715 case float_round_ties_away
:
716 overflow_norm
= false;
719 case float_round_to_zero
:
720 overflow_norm
= true;
724 inc
= p
.sign
? 0 : round_mask
;
725 overflow_norm
= p
.sign
;
727 case float_round_down
:
728 inc
= p
.sign
? round_mask
: 0;
729 overflow_norm
= !p
.sign
;
731 case float_round_to_odd
:
732 overflow_norm
= true;
733 inc
= frac
& frac_lsb
? 0 : round_mask
;
736 g_assert_not_reached();
739 exp
+= parm
->exp_bias
;
740 if (likely(exp
> 0)) {
741 if (frac
& round_mask
) {
742 flags
|= float_flag_inexact
;
743 if (uadd64_overflow(frac
, inc
, &frac
)) {
744 frac
= (frac
>> 1) | DECOMPOSED_IMPLICIT_BIT
;
750 if (parm
->arm_althp
) {
751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
752 if (unlikely(exp
> exp_max
)) {
753 /* Overflow. Return the maximum normal. */
754 flags
= float_flag_invalid
;
758 } else if (unlikely(exp
>= exp_max
)) {
759 flags
|= float_flag_overflow
| float_flag_inexact
;
764 p
.cls
= float_class_inf
;
768 } else if (s
->flush_to_zero
) {
769 flags
|= float_flag_output_denormal
;
770 p
.cls
= float_class_zero
;
773 bool is_tiny
= s
->tininess_before_rounding
|| (exp
< 0);
777 is_tiny
= !uadd64_overflow(frac
, inc
, &discard
);
780 shift64RightJamming(frac
, 1 - exp
, &frac
);
781 if (frac
& round_mask
) {
782 /* Need to recompute round-to-even. */
783 switch (s
->float_rounding_mode
) {
784 case float_round_nearest_even
:
785 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
788 case float_round_to_odd
:
789 inc
= frac
& frac_lsb
? 0 : round_mask
;
794 flags
|= float_flag_inexact
;
798 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
801 if (is_tiny
&& (flags
& float_flag_inexact
)) {
802 flags
|= float_flag_underflow
;
804 if (exp
== 0 && frac
== 0) {
805 p
.cls
= float_class_zero
;
810 case float_class_zero
:
816 case float_class_inf
:
818 assert(!parm
->arm_althp
);
823 case float_class_qnan
:
824 case float_class_snan
:
825 assert(!parm
->arm_althp
);
827 frac
>>= parm
->frac_shift
;
831 g_assert_not_reached();
834 float_raise(flags
, s
);
840 /* Explicit FloatFmt version */
841 static FloatParts64
float16a_unpack_canonical(float16 f
, float_status
*s
,
842 const FloatFmt
*params
)
844 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
847 static FloatParts64
float16_unpack_canonical(float16 f
, float_status
*s
)
849 return float16a_unpack_canonical(f
, s
, &float16_params
);
852 static FloatParts64
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
854 return sf_canonicalize(bfloat16_unpack_raw(f
), &bfloat16_params
, s
);
857 static float16
float16a_round_pack_canonical(FloatParts64 p
, float_status
*s
,
858 const FloatFmt
*params
)
860 return float16_pack_raw(round_canonical(p
, s
, params
));
863 static float16
float16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
865 return float16a_round_pack_canonical(p
, s
, &float16_params
);
868 static bfloat16
bfloat16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
870 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
873 static FloatParts64
float32_unpack_canonical(float32 f
, float_status
*s
)
875 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
878 static float32
float32_round_pack_canonical(FloatParts64 p
, float_status
*s
)
880 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
883 static FloatParts64
float64_unpack_canonical(float64 f
, float_status
*s
)
885 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
888 static float64
float64_round_pack_canonical(FloatParts64 p
, float_status
*s
)
890 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
893 static FloatParts64
return_nan(FloatParts64 a
, float_status
*s
)
895 g_assert(is_nan(a
.cls
));
896 if (is_snan(a
.cls
)) {
897 float_raise(float_flag_invalid
, s
);
898 if (!s
->default_nan_mode
) {
899 return parts_silence_nan(a
, s
);
901 } else if (!s
->default_nan_mode
) {
904 return parts_default_nan(s
);
907 static FloatParts64
pick_nan(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
909 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
910 float_raise(float_flag_invalid
, s
);
913 if (s
->default_nan_mode
) {
914 return parts_default_nan(s
);
916 if (pickNaN(a
.cls
, b
.cls
,
918 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
921 if (is_snan(a
.cls
)) {
922 return parts_silence_nan(a
, s
);
928 static FloatParts64
pick_nan_muladd(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
929 bool inf_zero
, float_status
*s
)
933 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
934 float_raise(float_flag_invalid
, s
);
937 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
939 if (s
->default_nan_mode
) {
940 /* Note that this check is after pickNaNMulAdd so that function
941 * has an opportunity to set the Invalid flag.
956 return parts_default_nan(s
);
958 g_assert_not_reached();
961 if (is_snan(a
.cls
)) {
962 return parts_silence_nan(a
, s
);
968 * Returns the result of adding or subtracting the values of the
969 * floating-point values `a' and `b'. The operation is performed
970 * according to the IEC/IEEE Standard for Binary Floating-Point
974 static FloatParts64
addsub_floats(FloatParts64 a
, FloatParts64 b
, bool subtract
,
977 bool a_sign
= a
.sign
;
978 bool b_sign
= b
.sign
^ subtract
;
980 if (a_sign
!= b_sign
) {
983 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
984 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
985 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
986 a
.frac
= a
.frac
- b
.frac
;
988 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
989 a
.frac
= b
.frac
- a
.frac
;
995 a
.cls
= float_class_zero
;
996 a
.sign
= s
->float_rounding_mode
== float_round_down
;
998 int shift
= clz64(a
.frac
);
999 a
.frac
= a
.frac
<< shift
;
1000 a
.exp
= a
.exp
- shift
;
1005 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1006 return pick_nan(a
, b
, s
);
1008 if (a
.cls
== float_class_inf
) {
1009 if (b
.cls
== float_class_inf
) {
1010 float_raise(float_flag_invalid
, s
);
1011 return parts_default_nan(s
);
1015 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1016 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1019 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1020 b
.sign
= a_sign
^ 1;
1023 if (b
.cls
== float_class_zero
) {
1028 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1029 if (a
.exp
> b
.exp
) {
1030 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1031 } else if (a
.exp
< b
.exp
) {
1032 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1036 if (uadd64_overflow(a
.frac
, b
.frac
, &a
.frac
)) {
1037 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1038 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
1043 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1044 return pick_nan(a
, b
, s
);
1046 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1049 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1054 g_assert_not_reached();
1058 * Returns the result of adding or subtracting the floating-point
1059 * values `a' and `b'. The operation is performed according to the
1060 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1063 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1065 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1066 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1067 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1069 return float16_round_pack_canonical(pr
, status
);
1072 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1074 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1075 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1076 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1078 return float16_round_pack_canonical(pr
, status
);
1081 static float32 QEMU_SOFTFLOAT_ATTR
1082 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1084 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1085 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1086 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1088 return float32_round_pack_canonical(pr
, status
);
1091 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1093 return soft_f32_addsub(a
, b
, false, status
);
1096 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1098 return soft_f32_addsub(a
, b
, true, status
);
1101 static float64 QEMU_SOFTFLOAT_ATTR
1102 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1104 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1105 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1106 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1108 return float64_round_pack_canonical(pr
, status
);
1111 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1113 return soft_f64_addsub(a
, b
, false, status
);
1116 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1118 return soft_f64_addsub(a
, b
, true, status
);
/* Host-FPU float32 addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}
1126 static float hard_f32_sub(float a
, float b
)
1131 static double hard_f64_add(double a
, double b
)
1136 static double hard_f64_sub(double a
, double b
)
1141 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1143 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1144 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1146 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1149 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1151 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1152 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1154 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1158 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1159 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1161 return float32_gen2(a
, b
, s
, hard
, soft
,
1162 f32_is_zon2
, f32_addsubmul_post
);
1165 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1166 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1168 return float64_gen2(a
, b
, s
, hard
, soft
,
1169 f64_is_zon2
, f64_addsubmul_post
);
1172 float32 QEMU_FLATTEN
1173 float32_add(float32 a
, float32 b
, float_status
*s
)
1175 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1178 float32 QEMU_FLATTEN
1179 float32_sub(float32 a
, float32 b
, float_status
*s
)
1181 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1184 float64 QEMU_FLATTEN
1185 float64_add(float64 a
, float64 b
, float_status
*s
)
1187 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1190 float64 QEMU_FLATTEN
1191 float64_sub(float64 a
, float64 b
, float_status
*s
)
1193 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1197 * Returns the result of adding or subtracting the bfloat16
1198 * values `a' and `b'.
1200 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1202 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1203 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1204 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1206 return bfloat16_round_pack_canonical(pr
, status
);
1209 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1211 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1212 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1213 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1215 return bfloat16_round_pack_canonical(pr
, status
);
1219 * Returns the result of multiplying the floating-point values `a' and
1220 * `b'. The operation is performed according to the IEC/IEEE Standard
1221 * for Binary Floating-Point Arithmetic.
/*
 * Multiply two canonical FloatParts64 values.  The normal*normal case is
 * computed inline with a 64x64->128 multiply; all special cases (NaNs,
 * Inf*0 -> default NaN with invalid, multiply by zero/inf) are handled
 * below the fast path.
 * NOTE(review): part of the normal-path body (original lines 1234-1245,
 * covering the post-multiply exponent/sticky handling and the returns for
 * the zero/inf cases) is not visible in this extraction; do not infer
 * behavior for that region from this text alone.
 */
1224 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1226 bool sign
= a
.sign
^ b
.sign
;
1228 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1230 int exp
= a
.exp
+ b
.exp
;
1232 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
/* Product of two 1.63-format fractions: top bit set means the result
 * needs a one-bit exponent adjustment rather than a renormalizing shift. */
1233 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1246 /* handle all the NaN cases */
1247 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1248 return pick_nan(a
, b
, s
);
1250 /* Inf * Zero == NaN */
1251 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1252 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1253 float_raise(float_flag_invalid
, s
);
1254 return parts_default_nan(s
);
1256 /* Multiply by 0 or Inf */
1257 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1261 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
/* All class combinations are covered above; reaching here is a bug. */
1265 g_assert_not_reached();
1268 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1270 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1271 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1272 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1274 return float16_round_pack_canonical(pr
, status
);
1277 static float32 QEMU_SOFTFLOAT_ATTR
1278 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1280 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1281 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1282 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1284 return float32_round_pack_canonical(pr
, status
);
1287 static float64 QEMU_SOFTFLOAT_ATTR
1288 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1290 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1291 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1292 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1294 return float64_round_pack_canonical(pr
, status
);
/*
 * Host-FPU fast-path multiply helpers for float32_gen2 / float64_gen2.
 * NOTE(review): bodies (original lines 1298-1306) are not visible in this
 * extraction -- presumably each is `return a * b;`; confirm against the
 * full source.
 */
1297 static float hard_f32_mul(float a
, float b
)
1302 static double hard_f64_mul(double a
, double b
)
1307 float32 QEMU_FLATTEN
1308 float32_mul(float32 a
, float32 b
, float_status
*s
)
1310 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1311 f32_is_zon2
, f32_addsubmul_post
);
1314 float64 QEMU_FLATTEN
1315 float64_mul(float64 a
, float64 b
, float_status
*s
)
1317 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1318 f64_is_zon2
, f64_addsubmul_post
);
1322 * Returns the result of multiplying the bfloat16
1323 * values `a' and `b'.
1326 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1328 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1329 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1330 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1332 return bfloat16_round_pack_canonical(pr
, status
);
1336 * Returns the result of multiplying the floating-point values `a' and
1337 * `b' then adding 'c', with no intermediate rounding step after the
1338 * multiplication. The operation is performed according to the
1339 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1340 * The flags argument allows the caller to select negation of the
1341 * addend, the intermediate product, or the final result. (The
1342 * difference between this and having the caller do a separate
1343 * negation is that negating externally will flip the sign bit on
/*
 * Fused multiply-add on canonical parts: computes (a * b) + c with a
 * single final rounding.  `flags' selects negation of the addend, the
 * intermediate product, or the final result (float_muladd_* bits), and
 * float_muladd_halve_result scales the result by 1/2 before rounding.
 * Special cases (NaNs, inf/zero combinations) are resolved first; the
 * normal path does a 128-bit product, aligns/combines with c, then
 * renormalizes.
 * NOTE(review): numerous statements are elided in this extraction (e.g.
 * original lines 1368-1370, 1376-1384, 1399-1421, 1432-1437, and parts of
 * the normalization tail); the visible text is a skeleton only -- consult
 * the full source before reasoning about exact behavior.
 */
1347 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1348 int flags
, float_status
*s
)
1350 bool inf_zero
, p_sign
;
1351 bool sign_flip
= flags
& float_muladd_negate_result
;
1355 int ab_mask
, abc_mask
;
/* Class masks let the special-case tests below be simple bit tests. */
1357 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1358 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1359 inf_zero
= ab_mask
== float_cmask_infzero
;
1361 /* It is implementation-defined whether the cases of (0,inf,qnan)
1362 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1363 * they return if they do), so we have to hand this information
1364 * off to the target-specific pick-a-NaN routine.
1366 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1367 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
/* inf * 0 with no NaN input: invalid operation, default NaN. */
1371 float_raise(float_flag_invalid
, s
);
1372 return parts_default_nan(s
);
1375 if (flags
& float_muladd_negate_c
) {
/* Work out the sign and class of the product up front. */
1379 p_sign
= a
.sign
^ b
.sign
;
1381 if (flags
& float_muladd_negate_product
) {
1385 if (ab_mask
& float_cmask_inf
) {
1386 p_class
= float_class_inf
;
1387 } else if (ab_mask
& float_cmask_zero
) {
1388 p_class
= float_class_zero
;
1390 p_class
= float_class_normal
;
1393 if (c
.cls
== float_class_inf
) {
/* inf - inf (opposite signs) is invalid; same-sign inf + inf is c. */
1394 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1395 float_raise(float_flag_invalid
, s
);
1396 return parts_default_nan(s
);
1398 c
.sign
^= sign_flip
;
1403 if (p_class
== float_class_inf
) {
1404 a
.cls
= float_class_inf
;
1405 a
.sign
= p_sign
^ sign_flip
;
1409 if (p_class
== float_class_zero
) {
1410 if (c
.cls
== float_class_zero
) {
/* (+0) + (-0): result sign depends on the rounding direction. */
1411 if (p_sign
!= c
.sign
) {
1412 p_sign
= s
->float_rounding_mode
== float_round_down
;
1415 } else if (flags
& float_muladd_halve_result
) {
1418 c
.sign
^= sign_flip
;
1422 /* a & b should be normals now... */
1423 assert(a
.cls
== float_class_normal
&&
1424 b
.cls
== float_class_normal
);
1426 p_exp
= a
.exp
+ b
.exp
;
1428 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1430 /* Renormalize to the msb. */
1431 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1434 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1438 if (c
.cls
!= float_class_zero
) {
1439 int exp_diff
= p_exp
- c
.exp
;
1440 if (p_sign
== c
.sign
) {
/* Addition: align the smaller operand, add, renormalize on carry-out. */
1442 if (exp_diff
<= 0) {
1443 shift64RightJamming(hi
, -exp_diff
, &hi
);
1445 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1446 shift64RightJamming(hi
, 1, &hi
);
1447 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1451 uint64_t c_hi
, c_lo
, over
;
1452 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1453 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1455 shift64RightJamming(hi
, 1, &hi
);
1456 hi
|= DECOMPOSED_IMPLICIT_BIT
;
/* Subtraction path: subtract the smaller magnitude from the larger. */
1462 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1464 if (exp_diff
<= 0) {
1465 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1468 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1469 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1471 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1476 shift128RightJamming(c_hi
, c_lo
,
1479 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
/* Exact cancellation: zero result, sign per rounding mode. */
1482 if (hi
== 0 && lo
== 0) {
1483 a
.cls
= float_class_zero
;
1484 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1485 a
.sign
^= sign_flip
;
1492 shift
= clz64(lo
) + 64;
1494 /* Normalizing to a binary point of 124 is the
1495 correct adjust for the exponent. However since we're
1496 shifting, we might as well put the binary point back
1497 at 63 where we really want it. Therefore shift as
1498 if we're leaving 1 bit at the top of the word, but
1499 adjust the exponent as if we're leaving 3 bits. */
1500 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1507 if (flags
& float_muladd_halve_result
) {
1511 /* finally prepare our result */
1512 a
.cls
= float_class_normal
;
1513 a
.sign
= p_sign
^ sign_flip
;
1520 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1521 int flags
, float_status
*status
)
1523 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1524 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1525 FloatParts64 pc
= float16_unpack_canonical(c
, status
);
1526 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1528 return float16_round_pack_canonical(pr
, status
);
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1533 float_status
*status
)
1535 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1536 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1537 FloatParts64 pc
= float32_unpack_canonical(c
, status
);
1538 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1540 return float32_round_pack_canonical(pr
, status
);
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1545 float_status
*status
)
1547 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1548 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1549 FloatParts64 pc
= float64_unpack_canonical(c
, status
);
1550 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1552 return float64_round_pack_canonical(pr
, status
);
/* When set, the muladd fast paths below are skipped and everything goes
 * through soft_f32_muladd/soft_f64_muladd.  NOTE(review): what sets this
 * flag is not visible in this chunk -- presumably a testing/benchmark
 * knob; confirm against the full source. */
1555 static bool force_soft_fma
;
/*
 * Public float32 fused multiply-add.  Tries the host fmaf() fast path when
 * the status flags permit (can_use_fpu) and all inputs are zero-or-normal;
 * raises overflow/underflow by classifying the host result; falls back to
 * soft_f32_muladd otherwise.
 * NOTE(review): many statements (the union loads, the `goto soft' paths,
 * underflow handling, original lines 1559-1608 partially) are elided in
 * this extraction; the visible text is a skeleton only.
 */
1557 float32 QEMU_FLATTEN
1558 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1560 union_float32 ua
, ub
, uc
, ur
;
1566 if (unlikely(!can_use_fpu(s
))) {
1569 if (unlikely(flags
& float_muladd_halve_result
)) {
1573 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1574 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1578 if (unlikely(force_soft_fma
)) {
1583 * When (a || b) == 0, there's no need to check for under/over flow,
1584 * since we know the addend is (normal || 0) and the product is 0.
1586 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
/* Zero product: compute its sign explicitly so the +0/-0 addend rules
 * come out right without invoking the host FMA. */
1590 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1591 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1592 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1594 if (flags
& float_muladd_negate_c
) {
/* Keep originals so the slow path can be retried on under/overflow. */
1599 union_float32 ua_orig
= ua
;
1600 union_float32 uc_orig
= uc
;
1602 if (flags
& float_muladd_negate_product
) {
1605 if (flags
& float_muladd_negate_c
) {
1609 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1611 if (unlikely(f32_is_inf(ur
))) {
1612 float_raise(float_flag_overflow
, s
);
1613 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1619 if (flags
& float_muladd_negate_result
) {
1620 return float32_chs(ur
.s
);
/* Fall-back softfloat path. */
1625 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
/*
 * Public float64 fused multiply-add.  Mirrors float32_muladd: host fma()
 * fast path when permitted, overflow/underflow detection on the host
 * result, softfloat fallback.
 * NOTE(review): statements are elided in this extraction (see the float32
 * variant); also note the visible underflow test compares fabs(ur.h)
 * against FLT_MIN rather than DBL_MIN -- this matches QEMU's deliberately
 * conservative check, not a transcription of DBL_MIN; confirm against the
 * full source before changing.
 */
1628 float64 QEMU_FLATTEN
1629 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1631 union_float64 ua
, ub
, uc
, ur
;
1637 if (unlikely(!can_use_fpu(s
))) {
1640 if (unlikely(flags
& float_muladd_halve_result
)) {
1644 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1645 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1649 if (unlikely(force_soft_fma
)) {
1654 * When (a || b) == 0, there's no need to check for under/over flow,
1655 * since we know the addend is (normal || 0) and the product is 0.
1657 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1661 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1662 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1663 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1665 if (flags
& float_muladd_negate_c
) {
1670 union_float64 ua_orig
= ua
;
1671 union_float64 uc_orig
= uc
;
1673 if (flags
& float_muladd_negate_product
) {
1676 if (flags
& float_muladd_negate_c
) {
1680 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1682 if (unlikely(f64_is_inf(ur
))) {
1683 float_raise(float_flag_overflow
, s
);
1684 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1690 if (flags
& float_muladd_negate_result
) {
1691 return float64_chs(ur
.s
);
1696 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1700 * Returns the result of multiplying the bfloat16 values `a'
1701 * and `b' then adding 'c', with no intermediate rounding step after the
1705 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1706 int flags
, float_status
*status
)
1708 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1709 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1710 FloatParts64 pc
= bfloat16_unpack_canonical(c
, status
);
1711 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1713 return bfloat16_round_pack_canonical(pr
, status
);
1717 * Returns the result of dividing the floating-point value `a' by the
1718 * corresponding value `b'. The operation is performed according to
1719 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Divide two canonical FloatParts64 values.  The normal/normal case uses a
 * 128/64 -> 64 division (udiv_qrnnd) arranged so the quotient is exactly
 * N bits; special cases (NaNs, 0/0 and Inf/Inf -> default NaN, Inf/x, 0/x,
 * x/0 -> divbyzero + Inf, x/Inf -> 0) follow.
 * NOTE(review): several lines are elided in this extraction (e.g. the
 * exponent decrement for the a.frac < b.frac branch, the returns of the
 * special-case arms, original lines 1742/1751-1754/1758-1783 partially);
 * the visible text is a skeleton only.
 */
1722 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1724 bool sign
= a
.sign
^ b
.sign
;
1726 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1727 uint64_t n0
, n1
, q
, r
;
1728 int exp
= a
.exp
- b
.exp
;
1731 * We want a 2*N / N-bit division to produce exactly an N-bit
1732 * result, so that we do not lose any precision and so that we
1733 * do not have to renormalize afterward. If A.frac < B.frac,
1734 * then division would produce an (N-1)-bit result; shift A left
1735 * by one to produce the an N-bit result, and decrement the
1736 * exponent to match.
1738 * The udiv_qrnnd algorithm that we're using requires normalization,
1739 * i.e. the msb of the denominator must be set, which is already true.
1741 if (a
.frac
< b
.frac
) {
1743 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1745 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1747 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1749 /* Set lsb if there is a remainder, to set inexact. */
1750 a
.frac
= q
| (r
!= 0);
1755 /* handle all the NaN cases */
1756 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1757 return pick_nan(a
, b
, s
);
1759 /* 0/0 or Inf/Inf */
1762 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1763 float_raise(float_flag_invalid
, s
);
1764 return parts_default_nan(s
);
1766 /* Inf / x or 0 / x */
1767 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
/* Division by zero of a finite nonzero operand: signal and return Inf. */
1772 if (b
.cls
== float_class_zero
) {
1773 float_raise(float_flag_divbyzero
, s
);
1774 a
.cls
= float_class_inf
;
1779 if (b
.cls
== float_class_inf
) {
1780 a
.cls
= float_class_zero
;
/* All class combinations are covered above; reaching here is a bug. */
1784 g_assert_not_reached();
1787 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1789 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1790 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1791 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1793 return float16_round_pack_canonical(pr
, status
);
1796 static float32 QEMU_SOFTFLOAT_ATTR
1797 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1799 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1800 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1801 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1803 return float32_round_pack_canonical(pr
, status
);
1806 static float64 QEMU_SOFTFLOAT_ATTR
1807 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1809 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1810 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1811 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1813 return float64_round_pack_canonical(pr
, status
);
/*
 * Host-FPU fast-path divide helpers for float32_gen2 / float64_gen2.
 * NOTE(review): bodies (original lines 1817-1825) are not visible in this
 * extraction -- presumably each is `return a / b;`; confirm against the
 * full source.
 */
1816 static float hard_f32_div(float a
, float b
)
1821 static double hard_f64_div(double a
, double b
)
1826 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1828 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1829 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1830 fpclassify(b
.h
) == FP_NORMAL
;
1832 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1835 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1837 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1838 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1839 fpclassify(b
.h
) == FP_NORMAL
;
1841 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1844 static bool f32_div_post(union_float32 a
, union_float32 b
)
1846 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1847 return fpclassify(a
.h
) != FP_ZERO
;
1849 return !float32_is_zero(a
.s
);
1852 static bool f64_div_post(union_float64 a
, union_float64 b
)
1854 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1855 return fpclassify(a
.h
) != FP_ZERO
;
1857 return !float64_is_zero(a
.s
);
1860 float32 QEMU_FLATTEN
1861 float32_div(float32 a
, float32 b
, float_status
*s
)
1863 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1864 f32_div_pre
, f32_div_post
);
1867 float64 QEMU_FLATTEN
1868 float64_div(float64 a
, float64 b
, float_status
*s
)
1870 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1871 f64_div_pre
, f64_div_post
);
1875 * Returns the result of dividing the bfloat16
1876 * value `a' by the corresponding value `b'.
1879 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1881 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1882 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1883 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1885 return bfloat16_round_pack_canonical(pr
, status
);
1889 * Float to Float conversions
1891 * Returns the result of converting one float format to another. The
1892 * conversion is performed according to the IEC/IEEE Standard for
1893 * Binary Floating-Point Arithmetic.
1895 * The float_to_float helper only needs to take care of raising
1896 * invalid exceptions and handling the conversion on NaNs.
/*
 * Core of float-to-float conversion on canonical parts: handles NaN
 * propagation and the ARM alternative-half-precision (no NaN/Inf) rules;
 * everything else passes through for the caller to round/repack.
 * NOTE(review): parts of this function are elided in this extraction
 * (the `s' parameter of the signature, the switch statement head, the
 * returns/breaks around original lines 1900-1931); skeleton only.
 */
1899 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1902 if (dstf
->arm_althp
) {
1904 case float_class_qnan
:
1905 case float_class_snan
:
1906 /* There is no NaN in the destination format. Raise Invalid
1907 * and return a zero with the sign of the input NaN.
1909 float_raise(float_flag_invalid
, s
);
1910 a
.cls
= float_class_zero
;
1915 case float_class_inf
:
1916 /* There is no Inf in the destination format. Raise Invalid
1917 * and return the maximum normal with the correct sign.
1919 float_raise(float_flag_invalid
, s
);
1920 a
.cls
= float_class_normal
;
1921 a
.exp
= dstf
->exp_max
;
1922 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1928 } else if (is_nan(a
.cls
)) {
1929 return return_nan(a
, s
);
1934 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1936 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1937 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1938 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
1939 return float32_round_pack_canonical(pr
, s
);
1942 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1944 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1945 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1946 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1947 return float64_round_pack_canonical(pr
, s
);
1950 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1952 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1953 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1954 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1955 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1958 static float64 QEMU_SOFTFLOAT_ATTR
1959 soft_float32_to_float64(float32 a
, float_status
*s
)
1961 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1962 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1963 return float64_round_pack_canonical(pr
, s
);
/*
 * float32 -> float64 widening with a bit-manipulation fast path for
 * normals and zeros (widening is always exact), falling back to the
 * softfloat path for NaN/Inf/subnormal inputs.
 * NOTE(review): the normal-number fast-path body (original lines
 * 1970-1974, the exponent/fraction rebias) is not visible in this
 * extraction.
 */
1966 float64
float32_to_float64(float32 a
, float_status
*s
)
1968 if (likely(float32_is_normal(a
))) {
1969 /* Widening conversion can never produce inexact results. */
1975 } else if (float32_is_zero(a
)) {
1976 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1978 return soft_float32_to_float64(a
, s
);
1982 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1984 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1985 FloatParts64 p
= float64_unpack_canonical(a
, s
);
1986 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1987 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1990 float32
float64_to_float32(float64 a
, float_status
*s
)
1992 FloatParts64 p
= float64_unpack_canonical(a
, s
);
1993 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
1994 return float32_round_pack_canonical(pr
, s
);
1997 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
1999 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2000 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2001 return float32_round_pack_canonical(pr
, s
);
2004 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2006 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2007 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
2008 return float64_round_pack_canonical(pr
, s
);
2011 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2013 FloatParts64 p
= float32_unpack_canonical(a
, s
);
2014 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2015 return bfloat16_round_pack_canonical(pr
, s
);
2018 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2020 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2021 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2022 return bfloat16_round_pack_canonical(pr
, s
);
2026 * Rounds the floating-point value `a' to an integer, and returns the
2027 * result as a floating-point value. The operation is performed
2028 * according to the IEC/IEEE Standard for Binary Floating-Point
/*
 * Round a canonical value to an integral value in the given rounding mode,
 * with `scale' applied as a power-of-two exponent bias (clamped to
 * +/-0x10000).  NaNs propagate; zero/inf are already integral; normals
 * split into three exponent ranges: already-integral, entirely fractional
 * (result is 0 or +/-1 plus inexact), and partially fractional (mask-based
 * rounding on the fraction).
 * NOTE(review): the switch heads, several break/return statements and the
 * assignments between the visible lines (e.g. original lines 2034-2035,
 * 2043-2044, 2047-2048, 2051-2054, 2060-2080 partially) are elided in
 * this extraction; skeleton only.
 */
2032 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2033 int scale
, float_status
*s
)
2036 case float_class_qnan
:
2037 case float_class_snan
:
2038 return return_nan(a
, s
);
2040 case float_class_zero
:
2041 case float_class_inf
:
2042 /* already "integral" */
2045 case float_class_normal
:
2046 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2049 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2050 /* already integral */
2055 /* all fractional */
2056 float_raise(float_flag_inexact
, s
);
/* `one' decides whether the all-fractional case rounds to magnitude 1. */
2058 case float_round_nearest_even
:
2059 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2061 case float_round_ties_away
:
2062 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2064 case float_round_to_zero
:
2067 case float_round_up
:
2070 case float_round_down
:
2073 case float_round_to_odd
:
2077 g_assert_not_reached();
2081 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2084 a
.cls
= float_class_zero
;
/* Partially fractional: build masks around the integral LSB. */
2087 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2088 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2089 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2090 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2094 case float_round_nearest_even
:
2095 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2097 case float_round_ties_away
:
2100 case float_round_to_zero
:
2103 case float_round_up
:
2104 inc
= a
.sign
? 0 : rnd_mask
;
2106 case float_round_down
:
2107 inc
= a
.sign
? rnd_mask
: 0;
2109 case float_round_to_odd
:
2110 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2113 g_assert_not_reached();
2116 if (a
.frac
& rnd_mask
) {
2117 float_raise(float_flag_inexact
, s
);
/* Carry out of the fraction bumps the exponent and restores the msb. */
2118 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2120 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2123 a
.frac
&= ~rnd_mask
;
2128 g_assert_not_reached();
2133 float16
float16_round_to_int(float16 a
, float_status
*s
)
2135 FloatParts64 pa
= float16_unpack_canonical(a
, s
);
2136 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2137 return float16_round_pack_canonical(pr
, s
);
2140 float32
float32_round_to_int(float32 a
, float_status
*s
)
2142 FloatParts64 pa
= float32_unpack_canonical(a
, s
);
2143 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2144 return float32_round_pack_canonical(pr
, s
);
2147 float64
float64_round_to_int(float64 a
, float_status
*s
)
2149 FloatParts64 pa
= float64_unpack_canonical(a
, s
);
2150 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2151 return float64_round_pack_canonical(pr
, s
);
2155 * Rounds the bfloat16 value `a' to an integer, and returns the
2156 * result as a bfloat16 value.
2159 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2161 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
2162 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2163 return bfloat16_round_pack_canonical(pr
, s
);
2167 * Returns the result of converting the floating-point value `a' to
2168 * the two's complement integer format. The conversion is performed
2169 * according to the IEC/IEEE Standard for Binary Floating-Point
2170 * Arithmetic---which means in particular that the conversion is
2171 * rounded according to the current rounding mode. If `a' is a NaN,
2172 * the largest positive integer is returned. Otherwise, if the
2173 * conversion overflows, the largest integer with the same sign as `a'
/*
 * Convert a canonical value to a signed integer in [min, max], rounding
 * per `rmode' with 2**scale applied first.  NaN -> invalid + max (per the
 * visible flag handling); Inf -> invalid + min/max by sign; overflow
 * saturates with invalid.  Exception flags other than those explicitly
 * raised here are restored to `orig_flags' so round_to_int's inexact is
 * preserved only when the result is in range.
 * NOTE(review): the switch head, the returns for the saturation/in-range
 * paths and the negative-value handling (original lines 2179-2216
 * partially) are elided in this extraction; skeleton only.
 */
2177 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2178 int scale
, int64_t min
, int64_t max
,
2182 int orig_flags
= get_float_exception_flags(s
);
2183 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2186 case float_class_snan
:
2187 case float_class_qnan
:
2188 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2190 case float_class_inf
:
2191 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2192 return p
.sign
? min
: max
;
2193 case float_class_zero
:
2195 case float_class_normal
:
2196 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2197 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
/* Negative values: compare against -min as unsigned to avoid overflow. */
2202 if (r
<= -(uint64_t) min
) {
2205 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2212 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2217 g_assert_not_reached();
2221 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2224 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2225 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2228 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2231 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2232 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2235 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2238 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2239 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2242 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2245 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2246 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2249 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2252 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2253 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2256 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2259 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2260 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2263 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2266 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2267 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2270 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2273 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2274 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2277 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2280 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2281 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2284 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2287 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2288 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2291 int8_t float16_to_int8(float16 a
, float_status
*s
)
2293 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2296 int16_t float16_to_int16(float16 a
, float_status
*s
)
2298 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2301 int32_t float16_to_int32(float16 a
, float_status
*s
)
2303 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2306 int64_t float16_to_int64(float16 a
, float_status
*s
)
2308 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2311 int16_t float32_to_int16(float32 a
, float_status
*s
)
2313 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2316 int32_t float32_to_int32(float32 a
, float_status
*s
)
2318 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2321 int64_t float32_to_int64(float32 a
, float_status
*s
)
2323 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2326 int16_t float64_to_int16(float64 a
, float_status
*s
)
2328 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2331 int32_t float64_to_int32(float64 a
, float_status
*s
)
2333 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2336 int64_t float64_to_int64(float64 a
, float_status
*s
)
2338 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2341 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2343 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2346 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2348 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2351 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2353 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2356 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2358 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2361 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2363 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2366 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2368 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2371 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2373 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2376 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2378 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2381 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2383 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2387 * Returns the result of converting the floating-point value `a' to
2388 * the two's complement integer format.
2391 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2394 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2395 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2398 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2401 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2402 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2405 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2408 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2409 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2412 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2414 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2417 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2419 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2422 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2424 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2427 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2429 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2432 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2434 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2437 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2439 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2443 * Returns the result of converting the floating-point value `a' to
2444 * the unsigned integer format. The conversion is performed according
2445 * to the IEC/IEEE Standard for Binary Floating-Point
2446 * Arithmetic---which means in particular that the conversion is
2447 * rounded according to the current rounding mode. If `a' is a NaN,
2448 * the largest unsigned integer is returned. Otherwise, if the
2449 * conversion overflows, the largest unsigned integer is returned. If
2450 * the 'a' is negative, the result is rounded and zero is returned;
2451 * values that do not round to zero will raise the inexact exception
/*
 * Round the decomposed value `in' to an unsigned integer, saturating
 * at `max' (e.g. UINT16_MAX for the 16-bit callers below).
 * `rmode' and `scale' are forwarded to round_to_int().  On every
 * invalid case the exception flags are rewound to the snapshot taken
 * in `orig_flags' and float_flag_invalid is raised instead, so
 * transient flags set by round_to_int() do not leak out.
 *
 * NOTE(review): several interior lines are elided in this extract
 * (the switch header over p.cls, the `r' declaration, and the
 * normal-path returns); comments below only describe what is visible.
 */
2455 static uint64_t round_to_uint_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2456 int scale
, uint64_t max
,
/* Snapshot flags so round_to_int()'s side effects can be undone. */
2459 int orig_flags
= get_float_exception_flags(s
);
2460 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
/* NaN input: invalid operation. */
2464 case float_class_snan
:
2465 case float_class_qnan
:
2466 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
/* Infinity: invalid operation; -inf saturates to 0, +inf to max. */
2468 case float_class_inf
:
2469 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2470 return p
.sign
? 0 : max
;
2471 case float_class_zero
:
2473 case float_class_normal
:
/* Presumably the negative-input rejection — sign test elided; verify
 * against the full source. */
2475 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
/* Exponent fits: shift the fraction down to integer position. */
2479 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2480 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2482 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2486 /* For uint64 this will never trip, but if p.exp is too large
2487 * to shift a decomposed fraction we shall have exited via the
2491 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2496 g_assert_not_reached();
2500 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2503 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2504 rmode
, scale
, UINT8_MAX
, s
);
2507 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2510 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2511 rmode
, scale
, UINT16_MAX
, s
);
2514 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2517 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2518 rmode
, scale
, UINT32_MAX
, s
);
2521 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2524 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2525 rmode
, scale
, UINT64_MAX
, s
);
2528 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2531 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2532 rmode
, scale
, UINT16_MAX
, s
);
2535 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2538 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2539 rmode
, scale
, UINT32_MAX
, s
);
2542 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2545 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2546 rmode
, scale
, UINT64_MAX
, s
);
2549 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2552 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2553 rmode
, scale
, UINT16_MAX
, s
);
2556 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2559 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2560 rmode
, scale
, UINT32_MAX
, s
);
2563 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2566 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2567 rmode
, scale
, UINT64_MAX
, s
);
2570 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2572 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2575 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2577 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2580 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2582 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2585 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2587 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2590 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2592 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2595 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2597 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2600 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2602 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2605 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2607 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2610 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2612 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2615 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2617 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2620 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2622 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2625 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2627 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2630 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2632 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2635 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2637 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2640 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2642 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2645 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2647 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2650 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2652 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2655 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2657 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2660 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2662 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2666 * Returns the result of converting the bfloat16 value `a' to
2667 * the unsigned integer format.
2670 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2671 int scale
, float_status
*s
)
2673 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2674 rmode
, scale
, UINT16_MAX
, s
);
2677 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2678 int scale
, float_status
*s
)
2680 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2681 rmode
, scale
, UINT32_MAX
, s
);
2684 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2685 int scale
, float_status
*s
)
2687 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2688 rmode
, scale
, UINT64_MAX
, s
);
2691 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2693 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2696 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2698 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2701 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2703 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2706 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2708 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2711 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2713 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2716 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2718 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2722 * Integer to float conversions
2724 * Returns the result of converting the two's complement integer `a'
2725 * to the floating-point format. The conversion is performed according
2726 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Convert the signed integer `a', scaled by 2**scale, to the
 * canonical decomposed format.
 * NOTE(review): the sign handling, the zero test, and the
 * normalisation that produces `f' and `shift' are on lines elided
 * from this extract.
 */
2729 static FloatParts64
int_to_float(int64_t a
, int scale
, float_status
*status
)
2731 FloatParts64 r
= { .sign
= false };
/* Presumably the a == 0 branch — condition elided; verify. */
2734 r
.cls
= float_class_zero
;
2739 r
.cls
= float_class_normal
;
/* Bound scale to 16 bits so r.exp cannot overflow the int32_t that
 * backs FloatParts64.exp (same rationale as scalbn_decomposed). */
2745 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
/* `shift'/`f' come from the elided normalisation code above. */
2747 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2748 r
.frac
= f
<< shift
;
2754 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2756 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2757 return float16_round_pack_canonical(pa
, status
);
2760 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2762 return int64_to_float16_scalbn(a
, scale
, status
);
2765 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2767 return int64_to_float16_scalbn(a
, scale
, status
);
2770 float16
int64_to_float16(int64_t a
, float_status
*status
)
2772 return int64_to_float16_scalbn(a
, 0, status
);
2775 float16
int32_to_float16(int32_t a
, float_status
*status
)
2777 return int64_to_float16_scalbn(a
, 0, status
);
2780 float16
int16_to_float16(int16_t a
, float_status
*status
)
2782 return int64_to_float16_scalbn(a
, 0, status
);
2785 float16
int8_to_float16(int8_t a
, float_status
*status
)
2787 return int64_to_float16_scalbn(a
, 0, status
);
2790 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2792 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2793 return float32_round_pack_canonical(pa
, status
);
2796 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2798 return int64_to_float32_scalbn(a
, scale
, status
);
2801 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2803 return int64_to_float32_scalbn(a
, scale
, status
);
2806 float32
int64_to_float32(int64_t a
, float_status
*status
)
2808 return int64_to_float32_scalbn(a
, 0, status
);
2811 float32
int32_to_float32(int32_t a
, float_status
*status
)
2813 return int64_to_float32_scalbn(a
, 0, status
);
2816 float32
int16_to_float32(int16_t a
, float_status
*status
)
2818 return int64_to_float32_scalbn(a
, 0, status
);
2821 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2823 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2824 return float64_round_pack_canonical(pa
, status
);
2827 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2829 return int64_to_float64_scalbn(a
, scale
, status
);
2832 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2834 return int64_to_float64_scalbn(a
, scale
, status
);
2837 float64
int64_to_float64(int64_t a
, float_status
*status
)
2839 return int64_to_float64_scalbn(a
, 0, status
);
2842 float64
int32_to_float64(int32_t a
, float_status
*status
)
2844 return int64_to_float64_scalbn(a
, 0, status
);
2847 float64
int16_to_float64(int16_t a
, float_status
*status
)
2849 return int64_to_float64_scalbn(a
, 0, status
);
2853 * Returns the result of converting the two's complement integer `a'
2854 * to the bfloat16 format.
2857 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
2859 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2860 return bfloat16_round_pack_canonical(pa
, status
);
2863 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
2865 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2868 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
2870 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2873 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
2875 return int64_to_bfloat16_scalbn(a
, 0, status
);
2878 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
2880 return int64_to_bfloat16_scalbn(a
, 0, status
);
2883 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
2885 return int64_to_bfloat16_scalbn(a
, 0, status
);
2889 * Unsigned Integer to float conversions
2891 * Returns the result of converting the unsigned integer `a' to the
2892 * floating-point format. The conversion is performed according to the
2893 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Convert the unsigned integer `a', scaled by 2**scale, to the
 * canonical decomposed format.  The result sign is always positive.
 * NOTE(review): the zero test and the computation of `shift' are on
 * lines elided from this extract.
 */
2896 static FloatParts64
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2898 FloatParts64 r
= { .sign
= false };
/* Presumably the a == 0 branch — condition elided; verify. */
2902 r
.cls
= float_class_zero
;
/* Bound scale to 16 bits so r.exp cannot overflow the int32_t that
 * backs FloatParts64.exp (same rationale as scalbn_decomposed). */
2904 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2906 r
.cls
= float_class_normal
;
/* `shift' comes from the elided normalisation code above. */
2907 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2908 r
.frac
= a
<< shift
;
2914 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2916 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
2917 return float16_round_pack_canonical(pa
, status
);
2920 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2922 return uint64_to_float16_scalbn(a
, scale
, status
);
2925 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2927 return uint64_to_float16_scalbn(a
, scale
, status
);
2930 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2932 return uint64_to_float16_scalbn(a
, 0, status
);
2935 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2937 return uint64_to_float16_scalbn(a
, 0, status
);
2940 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2942 return uint64_to_float16_scalbn(a
, 0, status
);
2945 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
2947 return uint64_to_float16_scalbn(a
, 0, status
);
2950 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2952 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
2953 return float32_round_pack_canonical(pa
, status
);
2956 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2958 return uint64_to_float32_scalbn(a
, scale
, status
);
2961 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2963 return uint64_to_float32_scalbn(a
, scale
, status
);
2966 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2968 return uint64_to_float32_scalbn(a
, 0, status
);
2971 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2973 return uint64_to_float32_scalbn(a
, 0, status
);
2976 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2978 return uint64_to_float32_scalbn(a
, 0, status
);
2981 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2983 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
2984 return float64_round_pack_canonical(pa
, status
);
2987 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2989 return uint64_to_float64_scalbn(a
, scale
, status
);
2992 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2994 return uint64_to_float64_scalbn(a
, scale
, status
);
2997 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2999 return uint64_to_float64_scalbn(a
, 0, status
);
3002 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3004 return uint64_to_float64_scalbn(a
, 0, status
);
3007 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3009 return uint64_to_float64_scalbn(a
, 0, status
);
3013 * Returns the result of converting the unsigned integer `a' to the
3017 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3019 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3020 return bfloat16_round_pack_canonical(pa
, status
);
3023 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3025 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3028 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3030 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3033 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3035 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3038 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3040 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3043 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3045 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3049 /* min() and max() functions. These can't be implemented as
3050 * 'compare and pick one input' because that would mishandle
3051 * NaNs and +0 vs -0.
3053 * minnum() and maxnum() functions. These are similar to the min()
3054 * and max() functions but if one of the arguments is a QNaN and
3055 * the other is numerical then the numerical argument is returned.
3056 * SNaNs will get quietened before being returned.
3057 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3058 * and maxNum() operations. min() and max() are the typical min/max
3059 * semantics provided by many CPUs which predate that specification.
3061 * minnummag() and maxnummag() functions correspond to minNumMag()
3062 * and maxNumMag() from the IEEE-754 2008.
3064 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3065 bool ieee
, bool ismag
, float_status
*s
)
3067 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3069 /* Takes two floating-point values `a' and `b', one of
3070 * which is a NaN, and returns the appropriate NaN
3071 * result. If either `a' or `b' is a signaling NaN,
3072 * the invalid exception is raised.
3074 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3075 return pick_nan(a
, b
, s
);
3076 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3078 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3082 return pick_nan(a
, b
, s
);
3087 case float_class_normal
:
3090 case float_class_inf
:
3093 case float_class_zero
:
3097 g_assert_not_reached();
3101 case float_class_normal
:
3104 case float_class_inf
:
3107 case float_class_zero
:
3111 g_assert_not_reached();
3115 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3116 bool a_less
= a_exp
< b_exp
;
3117 if (a_exp
== b_exp
) {
3118 a_less
= a
.frac
< b
.frac
;
3120 return a_less
^ ismin
? b
: a
;
3123 if (a
.sign
== b
.sign
) {
3124 bool a_less
= a_exp
< b_exp
;
3125 if (a_exp
== b_exp
) {
3126 a_less
= a
.frac
< b
.frac
;
3128 return a
.sign
^ a_less
^ ismin
? b
: a
;
3130 return a
.sign
^ ismin
? b
: a
;
3135 #define MINMAX(sz, name, ismin, isiee, ismag) \
3136 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3139 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3140 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3141 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3143 return float ## sz ## _round_pack_canonical(pr, s); \
3146 MINMAX(16, min
, true, false, false)
3147 MINMAX(16, minnum
, true, true, false)
3148 MINMAX(16, minnummag
, true, true, true)
3149 MINMAX(16, max
, false, false, false)
3150 MINMAX(16, maxnum
, false, true, false)
3151 MINMAX(16, maxnummag
, false, true, true)
3153 MINMAX(32, min
, true, false, false)
3154 MINMAX(32, minnum
, true, true, false)
3155 MINMAX(32, minnummag
, true, true, true)
3156 MINMAX(32, max
, false, false, false)
3157 MINMAX(32, maxnum
, false, true, false)
3158 MINMAX(32, maxnummag
, false, true, true)
3160 MINMAX(64, min
, true, false, false)
3161 MINMAX(64, minnum
, true, true, false)
3162 MINMAX(64, minnummag
, true, true, true)
3163 MINMAX(64, max
, false, false, false)
3164 MINMAX(64, maxnum
, false, true, false)
3165 MINMAX(64, maxnummag
, false, true, true)
3169 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3170 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3172 FloatParts64 pa = bfloat16_unpack_canonical(a, s); \
3173 FloatParts64 pb = bfloat16_unpack_canonical(b, s); \
3174 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3176 return bfloat16_round_pack_canonical(pr, s); \
3179 BF16_MINMAX(min
, true, false, false)
3180 BF16_MINMAX(minnum
, true, true, false)
3181 BF16_MINMAX(minnummag
, true, true, true)
3182 BF16_MINMAX(max
, false, false, false)
3183 BF16_MINMAX(maxnum
, false, true, false)
3184 BF16_MINMAX(maxnummag
, false, true, true)
3188 /* Floating point compare */
3189 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3192 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3194 a
.cls
== float_class_snan
||
3195 b
.cls
== float_class_snan
) {
3196 float_raise(float_flag_invalid
, s
);
3198 return float_relation_unordered
;
3201 if (a
.cls
== float_class_zero
) {
3202 if (b
.cls
== float_class_zero
) {
3203 return float_relation_equal
;
3205 return b
.sign
? float_relation_greater
: float_relation_less
;
3206 } else if (b
.cls
== float_class_zero
) {
3207 return a
.sign
? float_relation_less
: float_relation_greater
;
3210 /* The only really important thing about infinity is its sign. If
3211 * both are infinities the sign marks the smallest of the two.
3213 if (a
.cls
== float_class_inf
) {
3214 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3215 return float_relation_equal
;
3217 return a
.sign
? float_relation_less
: float_relation_greater
;
3218 } else if (b
.cls
== float_class_inf
) {
3219 return b
.sign
? float_relation_greater
: float_relation_less
;
3222 if (a
.sign
!= b
.sign
) {
3223 return a
.sign
? float_relation_less
: float_relation_greater
;
3226 if (a
.exp
== b
.exp
) {
3227 if (a
.frac
== b
.frac
) {
3228 return float_relation_equal
;
3231 return a
.frac
> b
.frac
?
3232 float_relation_less
: float_relation_greater
;
3234 return a
.frac
> b
.frac
?
3235 float_relation_greater
: float_relation_less
;
3239 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3241 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3246 #define COMPARE(name, attr, sz) \
3248 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3250 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3251 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3252 return compare_floats(pa, pb, is_quiet, s); \
3255 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3256 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3257 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3261 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3263 return soft_f16_compare(a
, b
, false, s
);
3266 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3268 return soft_f16_compare(a
, b
, true, s
);
3271 static FloatRelation QEMU_FLATTEN
3272 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3274 union_float32 ua
, ub
;
3279 if (QEMU_NO_HARDFLOAT
) {
3283 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3284 if (isgreaterequal(ua
.h
, ub
.h
)) {
3285 if (isgreater(ua
.h
, ub
.h
)) {
3286 return float_relation_greater
;
3288 return float_relation_equal
;
3290 if (likely(isless(ua
.h
, ub
.h
))) {
3291 return float_relation_less
;
3293 /* The only condition remaining is unordered.
3294 * Fall through to set flags.
3297 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3300 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3302 return f32_compare(a
, b
, false, s
);
3305 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3307 return f32_compare(a
, b
, true, s
);
3310 static FloatRelation QEMU_FLATTEN
3311 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3313 union_float64 ua
, ub
;
3318 if (QEMU_NO_HARDFLOAT
) {
3322 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3323 if (isgreaterequal(ua
.h
, ub
.h
)) {
3324 if (isgreater(ua
.h
, ub
.h
)) {
3325 return float_relation_greater
;
3327 return float_relation_equal
;
3329 if (likely(isless(ua
.h
, ub
.h
))) {
3330 return float_relation_less
;
3332 /* The only condition remaining is unordered.
3333 * Fall through to set flags.
3336 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3339 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3341 return f64_compare(a
, b
, false, s
);
3344 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3346 return f64_compare(a
, b
, true, s
);
3349 static FloatRelation QEMU_FLATTEN
3350 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3352 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
3353 FloatParts64 pb
= bfloat16_unpack_canonical(b
, s
);
3354 return compare_floats(pa
, pb
, is_quiet
, s
);
3357 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3359 return soft_bf16_compare(a
, b
, false, s
);
3362 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3364 return soft_bf16_compare(a
, b
, true, s
);
3367 /* Multiply A by 2 raised to the power N. */
3368 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
/* NaN in, (possibly silenced) NaN out; infinities and zeros pass
 * through unchanged — only normals have their exponent adjusted. */
3370 if (unlikely(is_nan(a
.cls
))) {
3371 return return_nan(a
, s
);
3373 if (a
.cls
== float_class_normal
) {
3374 /* The largest float type (even though not supported by FloatParts64)
3375 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3376 * still allows rounding to infinity, without allowing overflow
3377 * within the int32_t that backs FloatParts64.exp.
/* NOTE(review): the exponent adjustment itself (a.exp += n) is on a
 * line elided from this extract. */
3379 n
= MIN(MAX(n
, -0x10000), 0x10000);
3385 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3387 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
3388 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3389 return float16_round_pack_canonical(pr
, status
);
3392 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3394 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
3395 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3396 return float32_round_pack_canonical(pr
, status
);
3399 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3401 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
3402 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3403 return float64_round_pack_canonical(pr
, status
);
3406 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3408 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
3409 FloatParts64 pr
= scalbn_decomposed(pa
, n
, status
);
3410 return bfloat16_round_pack_canonical(pr
, status
);
3416 * The old softfloat code did an approximation step before zeroing in
3417 * on the final result. However for simpleness we just compute the
3418 * square root by iterating down from the implicit bit to enough extra
3419 * bits to ensure we get a correctly rounded result.
3421 * This does mean however the calculation is slower than before,
3422 * especially for 64 bit floats.
3425 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3427 uint64_t a_frac
, r_frac
, s_frac
;
3430 if (is_nan(a
.cls
)) {
3431 return return_nan(a
, s
);
3433 if (a
.cls
== float_class_zero
) {
3434 return a
; /* sqrt(+-0) = +-0 */
3437 float_raise(float_flag_invalid
, s
);
3438 return parts_default_nan(s
);
3440 if (a
.cls
== float_class_inf
) {
3441 return a
; /* sqrt(+inf) = +inf */
3444 assert(a
.cls
== float_class_normal
);
3446 /* We need two overflow bits at the top. Adding room for that is a
3447 * right shift. If the exponent is odd, we can discard the low bit
3448 * by multiplying the fraction by 2; that's a left shift. Combine
3449 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3451 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3454 /* Bit-by-bit computation of sqrt. */
3458 /* Iterate from implicit bit down to the 3 extra bits to compute a
3459 * properly rounded result. Remember we've inserted two more bits
3460 * at the top, so these positions are two less.
3462 bit
= DECOMPOSED_BINARY_POINT
- 2;
3463 last_bit
= MAX(p
->frac_shift
- 4, 0);
3465 uint64_t q
= 1ULL << bit
;
3466 uint64_t t_frac
= s_frac
+ q
;
3467 if (t_frac
<= a_frac
) {
3468 s_frac
= t_frac
+ q
;
3473 } while (--bit
>= last_bit
);
3475 /* Undo the right shift done above. If there is any remaining
3476 * fraction, the result is inexact. Set the sticky bit.
3478 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3483 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3485 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
3486 FloatParts64 pr
= sqrt_float(pa
, status
, &float16_params
);
3487 return float16_round_pack_canonical(pr
, status
);
3490 static float32 QEMU_SOFTFLOAT_ATTR
3491 soft_f32_sqrt(float32 a
, float_status
*status
)
3493 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
3494 FloatParts64 pr
= sqrt_float(pa
, status
, &float32_params
);
3495 return float32_round_pack_canonical(pr
, status
);
3498 static float64 QEMU_SOFTFLOAT_ATTR
3499 soft_f64_sqrt(float64 a
, float_status
*status
)
3501 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
3502 FloatParts64 pr
= sqrt_float(pa
, status
, &float64_params
);
3503 return float64_round_pack_canonical(pr
, status
);
3506 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3508 union_float32 ua
, ur
;
3511 if (unlikely(!can_use_fpu(s
))) {
3515 float32_input_flush1(&ua
.s
, s
);
3516 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3517 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3518 fpclassify(ua
.h
) == FP_ZERO
) ||
3522 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3523 float32_is_neg(ua
.s
))) {
3530 return soft_f32_sqrt(ua
.s
, s
);
3533 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3535 union_float64 ua
, ur
;
3538 if (unlikely(!can_use_fpu(s
))) {
3542 float64_input_flush1(&ua
.s
, s
);
3543 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3544 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3545 fpclassify(ua
.h
) == FP_ZERO
) ||
3549 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3550 float64_is_neg(ua
.s
))) {
3557 return soft_f64_sqrt(ua
.s
, s
);
3560 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3562 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
3563 FloatParts64 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3564 return bfloat16_round_pack_canonical(pr
, status
);
3567 /*----------------------------------------------------------------------------
3568 | The pattern for a default generated NaN.
3569 *----------------------------------------------------------------------------*/
3571 float16
float16_default_nan(float_status
*status
)
3573 FloatParts64 p
= parts_default_nan(status
);
3574 p
.frac
>>= float16_params
.frac_shift
;
3575 return float16_pack_raw(p
);
3578 float32
float32_default_nan(float_status
*status
)
3580 FloatParts64 p
= parts_default_nan(status
);
3581 p
.frac
>>= float32_params
.frac_shift
;
3582 return float32_pack_raw(p
);
3585 float64
float64_default_nan(float_status
*status
)
3587 FloatParts64 p
= parts_default_nan(status
);
3588 p
.frac
>>= float64_params
.frac_shift
;
3589 return float64_pack_raw(p
);
3592 float128
float128_default_nan(float_status
*status
)
3594 FloatParts64 p
= parts_default_nan(status
);
3597 /* Extrapolate from the choices made by parts_default_nan to fill
3598 * in the quad-floating format. If the low bit is set, assume we
3599 * want to set all non-snan bits.
3601 r
.low
= -(p
.frac
& 1);
3602 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3603 r
.high
|= UINT64_C(0x7FFF000000000000);
3604 r
.high
|= (uint64_t)p
.sign
<< 63;
3609 bfloat16
bfloat16_default_nan(float_status
*status
)
3611 FloatParts64 p
= parts_default_nan(status
);
3612 p
.frac
>>= bfloat16_params
.frac_shift
;
3613 return bfloat16_pack_raw(p
);
3616 /*----------------------------------------------------------------------------
3617 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3618 *----------------------------------------------------------------------------*/
3620 float16
float16_silence_nan(float16 a
, float_status
*status
)
3622 FloatParts64 p
= float16_unpack_raw(a
);
3623 p
.frac
<<= float16_params
.frac_shift
;
3624 p
= parts_silence_nan(p
, status
);
3625 p
.frac
>>= float16_params
.frac_shift
;
3626 return float16_pack_raw(p
);
3629 float32
float32_silence_nan(float32 a
, float_status
*status
)
3631 FloatParts64 p
= float32_unpack_raw(a
);
3632 p
.frac
<<= float32_params
.frac_shift
;
3633 p
= parts_silence_nan(p
, status
);
3634 p
.frac
>>= float32_params
.frac_shift
;
3635 return float32_pack_raw(p
);
3638 float64
float64_silence_nan(float64 a
, float_status
*status
)
3640 FloatParts64 p
= float64_unpack_raw(a
);
3641 p
.frac
<<= float64_params
.frac_shift
;
3642 p
= parts_silence_nan(p
, status
);
3643 p
.frac
>>= float64_params
.frac_shift
;
3644 return float64_pack_raw(p
);
3647 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3649 FloatParts64 p
= bfloat16_unpack_raw(a
);
3650 p
.frac
<<= bfloat16_params
.frac_shift
;
3651 p
= parts_silence_nan(p
, status
);
3652 p
.frac
>>= bfloat16_params
.frac_shift
;
3653 return bfloat16_pack_raw(p
);
3656 /*----------------------------------------------------------------------------
3657 | If `a' is denormal and we are in flush-to-zero mode then set the
3658 | input-denormal exception and return zero. Otherwise just return the value.
3659 *----------------------------------------------------------------------------*/
3661 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3663 if (p
.exp
== 0 && p
.frac
!= 0) {
3664 float_raise(float_flag_input_denormal
, status
);
3671 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3673 if (status
->flush_inputs_to_zero
) {
3674 FloatParts64 p
= float16_unpack_raw(a
);
3675 if (parts_squash_denormal(p
, status
)) {
3676 return float16_set_sign(float16_zero
, p
.sign
);
3682 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3684 if (status
->flush_inputs_to_zero
) {
3685 FloatParts64 p
= float32_unpack_raw(a
);
3686 if (parts_squash_denormal(p
, status
)) {
3687 return float32_set_sign(float32_zero
, p
.sign
);
3693 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3695 if (status
->flush_inputs_to_zero
) {
3696 FloatParts64 p
= float64_unpack_raw(a
);
3697 if (parts_squash_denormal(p
, status
)) {
3698 return float64_set_sign(float64_zero
, p
.sign
);
3704 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3706 if (status
->flush_inputs_to_zero
) {
3707 FloatParts64 p
= bfloat16_unpack_raw(a
);
3708 if (parts_squash_denormal(p
, status
)) {
3709 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3715 /*----------------------------------------------------------------------------
3716 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3717 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3718 | input. If `zSign' is 1, the input is negated before being converted to an
3719 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3720 | is simply rounded to an integer, with the inexact exception raised if the
3721 | input cannot be represented exactly as an integer. However, if the fixed-
3722 | point input is too large, the invalid exception is raised and the largest
3723 | positive or negative integer is returned.
3724 *----------------------------------------------------------------------------*/
3726 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3727 float_status
*status
)
3729 int8_t roundingMode
;
3730 bool roundNearestEven
;
3731 int8_t roundIncrement
, roundBits
;
3734 roundingMode
= status
->float_rounding_mode
;
3735 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3736 switch (roundingMode
) {
3737 case float_round_nearest_even
:
3738 case float_round_ties_away
:
3739 roundIncrement
= 0x40;
3741 case float_round_to_zero
:
3744 case float_round_up
:
3745 roundIncrement
= zSign
? 0 : 0x7f;
3747 case float_round_down
:
3748 roundIncrement
= zSign
? 0x7f : 0;
3750 case float_round_to_odd
:
3751 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3756 roundBits
= absZ
& 0x7F;
3757 absZ
= ( absZ
+ roundIncrement
)>>7;
3758 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3762 if ( zSign
) z
= - z
;
3763 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3764 float_raise(float_flag_invalid
, status
);
3765 return zSign
? INT32_MIN
: INT32_MAX
;
3768 float_raise(float_flag_inexact
, status
);
3774 /*----------------------------------------------------------------------------
3775 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3776 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3777 | and returns the properly rounded 64-bit integer corresponding to the input.
3778 | If `zSign' is 1, the input is negated before being converted to an integer.
3779 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3780 | the inexact exception raised if the input cannot be represented exactly as
3781 | an integer. However, if the fixed-point input is too large, the invalid
3782 | exception is raised and the largest positive or negative integer is
3784 *----------------------------------------------------------------------------*/
3786 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3787 float_status
*status
)
3789 int8_t roundingMode
;
3790 bool roundNearestEven
, increment
;
3793 roundingMode
= status
->float_rounding_mode
;
3794 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3795 switch (roundingMode
) {
3796 case float_round_nearest_even
:
3797 case float_round_ties_away
:
3798 increment
= ((int64_t) absZ1
< 0);
3800 case float_round_to_zero
:
3803 case float_round_up
:
3804 increment
= !zSign
&& absZ1
;
3806 case float_round_down
:
3807 increment
= zSign
&& absZ1
;
3809 case float_round_to_odd
:
3810 increment
= !(absZ0
& 1) && absZ1
;
3817 if ( absZ0
== 0 ) goto overflow
;
3818 if (!(absZ1
<< 1) && roundNearestEven
) {
3823 if ( zSign
) z
= - z
;
3824 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3826 float_raise(float_flag_invalid
, status
);
3827 return zSign
? INT64_MIN
: INT64_MAX
;
3830 float_raise(float_flag_inexact
, status
);
3836 /*----------------------------------------------------------------------------
3837 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3838 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3839 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3840 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3841 | with the inexact exception raised if the input cannot be represented exactly
3842 | as an integer. However, if the fixed-point input is too large, the invalid
3843 | exception is raised and the largest unsigned integer is returned.
3844 *----------------------------------------------------------------------------*/
3846 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3847 uint64_t absZ1
, float_status
*status
)
3849 int8_t roundingMode
;
3850 bool roundNearestEven
, increment
;
3852 roundingMode
= status
->float_rounding_mode
;
3853 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3854 switch (roundingMode
) {
3855 case float_round_nearest_even
:
3856 case float_round_ties_away
:
3857 increment
= ((int64_t)absZ1
< 0);
3859 case float_round_to_zero
:
3862 case float_round_up
:
3863 increment
= !zSign
&& absZ1
;
3865 case float_round_down
:
3866 increment
= zSign
&& absZ1
;
3868 case float_round_to_odd
:
3869 increment
= !(absZ0
& 1) && absZ1
;
3877 float_raise(float_flag_invalid
, status
);
3880 if (!(absZ1
<< 1) && roundNearestEven
) {
3885 if (zSign
&& absZ0
) {
3886 float_raise(float_flag_invalid
, status
);
3891 float_raise(float_flag_inexact
, status
);
3896 /*----------------------------------------------------------------------------
3897 | Normalizes the subnormal single-precision floating-point value represented
3898 | by the denormalized significand `aSig'. The normalized exponent and
3899 | significand are stored at the locations pointed to by `zExpPtr' and
3900 | `zSigPtr', respectively.
3901 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal float32 significand `aSig', storing the
 * normalized exponent and significand through `zExpPtr'/`zSigPtr'.
 */
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 31 - 23: bring the MSB of the fraction up to bit 23's integer. */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
3914 /*----------------------------------------------------------------------------
3915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3916 | and significand `zSig', and returns the proper single-precision floating-
3917 | point value corresponding to the abstract input. Ordinarily, the abstract
3918 | value is simply rounded and packed into the single-precision format, with
3919 | the inexact exception raised if the abstract input cannot be represented
3920 | exactly. However, if the abstract value is too large, the overflow and
3921 | inexact exceptions are raised and an infinity or maximal finite value is
3922 | returned. If the abstract value is too small, the input value is rounded to
3923 | a subnormal number, and the underflow and inexact exceptions are raised if
3924 | the abstract input cannot be represented exactly as a subnormal single-
3925 | precision floating-point number.
3926 | The input significand `zSig' has its binary point between bits 30
3927 | and 29, which is 7 bits to the left of the usual location. This shifted
3928 | significand must be normalized or smaller. If `zSig' is not normalized,
3929 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3930 | and it must not require rounding. In the usual case that `zSig' is
3931 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3932 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3933 | Binary Floating-Point Arithmetic.
3934 *----------------------------------------------------------------------------*/
3936 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3937 float_status
*status
)
3939 int8_t roundingMode
;
3940 bool roundNearestEven
;
3941 int8_t roundIncrement
, roundBits
;
3944 roundingMode
= status
->float_rounding_mode
;
3945 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3946 switch (roundingMode
) {
3947 case float_round_nearest_even
:
3948 case float_round_ties_away
:
3949 roundIncrement
= 0x40;
3951 case float_round_to_zero
:
3954 case float_round_up
:
3955 roundIncrement
= zSign
? 0 : 0x7f;
3957 case float_round_down
:
3958 roundIncrement
= zSign
? 0x7f : 0;
3960 case float_round_to_odd
:
3961 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3967 roundBits
= zSig
& 0x7F;
3968 if ( 0xFD <= (uint16_t) zExp
) {
3969 if ( ( 0xFD < zExp
)
3970 || ( ( zExp
== 0xFD )
3971 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3973 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3974 roundIncrement
!= 0;
3975 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3976 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3979 if (status
->flush_to_zero
) {
3980 float_raise(float_flag_output_denormal
, status
);
3981 return packFloat32(zSign
, 0, 0);
3983 isTiny
= status
->tininess_before_rounding
3985 || (zSig
+ roundIncrement
< 0x80000000);
3986 shift32RightJamming( zSig
, - zExp
, &zSig
);
3988 roundBits
= zSig
& 0x7F;
3989 if (isTiny
&& roundBits
) {
3990 float_raise(float_flag_underflow
, status
);
3992 if (roundingMode
== float_round_to_odd
) {
3994 * For round-to-odd case, the roundIncrement depends on
3995 * zSig which just changed.
3997 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4002 float_raise(float_flag_inexact
, status
);
4004 zSig
= ( zSig
+ roundIncrement
)>>7;
4005 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4008 if ( zSig
== 0 ) zExp
= 0;
4009 return packFloat32( zSign
, zExp
, zSig
);
4013 /*----------------------------------------------------------------------------
4014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4015 | and significand `zSig', and returns the proper single-precision floating-
4016 | point value corresponding to the abstract input. This routine is just like
4017 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4018 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4019 | floating-point exponent.
4020 *----------------------------------------------------------------------------*/
4023 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4024 float_status
*status
)
4028 shiftCount
= clz32(zSig
) - 1;
4029 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4034 /*----------------------------------------------------------------------------
4035 | Normalizes the subnormal double-precision floating-point value represented
4036 | by the denormalized significand `aSig'. The normalized exponent and
4037 | significand are stored at the locations pointed to by `zExpPtr' and
4038 | `zSigPtr', respectively.
4039 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal float64 significand `aSig', storing the
 * normalized exponent and significand through `zExpPtr'/`zSigPtr'.
 */
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 63 - 52: bring the fraction MSB up to bit 52's integer. */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4052 /*----------------------------------------------------------------------------
4053 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4054 | double-precision floating-point value, returning the result. After being
4055 | shifted into the proper positions, the three fields are simply added
4056 | together to form the result. This means that any integer portion of `zSig'
4057 | will be added into the exponent. Since a properly normalized significand
4058 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4059 | than the desired result exponent whenever `zSig' is a complete, normalized
4061 *----------------------------------------------------------------------------*/
4063 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4066 return make_float64(
4067 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4071 /*----------------------------------------------------------------------------
4072 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4073 | and significand `zSig', and returns the proper double-precision floating-
4074 | point value corresponding to the abstract input. Ordinarily, the abstract
4075 | value is simply rounded and packed into the double-precision format, with
4076 | the inexact exception raised if the abstract input cannot be represented
4077 | exactly. However, if the abstract value is too large, the overflow and
4078 | inexact exceptions are raised and an infinity or maximal finite value is
4079 | returned. If the abstract value is too small, the input value is rounded to
4080 | a subnormal number, and the underflow and inexact exceptions are raised if
4081 | the abstract input cannot be represented exactly as a subnormal double-
4082 | precision floating-point number.
4083 | The input significand `zSig' has its binary point between bits 62
4084 | and 61, which is 10 bits to the left of the usual location. This shifted
4085 | significand must be normalized or smaller. If `zSig' is not normalized,
4086 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4087 | and it must not require rounding. In the usual case that `zSig' is
4088 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4089 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4090 | Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4093 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4094 float_status
*status
)
4096 int8_t roundingMode
;
4097 bool roundNearestEven
;
4098 int roundIncrement
, roundBits
;
4101 roundingMode
= status
->float_rounding_mode
;
4102 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4103 switch (roundingMode
) {
4104 case float_round_nearest_even
:
4105 case float_round_ties_away
:
4106 roundIncrement
= 0x200;
4108 case float_round_to_zero
:
4111 case float_round_up
:
4112 roundIncrement
= zSign
? 0 : 0x3ff;
4114 case float_round_down
:
4115 roundIncrement
= zSign
? 0x3ff : 0;
4117 case float_round_to_odd
:
4118 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4123 roundBits
= zSig
& 0x3FF;
4124 if ( 0x7FD <= (uint16_t) zExp
) {
4125 if ( ( 0x7FD < zExp
)
4126 || ( ( zExp
== 0x7FD )
4127 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4129 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4130 roundIncrement
!= 0;
4131 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4132 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4135 if (status
->flush_to_zero
) {
4136 float_raise(float_flag_output_denormal
, status
);
4137 return packFloat64(zSign
, 0, 0);
4139 isTiny
= status
->tininess_before_rounding
4141 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4142 shift64RightJamming( zSig
, - zExp
, &zSig
);
4144 roundBits
= zSig
& 0x3FF;
4145 if (isTiny
&& roundBits
) {
4146 float_raise(float_flag_underflow
, status
);
4148 if (roundingMode
== float_round_to_odd
) {
4150 * For round-to-odd case, the roundIncrement depends on
4151 * zSig which just changed.
4153 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4158 float_raise(float_flag_inexact
, status
);
4160 zSig
= ( zSig
+ roundIncrement
)>>10;
4161 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4164 if ( zSig
== 0 ) zExp
= 0;
4165 return packFloat64( zSign
, zExp
, zSig
);
4169 /*----------------------------------------------------------------------------
4170 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4171 | and significand `zSig', and returns the proper double-precision floating-
4172 | point value corresponding to the abstract input. This routine is just like
4173 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4174 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4175 | floating-point exponent.
4176 *----------------------------------------------------------------------------*/
4179 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4180 float_status
*status
)
4184 shiftCount
= clz64(zSig
) - 1;
4185 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4190 /*----------------------------------------------------------------------------
4191 | Normalizes the subnormal extended double-precision floating-point value
4192 | represented by the denormalized significand `aSig'. The normalized exponent
4193 | and significand are stored at the locations pointed to by `zExpPtr' and
4194 | `zSigPtr', respectively.
4195 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal extended-precision significand `aSig',
 * storing the normalized exponent and significand through
 * `zExpPtr'/`zSigPtr'.  Floatx80 has an explicit integer bit, so the
 * shift uses the full clz.
 */
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz64(aSig);
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4207 /*----------------------------------------------------------------------------
4208 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4209 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4210 | and returns the proper extended double-precision floating-point value
4211 | corresponding to the abstract input. Ordinarily, the abstract value is
4212 | rounded and packed into the extended double-precision format, with the
4213 | inexact exception raised if the abstract input cannot be represented
4214 | exactly. However, if the abstract value is too large, the overflow and
4215 | inexact exceptions are raised and an infinity or maximal finite value is
4216 | returned. If the abstract value is too small, the input value is rounded to
4217 | a subnormal number, and the underflow and inexact exceptions are raised if
4218 | the abstract input cannot be represented exactly as a subnormal extended
4219 | double-precision floating-point number.
4220 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4221 | number of bits as single or double precision, respectively. Otherwise, the
4222 | result is rounded to the full precision of the extended double-precision
4224 | The input significand must be normalized or smaller. If the input
4225 | significand is not normalized, `zExp' must be 0; in that case, the result
4226 | returned is a subnormal number, and it must not require rounding. The
4227 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4231 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4232 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4233 float_status
*status
)
4235 int8_t roundingMode
;
4236 bool roundNearestEven
, increment
, isTiny
;
4237 int64_t roundIncrement
, roundMask
, roundBits
;
4239 roundingMode
= status
->float_rounding_mode
;
4240 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4241 if ( roundingPrecision
== 80 ) goto precision80
;
4242 if ( roundingPrecision
== 64 ) {
4243 roundIncrement
= UINT64_C(0x0000000000000400);
4244 roundMask
= UINT64_C(0x00000000000007FF);
4246 else if ( roundingPrecision
== 32 ) {
4247 roundIncrement
= UINT64_C(0x0000008000000000);
4248 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4253 zSig0
|= ( zSig1
!= 0 );
4254 switch (roundingMode
) {
4255 case float_round_nearest_even
:
4256 case float_round_ties_away
:
4258 case float_round_to_zero
:
4261 case float_round_up
:
4262 roundIncrement
= zSign
? 0 : roundMask
;
4264 case float_round_down
:
4265 roundIncrement
= zSign
? roundMask
: 0;
4270 roundBits
= zSig0
& roundMask
;
4271 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4272 if ( ( 0x7FFE < zExp
)
4273 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4278 if (status
->flush_to_zero
) {
4279 float_raise(float_flag_output_denormal
, status
);
4280 return packFloatx80(zSign
, 0, 0);
4282 isTiny
= status
->tininess_before_rounding
4284 || (zSig0
<= zSig0
+ roundIncrement
);
4285 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4287 roundBits
= zSig0
& roundMask
;
4288 if (isTiny
&& roundBits
) {
4289 float_raise(float_flag_underflow
, status
);
4292 float_raise(float_flag_inexact
, status
);
4294 zSig0
+= roundIncrement
;
4295 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4296 roundIncrement
= roundMask
+ 1;
4297 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4298 roundMask
|= roundIncrement
;
4300 zSig0
&= ~ roundMask
;
4301 return packFloatx80( zSign
, zExp
, zSig0
);
4305 float_raise(float_flag_inexact
, status
);
4307 zSig0
+= roundIncrement
;
4308 if ( zSig0
< roundIncrement
) {
4310 zSig0
= UINT64_C(0x8000000000000000);
4312 roundIncrement
= roundMask
+ 1;
4313 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4314 roundMask
|= roundIncrement
;
4316 zSig0
&= ~ roundMask
;
4317 if ( zSig0
== 0 ) zExp
= 0;
4318 return packFloatx80( zSign
, zExp
, zSig0
);
4320 switch (roundingMode
) {
4321 case float_round_nearest_even
:
4322 case float_round_ties_away
:
4323 increment
= ((int64_t)zSig1
< 0);
4325 case float_round_to_zero
:
4328 case float_round_up
:
4329 increment
= !zSign
&& zSig1
;
4331 case float_round_down
:
4332 increment
= zSign
&& zSig1
;
4337 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4338 if ( ( 0x7FFE < zExp
)
4339 || ( ( zExp
== 0x7FFE )
4340 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4346 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4347 if ( ( roundingMode
== float_round_to_zero
)
4348 || ( zSign
&& ( roundingMode
== float_round_up
) )
4349 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4351 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4353 return packFloatx80(zSign
,
4354 floatx80_infinity_high
,
4355 floatx80_infinity_low
);
4358 isTiny
= status
->tininess_before_rounding
4361 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4362 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4364 if (isTiny
&& zSig1
) {
4365 float_raise(float_flag_underflow
, status
);
4368 float_raise(float_flag_inexact
, status
);
4370 switch (roundingMode
) {
4371 case float_round_nearest_even
:
4372 case float_round_ties_away
:
4373 increment
= ((int64_t)zSig1
< 0);
4375 case float_round_to_zero
:
4378 case float_round_up
:
4379 increment
= !zSign
&& zSig1
;
4381 case float_round_down
:
4382 increment
= zSign
&& zSig1
;
4389 if (!(zSig1
<< 1) && roundNearestEven
) {
4392 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4394 return packFloatx80( zSign
, zExp
, zSig0
);
4398 float_raise(float_flag_inexact
, status
);
4404 zSig0
= UINT64_C(0x8000000000000000);
4407 if (!(zSig1
<< 1) && roundNearestEven
) {
4413 if ( zSig0
== 0 ) zExp
= 0;
4415 return packFloatx80( zSign
, zExp
, zSig0
);
4419 /*----------------------------------------------------------------------------
4420 | Takes an abstract floating-point value having sign `zSign', exponent
4421 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4422 | and returns the proper extended double-precision floating-point value
4423 | corresponding to the abstract input. This routine is just like
4424 | `roundAndPackFloatx80' except that the input significand does not have to be
4426 *----------------------------------------------------------------------------*/
4428 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4429 bool zSign
, int32_t zExp
,
4430 uint64_t zSig0
, uint64_t zSig1
,
4431 float_status
*status
)
4440 shiftCount
= clz64(zSig0
);
4441 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4443 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4444 zSig0
, zSig1
, status
);
4448 /*----------------------------------------------------------------------------
4449 | Returns the least-significant 64 fraction bits of the quadruple-precision
4450 | floating-point value `a'.
4451 *----------------------------------------------------------------------------*/
4453 static inline uint64_t extractFloat128Frac1( float128 a
)
4460 /*----------------------------------------------------------------------------
4461 | Returns the most-significant 48 fraction bits of the quadruple-precision
4462 | floating-point value `a'.
4463 *----------------------------------------------------------------------------*/
4465 static inline uint64_t extractFloat128Frac0( float128 a
)
4468 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4472 /*----------------------------------------------------------------------------
4473 | Returns the exponent bits of the quadruple-precision floating-point value
4475 *----------------------------------------------------------------------------*/
4477 static inline int32_t extractFloat128Exp( float128 a
)
4480 return ( a
.high
>>48 ) & 0x7FFF;
4484 /*----------------------------------------------------------------------------
4485 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4486 *----------------------------------------------------------------------------*/
4488 static inline bool extractFloat128Sign(float128 a
)
4490 return a
.high
>> 63;
4493 /*----------------------------------------------------------------------------
4494 | Normalizes the subnormal quadruple-precision floating-point value
4495 | represented by the denormalized significand formed by the concatenation of
4496 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4497 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4498 | significand are stored at the location pointed to by `zSig0Ptr', and the
4499 | least significant 64 bits of the normalized significand are stored at the
4500 | location pointed to by `zSig1Ptr'.
4501 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal quad-precision significand `aSig0':`aSig1'.
 * The normalized exponent goes to `zExpPtr'; the top 49 and bottom 64
 * significand bits go to `zSig0Ptr' and `zSig1Ptr' respectively.
 */
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if (aSig0 == 0) {
        /* All significant bits live in the low word. */
        shiftCount = clz64(aSig1) - 15;
        if (shiftCount < 0) {
            *zSig0Ptr = aSig1 >> (-shiftCount);
            *zSig1Ptr = aSig1 << (shiftCount & 63);
        } else {
            *zSig0Ptr = aSig1 << shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = -shiftCount - 63;
    } else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
    }
}
4534 /*----------------------------------------------------------------------------
4535 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4536 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4537 | floating-point value, returning the result. After being shifted into the
4538 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4539 | added together to form the most significant 32 bits of the result. This
4540 | means that any integer portion of `zSig0' will be added into the exponent.
4541 | Since a properly normalized significand will have an integer portion equal
4542 | to 1, the `zExp' input should be 1 less than the desired result exponent
4543 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4545 *----------------------------------------------------------------------------*/
4547 static inline float128
4548 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4553 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4557 /*----------------------------------------------------------------------------
4558 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4559 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4560 | and `zSig2', and returns the proper quadruple-precision floating-point value
4561 | corresponding to the abstract input. Ordinarily, the abstract value is
4562 | simply rounded and packed into the quadruple-precision format, with the
4563 | inexact exception raised if the abstract input cannot be represented
4564 | exactly. However, if the abstract value is too large, the overflow and
4565 | inexact exceptions are raised and an infinity or maximal finite value is
4566 | returned. If the abstract value is too small, the input value is rounded to
4567 | a subnormal number, and the underflow and inexact exceptions are raised if
4568 | the abstract input cannot be represented exactly as a subnormal quadruple-
4569 | precision floating-point number.
4570 | The input significand must be normalized or smaller. If the input
4571 | significand is not normalized, `zExp' must be 0; in that case, the result
4572 | returned is a subnormal number, and it must not require rounding. In the
4573 | usual case that the input significand is normalized, `zExp' must be 1 less
4574 | than the ``true'' floating-point exponent. The handling of underflow and
4575 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4576 *----------------------------------------------------------------------------*/
4578 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4579 uint64_t zSig0
, uint64_t zSig1
,
4580 uint64_t zSig2
, float_status
*status
)
4582 int8_t roundingMode
;
4583 bool roundNearestEven
, increment
, isTiny
;
4585 roundingMode
= status
->float_rounding_mode
;
4586 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4587 switch (roundingMode
) {
4588 case float_round_nearest_even
:
4589 case float_round_ties_away
:
4590 increment
= ((int64_t)zSig2
< 0);
4592 case float_round_to_zero
:
4595 case float_round_up
:
4596 increment
= !zSign
&& zSig2
;
4598 case float_round_down
:
4599 increment
= zSign
&& zSig2
;
4601 case float_round_to_odd
:
4602 increment
= !(zSig1
& 0x1) && zSig2
;
4607 if ( 0x7FFD <= (uint32_t) zExp
) {
4608 if ( ( 0x7FFD < zExp
)
4609 || ( ( zExp
== 0x7FFD )
4611 UINT64_C(0x0001FFFFFFFFFFFF),
4612 UINT64_C(0xFFFFFFFFFFFFFFFF),
4619 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4620 if ( ( roundingMode
== float_round_to_zero
)
4621 || ( zSign
&& ( roundingMode
== float_round_up
) )
4622 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4623 || (roundingMode
== float_round_to_odd
)
4629 UINT64_C(0x0000FFFFFFFFFFFF),
4630 UINT64_C(0xFFFFFFFFFFFFFFFF)
4633 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4636 if (status
->flush_to_zero
) {
4637 float_raise(float_flag_output_denormal
, status
);
4638 return packFloat128(zSign
, 0, 0, 0);
4640 isTiny
= status
->tininess_before_rounding
4643 || lt128(zSig0
, zSig1
,
4644 UINT64_C(0x0001FFFFFFFFFFFF),
4645 UINT64_C(0xFFFFFFFFFFFFFFFF));
4646 shift128ExtraRightJamming(
4647 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4649 if (isTiny
&& zSig2
) {
4650 float_raise(float_flag_underflow
, status
);
4652 switch (roundingMode
) {
4653 case float_round_nearest_even
:
4654 case float_round_ties_away
:
4655 increment
= ((int64_t)zSig2
< 0);
4657 case float_round_to_zero
:
4660 case float_round_up
:
4661 increment
= !zSign
&& zSig2
;
4663 case float_round_down
:
4664 increment
= zSign
&& zSig2
;
4666 case float_round_to_odd
:
4667 increment
= !(zSig1
& 0x1) && zSig2
;
4675 float_raise(float_flag_inexact
, status
);
4678 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4679 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4684 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4686 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4690 /*----------------------------------------------------------------------------
4691 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4692 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4693 | returns the proper quadruple-precision floating-point value corresponding
4694 | to the abstract input. This routine is just like `roundAndPackFloat128'
4695 | except that the input significand has fewer bits and does not have to be
4696 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4698 *----------------------------------------------------------------------------*/
4700 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4701 uint64_t zSig0
, uint64_t zSig1
,
4702 float_status
*status
)
4712 shiftCount
= clz64(zSig0
) - 15;
4713 if ( 0 <= shiftCount
) {
4715 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4718 shift128ExtraRightJamming(
4719 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4722 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4727 /*----------------------------------------------------------------------------
4728 | Returns the result of converting the 32-bit two's complement integer `a'
4729 | to the extended double-precision floating-point format. The conversion
4730 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4732 *----------------------------------------------------------------------------*/
4734 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4741 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4743 absA
= zSign
? - a
: a
;
4744 shiftCount
= clz32(absA
) + 32;
4746 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4750 /*----------------------------------------------------------------------------
4751 | Returns the result of converting the 32-bit two's complement integer `a' to
4752 | the quadruple-precision floating-point format. The conversion is performed
4753 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4756 float128
int32_to_float128(int32_t a
, float_status
*status
)
4763 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4765 absA
= zSign
? - a
: a
;
4766 shiftCount
= clz32(absA
) + 17;
4768 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of converting the 64-bit two's complement integer `a'
4774 | to the extended double-precision floating-point format. The conversion
4775 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4777 *----------------------------------------------------------------------------*/
4779 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4785 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4787 absA
= zSign
? - a
: a
;
4788 shiftCount
= clz64(absA
);
4789 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4793 /*----------------------------------------------------------------------------
4794 | Returns the result of converting the 64-bit two's complement integer `a' to
4795 | the quadruple-precision floating-point format. The conversion is performed
4796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4799 float128
int64_to_float128(int64_t a
, float_status
*status
)
4805 uint64_t zSig0
, zSig1
;
4807 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4809 absA
= zSign
? - a
: a
;
4810 shiftCount
= clz64(absA
) + 49;
4811 zExp
= 0x406E - shiftCount
;
4812 if ( 64 <= shiftCount
) {
4821 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4822 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit unsigned integer `a'
4828 | to the quadruple-precision floating-point format. The conversion is performed
4829 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4832 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4835 return float128_zero
;
4837 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4840 /*----------------------------------------------------------------------------
4841 | Returns the result of converting the single-precision floating-point value
4842 | `a' to the extended double-precision floating-point format. The conversion
4843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4845 *----------------------------------------------------------------------------*/
4847 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4853 a
= float32_squash_input_denormal(a
, status
);
4854 aSig
= extractFloat32Frac( a
);
4855 aExp
= extractFloat32Exp( a
);
4856 aSign
= extractFloat32Sign( a
);
4857 if ( aExp
== 0xFF ) {
4859 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4861 return floatx80_silence_nan(res
, status
);
4863 return packFloatx80(aSign
,
4864 floatx80_infinity_high
,
4865 floatx80_infinity_low
);
4868 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4869 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4872 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4876 /*----------------------------------------------------------------------------
4877 | Returns the result of converting the single-precision floating-point value
4878 | `a' to the double-precision floating-point format. The conversion is
4879 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4881 *----------------------------------------------------------------------------*/
4883 float128
float32_to_float128(float32 a
, float_status
*status
)
4889 a
= float32_squash_input_denormal(a
, status
);
4890 aSig
= extractFloat32Frac( a
);
4891 aExp
= extractFloat32Exp( a
);
4892 aSign
= extractFloat32Sign( a
);
4893 if ( aExp
== 0xFF ) {
4895 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4897 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4900 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4901 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4904 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4908 /*----------------------------------------------------------------------------
4909 | Returns the remainder of the single-precision floating-point value `a'
4910 | with respect to the corresponding value `b'. The operation is performed
4911 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912 *----------------------------------------------------------------------------*/
4914 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4917 int aExp
, bExp
, expDiff
;
4918 uint32_t aSig
, bSig
;
4920 uint64_t aSig64
, bSig64
, q64
;
4921 uint32_t alternateASig
;
4923 a
= float32_squash_input_denormal(a
, status
);
4924 b
= float32_squash_input_denormal(b
, status
);
4926 aSig
= extractFloat32Frac( a
);
4927 aExp
= extractFloat32Exp( a
);
4928 aSign
= extractFloat32Sign( a
);
4929 bSig
= extractFloat32Frac( b
);
4930 bExp
= extractFloat32Exp( b
);
4931 if ( aExp
== 0xFF ) {
4932 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4933 return propagateFloat32NaN(a
, b
, status
);
4935 float_raise(float_flag_invalid
, status
);
4936 return float32_default_nan(status
);
4938 if ( bExp
== 0xFF ) {
4940 return propagateFloat32NaN(a
, b
, status
);
4946 float_raise(float_flag_invalid
, status
);
4947 return float32_default_nan(status
);
4949 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4952 if ( aSig
== 0 ) return a
;
4953 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4955 expDiff
= aExp
- bExp
;
4958 if ( expDiff
< 32 ) {
4961 if ( expDiff
< 0 ) {
4962 if ( expDiff
< -1 ) return a
;
4965 q
= ( bSig
<= aSig
);
4966 if ( q
) aSig
-= bSig
;
4967 if ( 0 < expDiff
) {
4968 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4971 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4979 if ( bSig
<= aSig
) aSig
-= bSig
;
4980 aSig64
= ( (uint64_t) aSig
)<<40;
4981 bSig64
= ( (uint64_t) bSig
)<<40;
4983 while ( 0 < expDiff
) {
4984 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4985 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4986 aSig64
= - ( ( bSig
* q64
)<<38 );
4990 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4991 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4992 q
= q64
>>( 64 - expDiff
);
4994 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4997 alternateASig
= aSig
;
5000 } while ( 0 <= (int32_t) aSig
);
5001 sigMean
= aSig
+ alternateASig
;
5002 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5003 aSig
= alternateASig
;
5005 zSign
= ( (int32_t) aSig
< 0 );
5006 if ( zSign
) aSig
= - aSig
;
5007 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5012 /*----------------------------------------------------------------------------
5013 | Returns the binary exponential of the single-precision floating-point value
5014 | `a'. The operation is performed according to the IEC/IEEE Standard for
5015 | Binary Floating-Point Arithmetic.
5017 | Uses the following identities:
5018 |
5019 | 1. -------------------------------------------------------------------------
5020 |      x    x*ln(2)
5021 |     2  = e
5022 |
5023 | 2. -------------------------------------------------------------------------
5024 |                      2     3     4     5           n
5025 |      x        x     x     x     x     x           x
5030 static const float64 float32_exp2_coefficients
[15] =
5032 const_float64( 0x3ff0000000000000ll
), /* 1 */
5033 const_float64( 0x3fe0000000000000ll
), /* 2 */
5034 const_float64( 0x3fc5555555555555ll
), /* 3 */
5035 const_float64( 0x3fa5555555555555ll
), /* 4 */
5036 const_float64( 0x3f81111111111111ll
), /* 5 */
5037 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5038 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5039 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5040 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5041 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5042 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5043 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5044 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5045 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5046 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5049 float32
float32_exp2(float32 a
, float_status
*status
)
5056 a
= float32_squash_input_denormal(a
, status
);
5058 aSig
= extractFloat32Frac( a
);
5059 aExp
= extractFloat32Exp( a
);
5060 aSign
= extractFloat32Sign( a
);
5062 if ( aExp
== 0xFF) {
5064 return propagateFloat32NaN(a
, float32_zero
, status
);
5066 return (aSign
) ? float32_zero
: a
;
5069 if (aSig
== 0) return float32_one
;
5072 float_raise(float_flag_inexact
, status
);
5074 /* ******************************* */
5075 /* using float64 for approximation */
5076 /* ******************************* */
5077 x
= float32_to_float64(a
, status
);
5078 x
= float64_mul(x
, float64_ln2
, status
);
5082 for (i
= 0 ; i
< 15 ; i
++) {
5085 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5086 r
= float64_add(r
, f
, status
);
5088 xn
= float64_mul(xn
, x
, status
);
5091 return float64_to_float32(r
, status
);
5094 /*----------------------------------------------------------------------------
5095 | Returns the binary log of the single-precision floating-point value `a'.
5096 | The operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099 float32
float32_log2(float32 a
, float_status
*status
)
5103 uint32_t aSig
, zSig
, i
;
5105 a
= float32_squash_input_denormal(a
, status
);
5106 aSig
= extractFloat32Frac( a
);
5107 aExp
= extractFloat32Exp( a
);
5108 aSign
= extractFloat32Sign( a
);
5111 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5112 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5115 float_raise(float_flag_invalid
, status
);
5116 return float32_default_nan(status
);
5118 if ( aExp
== 0xFF ) {
5120 return propagateFloat32NaN(a
, float32_zero
, status
);
5130 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5131 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5132 if ( aSig
& 0x01000000 ) {
5141 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5144 /*----------------------------------------------------------------------------
5145 | Returns the result of converting the double-precision floating-point value
5146 | `a' to the extended double-precision floating-point format. The conversion
5147 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5149 *----------------------------------------------------------------------------*/
5151 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5157 a
= float64_squash_input_denormal(a
, status
);
5158 aSig
= extractFloat64Frac( a
);
5159 aExp
= extractFloat64Exp( a
);
5160 aSign
= extractFloat64Sign( a
);
5161 if ( aExp
== 0x7FF ) {
5163 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5165 return floatx80_silence_nan(res
, status
);
5167 return packFloatx80(aSign
,
5168 floatx80_infinity_high
,
5169 floatx80_infinity_low
);
5172 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5173 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5177 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5181 /*----------------------------------------------------------------------------
5182 | Returns the result of converting the double-precision floating-point value
5183 | `a' to the quadruple-precision floating-point format. The conversion is
5184 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5186 *----------------------------------------------------------------------------*/
5188 float128
float64_to_float128(float64 a
, float_status
*status
)
5192 uint64_t aSig
, zSig0
, zSig1
;
5194 a
= float64_squash_input_denormal(a
, status
);
5195 aSig
= extractFloat64Frac( a
);
5196 aExp
= extractFloat64Exp( a
);
5197 aSign
= extractFloat64Sign( a
);
5198 if ( aExp
== 0x7FF ) {
5200 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5202 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5205 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5206 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5209 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5210 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5215 /*----------------------------------------------------------------------------
5216 | Returns the remainder of the double-precision floating-point value `a'
5217 | with respect to the corresponding value `b'. The operation is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5221 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5224 int aExp
, bExp
, expDiff
;
5225 uint64_t aSig
, bSig
;
5226 uint64_t q
, alternateASig
;
5229 a
= float64_squash_input_denormal(a
, status
);
5230 b
= float64_squash_input_denormal(b
, status
);
5231 aSig
= extractFloat64Frac( a
);
5232 aExp
= extractFloat64Exp( a
);
5233 aSign
= extractFloat64Sign( a
);
5234 bSig
= extractFloat64Frac( b
);
5235 bExp
= extractFloat64Exp( b
);
5236 if ( aExp
== 0x7FF ) {
5237 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5238 return propagateFloat64NaN(a
, b
, status
);
5240 float_raise(float_flag_invalid
, status
);
5241 return float64_default_nan(status
);
5243 if ( bExp
== 0x7FF ) {
5245 return propagateFloat64NaN(a
, b
, status
);
5251 float_raise(float_flag_invalid
, status
);
5252 return float64_default_nan(status
);
5254 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5257 if ( aSig
== 0 ) return a
;
5258 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5260 expDiff
= aExp
- bExp
;
5261 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5262 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5263 if ( expDiff
< 0 ) {
5264 if ( expDiff
< -1 ) return a
;
5267 q
= ( bSig
<= aSig
);
5268 if ( q
) aSig
-= bSig
;
5270 while ( 0 < expDiff
) {
5271 q
= estimateDiv128To64( aSig
, 0, bSig
);
5272 q
= ( 2 < q
) ? q
- 2 : 0;
5273 aSig
= - ( ( bSig
>>2 ) * q
);
5277 if ( 0 < expDiff
) {
5278 q
= estimateDiv128To64( aSig
, 0, bSig
);
5279 q
= ( 2 < q
) ? q
- 2 : 0;
5282 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5289 alternateASig
= aSig
;
5292 } while ( 0 <= (int64_t) aSig
);
5293 sigMean
= aSig
+ alternateASig
;
5294 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5295 aSig
= alternateASig
;
5297 zSign
= ( (int64_t) aSig
< 0 );
5298 if ( zSign
) aSig
= - aSig
;
5299 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5303 /*----------------------------------------------------------------------------
5304 | Returns the binary log of the double-precision floating-point value `a'.
5305 | The operation is performed according to the IEC/IEEE Standard for Binary
5306 | Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5308 float64
float64_log2(float64 a
, float_status
*status
)
5312 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5313 a
= float64_squash_input_denormal(a
, status
);
5315 aSig
= extractFloat64Frac( a
);
5316 aExp
= extractFloat64Exp( a
);
5317 aSign
= extractFloat64Sign( a
);
5320 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5321 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5324 float_raise(float_flag_invalid
, status
);
5325 return float64_default_nan(status
);
5327 if ( aExp
== 0x7FF ) {
5329 return propagateFloat64NaN(a
, float64_zero
, status
);
5335 aSig
|= UINT64_C(0x0010000000000000);
5337 zSig
= (uint64_t)aExp
<< 52;
5338 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5339 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5340 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5341 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5349 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5352 /*----------------------------------------------------------------------------
5353 | Returns the result of converting the extended double-precision floating-
5354 | point value `a' to the 32-bit two's complement integer format. The
5355 | conversion is performed according to the IEC/IEEE Standard for Binary
5356 | Floating-Point Arithmetic---which means in particular that the conversion
5357 | is rounded according to the current rounding mode. If `a' is a NaN, the
5358 | largest positive integer is returned. Otherwise, if the conversion
5359 | overflows, the largest integer with the same sign as `a' is returned.
5360 *----------------------------------------------------------------------------*/
5362 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5365 int32_t aExp
, shiftCount
;
5368 if (floatx80_invalid_encoding(a
)) {
5369 float_raise(float_flag_invalid
, status
);
5372 aSig
= extractFloatx80Frac( a
);
5373 aExp
= extractFloatx80Exp( a
);
5374 aSign
= extractFloatx80Sign( a
);
5375 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5376 shiftCount
= 0x4037 - aExp
;
5377 if ( shiftCount
<= 0 ) shiftCount
= 1;
5378 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5379 return roundAndPackInt32(aSign
, aSig
, status
);
5383 /*----------------------------------------------------------------------------
5384 | Returns the result of converting the extended double-precision floating-
5385 | point value `a' to the 32-bit two's complement integer format. The
5386 | conversion is performed according to the IEC/IEEE Standard for Binary
5387 | Floating-Point Arithmetic, except that the conversion is always rounded
5388 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5389 | Otherwise, if the conversion overflows, the largest integer with the same
5390 | sign as `a' is returned.
5391 *----------------------------------------------------------------------------*/
5393 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5396 int32_t aExp
, shiftCount
;
5397 uint64_t aSig
, savedASig
;
5400 if (floatx80_invalid_encoding(a
)) {
5401 float_raise(float_flag_invalid
, status
);
5404 aSig
= extractFloatx80Frac( a
);
5405 aExp
= extractFloatx80Exp( a
);
5406 aSign
= extractFloatx80Sign( a
);
5407 if ( 0x401E < aExp
) {
5408 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5411 else if ( aExp
< 0x3FFF ) {
5413 float_raise(float_flag_inexact
, status
);
5417 shiftCount
= 0x403E - aExp
;
5419 aSig
>>= shiftCount
;
5421 if ( aSign
) z
= - z
;
5422 if ( ( z
< 0 ) ^ aSign
) {
5424 float_raise(float_flag_invalid
, status
);
5425 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5427 if ( ( aSig
<<shiftCount
) != savedASig
) {
5428 float_raise(float_flag_inexact
, status
);
5434 /*----------------------------------------------------------------------------
5435 | Returns the result of converting the extended double-precision floating-
5436 | point value `a' to the 64-bit two's complement integer format. The
5437 | conversion is performed according to the IEC/IEEE Standard for Binary
5438 | Floating-Point Arithmetic---which means in particular that the conversion
5439 | is rounded according to the current rounding mode. If `a' is a NaN,
5440 | the largest positive integer is returned. Otherwise, if the conversion
5441 | overflows, the largest integer with the same sign as `a' is returned.
5442 *----------------------------------------------------------------------------*/
5444 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5447 int32_t aExp
, shiftCount
;
5448 uint64_t aSig
, aSigExtra
;
5450 if (floatx80_invalid_encoding(a
)) {
5451 float_raise(float_flag_invalid
, status
);
5454 aSig
= extractFloatx80Frac( a
);
5455 aExp
= extractFloatx80Exp( a
);
5456 aSign
= extractFloatx80Sign( a
);
5457 shiftCount
= 0x403E - aExp
;
5458 if ( shiftCount
<= 0 ) {
5460 float_raise(float_flag_invalid
, status
);
5461 if (!aSign
|| floatx80_is_any_nan(a
)) {
5469 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5471 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5475 /*----------------------------------------------------------------------------
5476 | Returns the result of converting the extended double-precision floating-
5477 | point value `a' to the 64-bit two's complement integer format. The
5478 | conversion is performed according to the IEC/IEEE Standard for Binary
5479 | Floating-Point Arithmetic, except that the conversion is always rounded
5480 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5481 | Otherwise, if the conversion overflows, the largest integer with the same
5482 | sign as `a' is returned.
5483 *----------------------------------------------------------------------------*/
5485 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5488 int32_t aExp
, shiftCount
;
5492 if (floatx80_invalid_encoding(a
)) {
5493 float_raise(float_flag_invalid
, status
);
5496 aSig
= extractFloatx80Frac( a
);
5497 aExp
= extractFloatx80Exp( a
);
5498 aSign
= extractFloatx80Sign( a
);
5499 shiftCount
= aExp
- 0x403E;
5500 if ( 0 <= shiftCount
) {
5501 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5502 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5503 float_raise(float_flag_invalid
, status
);
5504 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5510 else if ( aExp
< 0x3FFF ) {
5512 float_raise(float_flag_inexact
, status
);
5516 z
= aSig
>>( - shiftCount
);
5517 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5518 float_raise(float_flag_inexact
, status
);
5520 if ( aSign
) z
= - z
;
5525 /*----------------------------------------------------------------------------
5526 | Returns the result of converting the extended double-precision floating-
5527 | point value `a' to the single-precision floating-point format. The
5528 | conversion is performed according to the IEC/IEEE Standard for Binary
5529 | Floating-Point Arithmetic.
5530 *----------------------------------------------------------------------------*/
5532 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5538 if (floatx80_invalid_encoding(a
)) {
5539 float_raise(float_flag_invalid
, status
);
5540 return float32_default_nan(status
);
5542 aSig
= extractFloatx80Frac( a
);
5543 aExp
= extractFloatx80Exp( a
);
5544 aSign
= extractFloatx80Sign( a
);
5545 if ( aExp
== 0x7FFF ) {
5546 if ( (uint64_t) ( aSig
<<1 ) ) {
5547 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5549 return float32_silence_nan(res
, status
);
5551 return packFloat32( aSign
, 0xFF, 0 );
5553 shift64RightJamming( aSig
, 33, &aSig
);
5554 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5555 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5559 /*----------------------------------------------------------------------------
5560 | Returns the result of converting the extended double-precision floating-
5561 | point value `a' to the double-precision floating-point format. The
5562 | conversion is performed according to the IEC/IEEE Standard for Binary
5563 | Floating-Point Arithmetic.
5564 *----------------------------------------------------------------------------*/
5566 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5570 uint64_t aSig
, zSig
;
5572 if (floatx80_invalid_encoding(a
)) {
5573 float_raise(float_flag_invalid
, status
);
5574 return float64_default_nan(status
);
5576 aSig
= extractFloatx80Frac( a
);
5577 aExp
= extractFloatx80Exp( a
);
5578 aSign
= extractFloatx80Sign( a
);
5579 if ( aExp
== 0x7FFF ) {
5580 if ( (uint64_t) ( aSig
<<1 ) ) {
5581 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5583 return float64_silence_nan(res
, status
);
5585 return packFloat64( aSign
, 0x7FF, 0 );
5587 shift64RightJamming( aSig
, 1, &zSig
);
5588 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5589 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5593 /*----------------------------------------------------------------------------
5594 | Returns the result of converting the extended double-precision floating-
5595 | point value `a' to the quadruple-precision floating-point format. The
5596 | conversion is performed according to the IEC/IEEE Standard for Binary
5597 | Floating-Point Arithmetic.
5598 *----------------------------------------------------------------------------*/
5600 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5604 uint64_t aSig
, zSig0
, zSig1
;
5606 if (floatx80_invalid_encoding(a
)) {
5607 float_raise(float_flag_invalid
, status
);
5608 return float128_default_nan(status
);
5610 aSig
= extractFloatx80Frac( a
);
5611 aExp
= extractFloatx80Exp( a
);
5612 aSign
= extractFloatx80Sign( a
);
5613 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5614 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5616 return float128_silence_nan(res
, status
);
5618 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5619 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5623 /*----------------------------------------------------------------------------
5624 | Rounds the extended double-precision floating-point value `a'
5625 | to the precision provided by floatx80_rounding_precision and returns the
5626 | result as an extended double-precision floating-point value.
5627 | The operation is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic.
5629 *----------------------------------------------------------------------------*/
5631 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5633 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5634 extractFloatx80Sign(a
),
5635 extractFloatx80Exp(a
),
5636 extractFloatx80Frac(a
), 0, status
);
5639 /*----------------------------------------------------------------------------
5640 | Rounds the extended double-precision floating-point value `a' to an integer,
5641 | and returns the result as an extended quadruple-precision floating-point
5642 | value. The operation is performed according to the IEC/IEEE Standard for
5643 | Binary Floating-Point Arithmetic.
5644 *----------------------------------------------------------------------------*/
5646 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5650 uint64_t lastBitMask
, roundBitsMask
;
5653 if (floatx80_invalid_encoding(a
)) {
5654 float_raise(float_flag_invalid
, status
);
5655 return floatx80_default_nan(status
);
5657 aExp
= extractFloatx80Exp( a
);
5658 if ( 0x403E <= aExp
) {
5659 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5660 return propagateFloatx80NaN(a
, a
, status
);
5664 if ( aExp
< 0x3FFF ) {
5666 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5669 float_raise(float_flag_inexact
, status
);
5670 aSign
= extractFloatx80Sign( a
);
5671 switch (status
->float_rounding_mode
) {
5672 case float_round_nearest_even
:
5673 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5676 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5679 case float_round_ties_away
:
5680 if (aExp
== 0x3FFE) {
5681 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5684 case float_round_down
:
5687 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5688 : packFloatx80( 0, 0, 0 );
5689 case float_round_up
:
5691 aSign
? packFloatx80( 1, 0, 0 )
5692 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5694 case float_round_to_zero
:
5697 g_assert_not_reached();
5699 return packFloatx80( aSign
, 0, 0 );
5702 lastBitMask
<<= 0x403E - aExp
;
5703 roundBitsMask
= lastBitMask
- 1;
5705 switch (status
->float_rounding_mode
) {
5706 case float_round_nearest_even
:
5707 z
.low
+= lastBitMask
>>1;
5708 if ((z
.low
& roundBitsMask
) == 0) {
5709 z
.low
&= ~lastBitMask
;
5712 case float_round_ties_away
:
5713 z
.low
+= lastBitMask
>> 1;
5715 case float_round_to_zero
:
5717 case float_round_up
:
5718 if (!extractFloatx80Sign(z
)) {
5719 z
.low
+= roundBitsMask
;
5722 case float_round_down
:
5723 if (extractFloatx80Sign(z
)) {
5724 z
.low
+= roundBitsMask
;
5730 z
.low
&= ~ roundBitsMask
;
5733 z
.low
= UINT64_C(0x8000000000000000);
5735 if (z
.low
!= a
.low
) {
5736 float_raise(float_flag_inexact
, status
);
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of adding the absolute values of the extended double-
5744 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5745 | negated before being returned. `zSign' is ignored if the result is a NaN.
5746 | The addition is performed according to the IEC/IEEE Standard for Binary
5747 | Floating-Point Arithmetic.
5748 *----------------------------------------------------------------------------*/
5750 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5751 float_status
*status
)
5753 int32_t aExp
, bExp
, zExp
;
5754 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5757 aSig
= extractFloatx80Frac( a
);
5758 aExp
= extractFloatx80Exp( a
);
5759 bSig
= extractFloatx80Frac( b
);
5760 bExp
= extractFloatx80Exp( b
);
5761 expDiff
= aExp
- bExp
;
5762 if ( 0 < expDiff
) {
5763 if ( aExp
== 0x7FFF ) {
5764 if ((uint64_t)(aSig
<< 1)) {
5765 return propagateFloatx80NaN(a
, b
, status
);
5769 if ( bExp
== 0 ) --expDiff
;
5770 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5773 else if ( expDiff
< 0 ) {
5774 if ( bExp
== 0x7FFF ) {
5775 if ((uint64_t)(bSig
<< 1)) {
5776 return propagateFloatx80NaN(a
, b
, status
);
5778 return packFloatx80(zSign
,
5779 floatx80_infinity_high
,
5780 floatx80_infinity_low
);
5782 if ( aExp
== 0 ) ++expDiff
;
5783 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5787 if ( aExp
== 0x7FFF ) {
5788 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5789 return propagateFloatx80NaN(a
, b
, status
);
5794 zSig0
= aSig
+ bSig
;
5796 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5797 /* At least one of the values is a pseudo-denormal,
5798 * and there is a carry out of the result. */
5803 return packFloatx80(zSign
, 0, 0);
5805 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5811 zSig0
= aSig
+ bSig
;
5812 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5814 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5815 zSig0
|= UINT64_C(0x8000000000000000);
5818 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5819 zSign
, zExp
, zSig0
, zSig1
, status
);
5822 /*----------------------------------------------------------------------------
5823 | Returns the result of subtracting the absolute values of the extended
5824 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5825 | difference is negated before being returned. `zSign' is ignored if the
5826 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5827 | Standard for Binary Floating-Point Arithmetic.
5828 *----------------------------------------------------------------------------*/
5830 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5831 float_status
*status
)
5833 int32_t aExp
, bExp
, zExp
;
5834 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5837 aSig
= extractFloatx80Frac( a
);
5838 aExp
= extractFloatx80Exp( a
);
5839 bSig
= extractFloatx80Frac( b
);
5840 bExp
= extractFloatx80Exp( b
);
5841 expDiff
= aExp
- bExp
;
5842 if ( 0 < expDiff
) goto aExpBigger
;
5843 if ( expDiff
< 0 ) goto bExpBigger
;
5844 if ( aExp
== 0x7FFF ) {
5845 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5846 return propagateFloatx80NaN(a
, b
, status
);
5848 float_raise(float_flag_invalid
, status
);
5849 return floatx80_default_nan(status
);
5856 if ( bSig
< aSig
) goto aBigger
;
5857 if ( aSig
< bSig
) goto bBigger
;
5858 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5860 if ( bExp
== 0x7FFF ) {
5861 if ((uint64_t)(bSig
<< 1)) {
5862 return propagateFloatx80NaN(a
, b
, status
);
5864 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5865 floatx80_infinity_low
);
5867 if ( aExp
== 0 ) ++expDiff
;
5868 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5870 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5873 goto normalizeRoundAndPack
;
5875 if ( aExp
== 0x7FFF ) {
5876 if ((uint64_t)(aSig
<< 1)) {
5877 return propagateFloatx80NaN(a
, b
, status
);
5881 if ( bExp
== 0 ) --expDiff
;
5882 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5884 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5886 normalizeRoundAndPack
:
5887 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5888 zSign
, zExp
, zSig0
, zSig1
, status
);
5891 /*----------------------------------------------------------------------------
5892 | Returns the result of adding the extended double-precision floating-point
5893 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5894 | Standard for Binary Floating-Point Arithmetic.
5895 *----------------------------------------------------------------------------*/
5897 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5901 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5902 float_raise(float_flag_invalid
, status
);
5903 return floatx80_default_nan(status
);
5905 aSign
= extractFloatx80Sign( a
);
5906 bSign
= extractFloatx80Sign( b
);
5907 if ( aSign
== bSign
) {
5908 return addFloatx80Sigs(a
, b
, aSign
, status
);
5911 return subFloatx80Sigs(a
, b
, aSign
, status
);
5916 /*----------------------------------------------------------------------------
5917 | Returns the result of subtracting the extended double-precision floating-
5918 | point values `a' and `b'. The operation is performed according to the
5919 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5920 *----------------------------------------------------------------------------*/
5922 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5926 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5927 float_raise(float_flag_invalid
, status
);
5928 return floatx80_default_nan(status
);
5930 aSign
= extractFloatx80Sign( a
);
5931 bSign
= extractFloatx80Sign( b
);
5932 if ( aSign
== bSign
) {
5933 return subFloatx80Sigs(a
, b
, aSign
, status
);
5936 return addFloatx80Sigs(a
, b
, aSign
, status
);
5941 /*----------------------------------------------------------------------------
5942 | Returns the result of multiplying the extended double-precision floating-
5943 | point values `a' and `b'. The operation is performed according to the
5944 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5945 *----------------------------------------------------------------------------*/
5947 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5949 bool aSign
, bSign
, zSign
;
5950 int32_t aExp
, bExp
, zExp
;
5951 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5953 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5954 float_raise(float_flag_invalid
, status
);
5955 return floatx80_default_nan(status
);
5957 aSig
= extractFloatx80Frac( a
);
5958 aExp
= extractFloatx80Exp( a
);
5959 aSign
= extractFloatx80Sign( a
);
5960 bSig
= extractFloatx80Frac( b
);
5961 bExp
= extractFloatx80Exp( b
);
5962 bSign
= extractFloatx80Sign( b
);
5963 zSign
= aSign
^ bSign
;
5964 if ( aExp
== 0x7FFF ) {
5965 if ( (uint64_t) ( aSig
<<1 )
5966 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5967 return propagateFloatx80NaN(a
, b
, status
);
5969 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5970 return packFloatx80(zSign
, floatx80_infinity_high
,
5971 floatx80_infinity_low
);
5973 if ( bExp
== 0x7FFF ) {
5974 if ((uint64_t)(bSig
<< 1)) {
5975 return propagateFloatx80NaN(a
, b
, status
);
5977 if ( ( aExp
| aSig
) == 0 ) {
5979 float_raise(float_flag_invalid
, status
);
5980 return floatx80_default_nan(status
);
5982 return packFloatx80(zSign
, floatx80_infinity_high
,
5983 floatx80_infinity_low
);
5986 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5987 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5990 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5991 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5993 zExp
= aExp
+ bExp
- 0x3FFE;
5994 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5995 if ( 0 < (int64_t) zSig0
) {
5996 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5999 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6000 zSign
, zExp
, zSig0
, zSig1
, status
);
6003 /*----------------------------------------------------------------------------
6004 | Returns the result of dividing the extended double-precision floating-point
6005 | value `a' by the corresponding value `b'. The operation is performed
6006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6007 *----------------------------------------------------------------------------*/
6009 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6011 bool aSign
, bSign
, zSign
;
6012 int32_t aExp
, bExp
, zExp
;
6013 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6014 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6016 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6017 float_raise(float_flag_invalid
, status
);
6018 return floatx80_default_nan(status
);
6020 aSig
= extractFloatx80Frac( a
);
6021 aExp
= extractFloatx80Exp( a
);
6022 aSign
= extractFloatx80Sign( a
);
6023 bSig
= extractFloatx80Frac( b
);
6024 bExp
= extractFloatx80Exp( b
);
6025 bSign
= extractFloatx80Sign( b
);
6026 zSign
= aSign
^ bSign
;
6027 if ( aExp
== 0x7FFF ) {
6028 if ((uint64_t)(aSig
<< 1)) {
6029 return propagateFloatx80NaN(a
, b
, status
);
6031 if ( bExp
== 0x7FFF ) {
6032 if ((uint64_t)(bSig
<< 1)) {
6033 return propagateFloatx80NaN(a
, b
, status
);
6037 return packFloatx80(zSign
, floatx80_infinity_high
,
6038 floatx80_infinity_low
);
6040 if ( bExp
== 0x7FFF ) {
6041 if ((uint64_t)(bSig
<< 1)) {
6042 return propagateFloatx80NaN(a
, b
, status
);
6044 return packFloatx80( zSign
, 0, 0 );
6048 if ( ( aExp
| aSig
) == 0 ) {
6050 float_raise(float_flag_invalid
, status
);
6051 return floatx80_default_nan(status
);
6053 float_raise(float_flag_divbyzero
, status
);
6054 return packFloatx80(zSign
, floatx80_infinity_high
,
6055 floatx80_infinity_low
);
6057 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6060 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6061 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6063 zExp
= aExp
- bExp
+ 0x3FFE;
6065 if ( bSig
<= aSig
) {
6066 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6069 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6070 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6071 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6072 while ( (int64_t) rem0
< 0 ) {
6074 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6076 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6077 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6078 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6079 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6080 while ( (int64_t) rem1
< 0 ) {
6082 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6084 zSig1
|= ( ( rem1
| rem2
) != 0 );
6086 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6087 zSign
, zExp
, zSig0
, zSig1
, status
);
6090 /*----------------------------------------------------------------------------
6091 | Returns the remainder of the extended double-precision floating-point value
6092 | `a' with respect to the corresponding value `b'. The operation is performed
6093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6094 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6095 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6096 | the absolute value of the integer quotient.
6097 *----------------------------------------------------------------------------*/
6099 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6100 float_status
*status
)
6103 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6104 uint64_t aSig0
, aSig1
, bSig
;
6105 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6108 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6109 float_raise(float_flag_invalid
, status
);
6110 return floatx80_default_nan(status
);
6112 aSig0
= extractFloatx80Frac( a
);
6113 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6114 aSign
= extractFloatx80Sign( a
);
6115 bSig
= extractFloatx80Frac( b
);
6116 bExp
= extractFloatx80Exp( b
);
6117 if ( aExp
== 0x7FFF ) {
6118 if ( (uint64_t) ( aSig0
<<1 )
6119 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6120 return propagateFloatx80NaN(a
, b
, status
);
6124 if ( bExp
== 0x7FFF ) {
6125 if ((uint64_t)(bSig
<< 1)) {
6126 return propagateFloatx80NaN(a
, b
, status
);
6128 if (aExp
== 0 && aSig0
>> 63) {
6130 * Pseudo-denormal argument must be returned in normalized
6133 return packFloatx80(aSign
, 1, aSig0
);
6140 float_raise(float_flag_invalid
, status
);
6141 return floatx80_default_nan(status
);
6143 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6146 if ( aSig0
== 0 ) return a
;
6147 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6150 expDiff
= aExp
- bExp
;
6152 if ( expDiff
< 0 ) {
6153 if ( mod
|| expDiff
< -1 ) {
6154 if (aExp
== 1 && aExpOrig
== 0) {
6156 * Pseudo-denormal argument must be returned in
6159 return packFloatx80(aSign
, aExp
, aSig0
);
6163 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6166 *quotient
= q
= ( bSig
<= aSig0
);
6167 if ( q
) aSig0
-= bSig
;
6169 while ( 0 < expDiff
) {
6170 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6171 q
= ( 2 < q
) ? q
- 2 : 0;
6172 mul64To128( bSig
, q
, &term0
, &term1
);
6173 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6174 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6180 if ( 0 < expDiff
) {
6181 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6182 q
= ( 2 < q
) ? q
- 2 : 0;
6184 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6185 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6186 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6187 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6189 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6192 *quotient
<<= expDiff
;
6203 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6204 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6205 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6208 aSig0
= alternateASig0
;
6209 aSig1
= alternateASig1
;
6215 normalizeRoundAndPackFloatx80(
6216 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6220 /*----------------------------------------------------------------------------
6221 | Returns the remainder of the extended double-precision floating-point value
6222 | `a' with respect to the corresponding value `b'. The operation is performed
6223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6226 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6229 return floatx80_modrem(a
, b
, false, "ient
, status
);
6232 /*----------------------------------------------------------------------------
6233 | Returns the remainder of the extended double-precision floating-point value
6234 | `a' with respect to the corresponding value `b', with the quotient truncated
6236 *----------------------------------------------------------------------------*/
6238 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6241 return floatx80_modrem(a
, b
, true, "ient
, status
);
6244 /*----------------------------------------------------------------------------
6245 | Returns the square root of the extended double-precision floating-point
6246 | value `a'. The operation is performed according to the IEC/IEEE Standard
6247 | for Binary Floating-Point Arithmetic.
6248 *----------------------------------------------------------------------------*/
6250 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6254 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6255 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6257 if (floatx80_invalid_encoding(a
)) {
6258 float_raise(float_flag_invalid
, status
);
6259 return floatx80_default_nan(status
);
6261 aSig0
= extractFloatx80Frac( a
);
6262 aExp
= extractFloatx80Exp( a
);
6263 aSign
= extractFloatx80Sign( a
);
6264 if ( aExp
== 0x7FFF ) {
6265 if ((uint64_t)(aSig0
<< 1)) {
6266 return propagateFloatx80NaN(a
, a
, status
);
6268 if ( ! aSign
) return a
;
6272 if ( ( aExp
| aSig0
) == 0 ) return a
;
6274 float_raise(float_flag_invalid
, status
);
6275 return floatx80_default_nan(status
);
6278 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6279 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6281 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6282 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6283 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6284 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6285 doubleZSig0
= zSig0
<<1;
6286 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6287 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6288 while ( (int64_t) rem0
< 0 ) {
6291 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6293 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6294 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6295 if ( zSig1
== 0 ) zSig1
= 1;
6296 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6297 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6298 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6299 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6300 while ( (int64_t) rem1
< 0 ) {
6302 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6304 term2
|= doubleZSig0
;
6305 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6307 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6309 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6310 zSig0
|= doubleZSig0
;
6311 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6312 0, zExp
, zSig0
, zSig1
, status
);
6315 /*----------------------------------------------------------------------------
6316 | Returns the result of converting the quadruple-precision floating-point
6317 | value `a' to the 32-bit two's complement integer format. The conversion
6318 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6319 | Arithmetic---which means in particular that the conversion is rounded
6320 | according to the current rounding mode. If `a' is a NaN, the largest
6321 | positive integer is returned. Otherwise, if the conversion overflows, the
6322 | largest integer with the same sign as `a' is returned.
6323 *----------------------------------------------------------------------------*/
6325 int32_t float128_to_int32(float128 a
, float_status
*status
)
6328 int32_t aExp
, shiftCount
;
6329 uint64_t aSig0
, aSig1
;
6331 aSig1
= extractFloat128Frac1( a
);
6332 aSig0
= extractFloat128Frac0( a
);
6333 aExp
= extractFloat128Exp( a
);
6334 aSign
= extractFloat128Sign( a
);
6335 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6336 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6337 aSig0
|= ( aSig1
!= 0 );
6338 shiftCount
= 0x4028 - aExp
;
6339 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6340 return roundAndPackInt32(aSign
, aSig0
, status
);
6344 /*----------------------------------------------------------------------------
6345 | Returns the result of converting the quadruple-precision floating-point
6346 | value `a' to the 32-bit two's complement integer format. The conversion
6347 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6348 | Arithmetic, except that the conversion is always rounded toward zero. If
6349 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6350 | conversion overflows, the largest integer with the same sign as `a' is
6352 *----------------------------------------------------------------------------*/
6354 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6357 int32_t aExp
, shiftCount
;
6358 uint64_t aSig0
, aSig1
, savedASig
;
6361 aSig1
= extractFloat128Frac1( a
);
6362 aSig0
= extractFloat128Frac0( a
);
6363 aExp
= extractFloat128Exp( a
);
6364 aSign
= extractFloat128Sign( a
);
6365 aSig0
|= ( aSig1
!= 0 );
6366 if ( 0x401E < aExp
) {
6367 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6370 else if ( aExp
< 0x3FFF ) {
6371 if (aExp
|| aSig0
) {
6372 float_raise(float_flag_inexact
, status
);
6376 aSig0
|= UINT64_C(0x0001000000000000);
6377 shiftCount
= 0x402F - aExp
;
6379 aSig0
>>= shiftCount
;
6381 if ( aSign
) z
= - z
;
6382 if ( ( z
< 0 ) ^ aSign
) {
6384 float_raise(float_flag_invalid
, status
);
6385 return aSign
? INT32_MIN
: INT32_MAX
;
6387 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6388 float_raise(float_flag_inexact
, status
);
6394 /*----------------------------------------------------------------------------
6395 | Returns the result of converting the quadruple-precision floating-point
6396 | value `a' to the 64-bit two's complement integer format. The conversion
6397 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6398 | Arithmetic---which means in particular that the conversion is rounded
6399 | according to the current rounding mode. If `a' is a NaN, the largest
6400 | positive integer is returned. Otherwise, if the conversion overflows, the
6401 | largest integer with the same sign as `a' is returned.
6402 *----------------------------------------------------------------------------*/
6404 int64_t float128_to_int64(float128 a
, float_status
*status
)
6407 int32_t aExp
, shiftCount
;
6408 uint64_t aSig0
, aSig1
;
6410 aSig1
= extractFloat128Frac1( a
);
6411 aSig0
= extractFloat128Frac0( a
);
6412 aExp
= extractFloat128Exp( a
);
6413 aSign
= extractFloat128Sign( a
);
6414 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6415 shiftCount
= 0x402F - aExp
;
6416 if ( shiftCount
<= 0 ) {
6417 if ( 0x403E < aExp
) {
6418 float_raise(float_flag_invalid
, status
);
6420 || ( ( aExp
== 0x7FFF )
6421 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6428 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6431 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6433 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6437 /*----------------------------------------------------------------------------
6438 | Returns the result of converting the quadruple-precision floating-point
6439 | value `a' to the 64-bit two's complement integer format. The conversion
6440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6441 | Arithmetic, except that the conversion is always rounded toward zero.
6442 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6443 | the conversion overflows, the largest integer with the same sign as `a' is
6445 *----------------------------------------------------------------------------*/
6447 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6450 int32_t aExp
, shiftCount
;
6451 uint64_t aSig0
, aSig1
;
6454 aSig1
= extractFloat128Frac1( a
);
6455 aSig0
= extractFloat128Frac0( a
);
6456 aExp
= extractFloat128Exp( a
);
6457 aSign
= extractFloat128Sign( a
);
6458 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6459 shiftCount
= aExp
- 0x402F;
6460 if ( 0 < shiftCount
) {
6461 if ( 0x403E <= aExp
) {
6462 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6463 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6464 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6466 float_raise(float_flag_inexact
, status
);
6470 float_raise(float_flag_invalid
, status
);
6471 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6477 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6478 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6479 float_raise(float_flag_inexact
, status
);
6483 if ( aExp
< 0x3FFF ) {
6484 if ( aExp
| aSig0
| aSig1
) {
6485 float_raise(float_flag_inexact
, status
);
6489 z
= aSig0
>>( - shiftCount
);
6491 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6492 float_raise(float_flag_inexact
, status
);
6495 if ( aSign
) z
= - z
;
6500 /*----------------------------------------------------------------------------
6501 | Returns the result of converting the quadruple-precision floating-point value
6502 | `a' to the 64-bit unsigned integer format. The conversion is
6503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6504 | Arithmetic---which means in particular that the conversion is rounded
6505 | according to the current rounding mode. If `a' is a NaN, the largest
6506 | positive integer is returned. If the conversion overflows, the
6507 | largest unsigned integer is returned. If 'a' is negative, the value is
6508 | rounded and zero is returned; negative values that do not round to zero
6509 | will raise the inexact exception.
6510 *----------------------------------------------------------------------------*/
6512 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6517 uint64_t aSig0
, aSig1
;
6519 aSig0
= extractFloat128Frac0(a
);
6520 aSig1
= extractFloat128Frac1(a
);
6521 aExp
= extractFloat128Exp(a
);
6522 aSign
= extractFloat128Sign(a
);
6523 if (aSign
&& (aExp
> 0x3FFE)) {
6524 float_raise(float_flag_invalid
, status
);
6525 if (float128_is_any_nan(a
)) {
6532 aSig0
|= UINT64_C(0x0001000000000000);
6534 shiftCount
= 0x402F - aExp
;
6535 if (shiftCount
<= 0) {
6536 if (0x403E < aExp
) {
6537 float_raise(float_flag_invalid
, status
);
6540 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6542 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6544 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6547 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6550 signed char current_rounding_mode
= status
->float_rounding_mode
;
6552 set_float_rounding_mode(float_round_to_zero
, status
);
6553 v
= float128_to_uint64(a
, status
);
6554 set_float_rounding_mode(current_rounding_mode
, status
);
6559 /*----------------------------------------------------------------------------
6560 | Returns the result of converting the quadruple-precision floating-point
6561 | value `a' to the 32-bit unsigned integer format. The conversion
6562 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6563 | Arithmetic except that the conversion is always rounded toward zero.
6564 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6565 | if the conversion overflows, the largest unsigned integer is returned.
6566 | If 'a' is negative, the value is rounded and zero is returned; negative
6567 | values that do not round to zero will raise the inexact exception.
6568 *----------------------------------------------------------------------------*/
6570 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6574 int old_exc_flags
= get_float_exception_flags(status
);
6576 v
= float128_to_uint64_round_to_zero(a
, status
);
6577 if (v
> 0xffffffff) {
6582 set_float_exception_flags(old_exc_flags
, status
);
6583 float_raise(float_flag_invalid
, status
);
6587 /*----------------------------------------------------------------------------
6588 | Returns the result of converting the quadruple-precision floating-point value
6589 | `a' to the 32-bit unsigned integer format. The conversion is
6590 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6591 | Arithmetic---which means in particular that the conversion is rounded
6592 | according to the current rounding mode. If `a' is a NaN, the largest
6593 | positive integer is returned. If the conversion overflows, the
6594 | largest unsigned integer is returned. If 'a' is negative, the value is
6595 | rounded and zero is returned; negative values that do not round to zero
6596 | will raise the inexact exception.
6597 *----------------------------------------------------------------------------*/
6599 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6603 int old_exc_flags
= get_float_exception_flags(status
);
6605 v
= float128_to_uint64(a
, status
);
6606 if (v
> 0xffffffff) {
6611 set_float_exception_flags(old_exc_flags
, status
);
6612 float_raise(float_flag_invalid
, status
);
6616 /*----------------------------------------------------------------------------
6617 | Returns the result of converting the quadruple-precision floating-point
6618 | value `a' to the single-precision floating-point format. The conversion
6619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6621 *----------------------------------------------------------------------------*/
6623 float32
float128_to_float32(float128 a
, float_status
*status
)
6627 uint64_t aSig0
, aSig1
;
6630 aSig1
= extractFloat128Frac1( a
);
6631 aSig0
= extractFloat128Frac0( a
);
6632 aExp
= extractFloat128Exp( a
);
6633 aSign
= extractFloat128Sign( a
);
6634 if ( aExp
== 0x7FFF ) {
6635 if ( aSig0
| aSig1
) {
6636 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6638 return packFloat32( aSign
, 0xFF, 0 );
6640 aSig0
|= ( aSig1
!= 0 );
6641 shift64RightJamming( aSig0
, 18, &aSig0
);
6643 if ( aExp
|| zSig
) {
6647 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6651 /*----------------------------------------------------------------------------
6652 | Returns the result of converting the quadruple-precision floating-point
6653 | value `a' to the double-precision floating-point format. The conversion
6654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6656 *----------------------------------------------------------------------------*/
6658 float64
float128_to_float64(float128 a
, float_status
*status
)
6662 uint64_t aSig0
, aSig1
;
6664 aSig1
= extractFloat128Frac1( a
);
6665 aSig0
= extractFloat128Frac0( a
);
6666 aExp
= extractFloat128Exp( a
);
6667 aSign
= extractFloat128Sign( a
);
6668 if ( aExp
== 0x7FFF ) {
6669 if ( aSig0
| aSig1
) {
6670 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6672 return packFloat64( aSign
, 0x7FF, 0 );
6674 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6675 aSig0
|= ( aSig1
!= 0 );
6676 if ( aExp
|| aSig0
) {
6677 aSig0
|= UINT64_C(0x4000000000000000);
6680 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6684 /*----------------------------------------------------------------------------
6685 | Returns the result of converting the quadruple-precision floating-point
6686 | value `a' to the extended double-precision floating-point format. The
6687 | conversion is performed according to the IEC/IEEE Standard for Binary
6688 | Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6691 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6695 uint64_t aSig0
, aSig1
;
6697 aSig1
= extractFloat128Frac1( a
);
6698 aSig0
= extractFloat128Frac0( a
);
6699 aExp
= extractFloat128Exp( a
);
6700 aSign
= extractFloat128Sign( a
);
6701 if ( aExp
== 0x7FFF ) {
6702 if ( aSig0
| aSig1
) {
6703 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6705 return floatx80_silence_nan(res
, status
);
6707 return packFloatx80(aSign
, floatx80_infinity_high
,
6708 floatx80_infinity_low
);
6711 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6712 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6715 aSig0
|= UINT64_C(0x0001000000000000);
6717 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6718 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6722 /*----------------------------------------------------------------------------
6723 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6724 | returns the result as a quadruple-precision floating-point value. The
6725 | operation is performed according to the IEC/IEEE Standard for Binary
6726 | Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6729 float128
float128_round_to_int(float128 a
, float_status
*status
)
6733 uint64_t lastBitMask
, roundBitsMask
;
6736 aExp
= extractFloat128Exp( a
);
6737 if ( 0x402F <= aExp
) {
6738 if ( 0x406F <= aExp
) {
6739 if ( ( aExp
== 0x7FFF )
6740 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6742 return propagateFloat128NaN(a
, a
, status
);
6747 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6748 roundBitsMask
= lastBitMask
- 1;
6750 switch (status
->float_rounding_mode
) {
6751 case float_round_nearest_even
:
6752 if ( lastBitMask
) {
6753 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6754 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6757 if ( (int64_t) z
.low
< 0 ) {
6759 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6763 case float_round_ties_away
:
6765 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6767 if ((int64_t) z
.low
< 0) {
6772 case float_round_to_zero
:
6774 case float_round_up
:
6775 if (!extractFloat128Sign(z
)) {
6776 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6779 case float_round_down
:
6780 if (extractFloat128Sign(z
)) {
6781 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6784 case float_round_to_odd
:
6786 * Note that if lastBitMask == 0, the last bit is the lsb
6787 * of high, and roundBitsMask == -1.
6789 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6790 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6796 z
.low
&= ~ roundBitsMask
;
6799 if ( aExp
< 0x3FFF ) {
6800 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6801 float_raise(float_flag_inexact
, status
);
6802 aSign
= extractFloat128Sign( a
);
6803 switch (status
->float_rounding_mode
) {
6804 case float_round_nearest_even
:
6805 if ( ( aExp
== 0x3FFE )
6806 && ( extractFloat128Frac0( a
)
6807 | extractFloat128Frac1( a
) )
6809 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6812 case float_round_ties_away
:
6813 if (aExp
== 0x3FFE) {
6814 return packFloat128(aSign
, 0x3FFF, 0, 0);
6817 case float_round_down
:
6819 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6820 : packFloat128( 0, 0, 0, 0 );
6821 case float_round_up
:
6823 aSign
? packFloat128( 1, 0, 0, 0 )
6824 : packFloat128( 0, 0x3FFF, 0, 0 );
6826 case float_round_to_odd
:
6827 return packFloat128(aSign
, 0x3FFF, 0, 0);
6829 case float_round_to_zero
:
6832 return packFloat128( aSign
, 0, 0, 0 );
6835 lastBitMask
<<= 0x402F - aExp
;
6836 roundBitsMask
= lastBitMask
- 1;
6839 switch (status
->float_rounding_mode
) {
6840 case float_round_nearest_even
:
6841 z
.high
+= lastBitMask
>>1;
6842 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6843 z
.high
&= ~ lastBitMask
;
6846 case float_round_ties_away
:
6847 z
.high
+= lastBitMask
>>1;
6849 case float_round_to_zero
:
6851 case float_round_up
:
6852 if (!extractFloat128Sign(z
)) {
6853 z
.high
|= ( a
.low
!= 0 );
6854 z
.high
+= roundBitsMask
;
6857 case float_round_down
:
6858 if (extractFloat128Sign(z
)) {
6859 z
.high
|= (a
.low
!= 0);
6860 z
.high
+= roundBitsMask
;
6863 case float_round_to_odd
:
6864 if ((z
.high
& lastBitMask
) == 0) {
6865 z
.high
|= (a
.low
!= 0);
6866 z
.high
+= roundBitsMask
;
6872 z
.high
&= ~ roundBitsMask
;
6874 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6875 float_raise(float_flag_inexact
, status
);
6881 /*----------------------------------------------------------------------------
6882 | Returns the result of adding the absolute values of the quadruple-precision
6883 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6884 | before being returned. `zSign' is ignored if the result is a NaN.
6885 | The addition is performed according to the IEC/IEEE Standard for Binary
6886 | Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6889 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6890 float_status
*status
)
6892 int32_t aExp
, bExp
, zExp
;
6893 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6896 aSig1
= extractFloat128Frac1( a
);
6897 aSig0
= extractFloat128Frac0( a
);
6898 aExp
= extractFloat128Exp( a
);
6899 bSig1
= extractFloat128Frac1( b
);
6900 bSig0
= extractFloat128Frac0( b
);
6901 bExp
= extractFloat128Exp( b
);
6902 expDiff
= aExp
- bExp
;
6903 if ( 0 < expDiff
) {
6904 if ( aExp
== 0x7FFF ) {
6905 if (aSig0
| aSig1
) {
6906 return propagateFloat128NaN(a
, b
, status
);
6914 bSig0
|= UINT64_C(0x0001000000000000);
6916 shift128ExtraRightJamming(
6917 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6920 else if ( expDiff
< 0 ) {
6921 if ( bExp
== 0x7FFF ) {
6922 if (bSig0
| bSig1
) {
6923 return propagateFloat128NaN(a
, b
, status
);
6925 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6931 aSig0
|= UINT64_C(0x0001000000000000);
6933 shift128ExtraRightJamming(
6934 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6938 if ( aExp
== 0x7FFF ) {
6939 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6940 return propagateFloat128NaN(a
, b
, status
);
6944 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6946 if (status
->flush_to_zero
) {
6947 if (zSig0
| zSig1
) {
6948 float_raise(float_flag_output_denormal
, status
);
6950 return packFloat128(zSign
, 0, 0, 0);
6952 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6955 zSig0
|= UINT64_C(0x0002000000000000);
6959 aSig0
|= UINT64_C(0x0001000000000000);
6960 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6962 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6965 shift128ExtraRightJamming(
6966 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6968 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6972 /*----------------------------------------------------------------------------
6973 | Returns the result of subtracting the absolute values of the quadruple-
6974 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6975 | difference is negated before being returned. `zSign' is ignored if the
6976 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6977 | Standard for Binary Floating-Point Arithmetic.
6978 *----------------------------------------------------------------------------*/
6980 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6981 float_status
*status
)
6983 int32_t aExp
, bExp
, zExp
;
6984 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6987 aSig1
= extractFloat128Frac1( a
);
6988 aSig0
= extractFloat128Frac0( a
);
6989 aExp
= extractFloat128Exp( a
);
6990 bSig1
= extractFloat128Frac1( b
);
6991 bSig0
= extractFloat128Frac0( b
);
6992 bExp
= extractFloat128Exp( b
);
6993 expDiff
= aExp
- bExp
;
6994 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6995 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6996 if ( 0 < expDiff
) goto aExpBigger
;
6997 if ( expDiff
< 0 ) goto bExpBigger
;
6998 if ( aExp
== 0x7FFF ) {
6999 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7000 return propagateFloat128NaN(a
, b
, status
);
7002 float_raise(float_flag_invalid
, status
);
7003 return float128_default_nan(status
);
7009 if ( bSig0
< aSig0
) goto aBigger
;
7010 if ( aSig0
< bSig0
) goto bBigger
;
7011 if ( bSig1
< aSig1
) goto aBigger
;
7012 if ( aSig1
< bSig1
) goto bBigger
;
7013 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7016 if ( bExp
== 0x7FFF ) {
7017 if (bSig0
| bSig1
) {
7018 return propagateFloat128NaN(a
, b
, status
);
7020 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7026 aSig0
|= UINT64_C(0x4000000000000000);
7028 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7029 bSig0
|= UINT64_C(0x4000000000000000);
7031 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7034 goto normalizeRoundAndPack
;
7036 if ( aExp
== 0x7FFF ) {
7037 if (aSig0
| aSig1
) {
7038 return propagateFloat128NaN(a
, b
, status
);
7046 bSig0
|= UINT64_C(0x4000000000000000);
7048 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7049 aSig0
|= UINT64_C(0x4000000000000000);
7051 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7053 normalizeRoundAndPack
:
7055 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7060 /*----------------------------------------------------------------------------
7061 | Returns the result of adding the quadruple-precision floating-point values
7062 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7063 | for Binary Floating-Point Arithmetic.
7064 *----------------------------------------------------------------------------*/
7066 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7070 aSign
= extractFloat128Sign( a
);
7071 bSign
= extractFloat128Sign( b
);
7072 if ( aSign
== bSign
) {
7073 return addFloat128Sigs(a
, b
, aSign
, status
);
7076 return subFloat128Sigs(a
, b
, aSign
, status
);
7081 /*----------------------------------------------------------------------------
7082 | Returns the result of subtracting the quadruple-precision floating-point
7083 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7084 | Standard for Binary Floating-Point Arithmetic.
7085 *----------------------------------------------------------------------------*/
7087 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7091 aSign
= extractFloat128Sign( a
);
7092 bSign
= extractFloat128Sign( b
);
7093 if ( aSign
== bSign
) {
7094 return subFloat128Sigs(a
, b
, aSign
, status
);
7097 return addFloat128Sigs(a
, b
, aSign
, status
);
7102 /*----------------------------------------------------------------------------
7103 | Returns the result of multiplying the quadruple-precision floating-point
7104 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7105 | Standard for Binary Floating-Point Arithmetic.
7106 *----------------------------------------------------------------------------*/
7108 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7110 bool aSign
, bSign
, zSign
;
7111 int32_t aExp
, bExp
, zExp
;
7112 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7114 aSig1
= extractFloat128Frac1( a
);
7115 aSig0
= extractFloat128Frac0( a
);
7116 aExp
= extractFloat128Exp( a
);
7117 aSign
= extractFloat128Sign( a
);
7118 bSig1
= extractFloat128Frac1( b
);
7119 bSig0
= extractFloat128Frac0( b
);
7120 bExp
= extractFloat128Exp( b
);
7121 bSign
= extractFloat128Sign( b
);
7122 zSign
= aSign
^ bSign
;
7123 if ( aExp
== 0x7FFF ) {
7124 if ( ( aSig0
| aSig1
)
7125 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7126 return propagateFloat128NaN(a
, b
, status
);
7128 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7129 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7131 if ( bExp
== 0x7FFF ) {
7132 if (bSig0
| bSig1
) {
7133 return propagateFloat128NaN(a
, b
, status
);
7135 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7137 float_raise(float_flag_invalid
, status
);
7138 return float128_default_nan(status
);
7140 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7143 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7144 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7147 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7148 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7150 zExp
= aExp
+ bExp
- 0x4000;
7151 aSig0
|= UINT64_C(0x0001000000000000);
7152 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7153 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7154 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7155 zSig2
|= ( zSig3
!= 0 );
7156 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7157 shift128ExtraRightJamming(
7158 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7161 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7165 /*----------------------------------------------------------------------------
7166 | Returns the result of dividing the quadruple-precision floating-point value
7167 | `a' by the corresponding value `b'. The operation is performed according to
7168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7169 *----------------------------------------------------------------------------*/
7171 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7173 bool aSign
, bSign
, zSign
;
7174 int32_t aExp
, bExp
, zExp
;
7175 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7176 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7178 aSig1
= extractFloat128Frac1( a
);
7179 aSig0
= extractFloat128Frac0( a
);
7180 aExp
= extractFloat128Exp( a
);
7181 aSign
= extractFloat128Sign( a
);
7182 bSig1
= extractFloat128Frac1( b
);
7183 bSig0
= extractFloat128Frac0( b
);
7184 bExp
= extractFloat128Exp( b
);
7185 bSign
= extractFloat128Sign( b
);
7186 zSign
= aSign
^ bSign
;
7187 if ( aExp
== 0x7FFF ) {
7188 if (aSig0
| aSig1
) {
7189 return propagateFloat128NaN(a
, b
, status
);
7191 if ( bExp
== 0x7FFF ) {
7192 if (bSig0
| bSig1
) {
7193 return propagateFloat128NaN(a
, b
, status
);
7197 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7199 if ( bExp
== 0x7FFF ) {
7200 if (bSig0
| bSig1
) {
7201 return propagateFloat128NaN(a
, b
, status
);
7203 return packFloat128( zSign
, 0, 0, 0 );
7206 if ( ( bSig0
| bSig1
) == 0 ) {
7207 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7209 float_raise(float_flag_invalid
, status
);
7210 return float128_default_nan(status
);
7212 float_raise(float_flag_divbyzero
, status
);
7213 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7215 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7218 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7219 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7221 zExp
= aExp
- bExp
+ 0x3FFD;
7223 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7225 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7226 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7227 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7230 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7231 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7232 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7233 while ( (int64_t) rem0
< 0 ) {
7235 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7237 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7238 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7239 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7240 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7241 while ( (int64_t) rem1
< 0 ) {
7243 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7245 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7247 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7248 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7252 /*----------------------------------------------------------------------------
7253 | Returns the remainder of the quadruple-precision floating-point value `a'
7254 | with respect to the corresponding value `b'. The operation is performed
7255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7256 *----------------------------------------------------------------------------*/
7258 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7261 int32_t aExp
, bExp
, expDiff
;
7262 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7263 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7266 aSig1
= extractFloat128Frac1( a
);
7267 aSig0
= extractFloat128Frac0( a
);
7268 aExp
= extractFloat128Exp( a
);
7269 aSign
= extractFloat128Sign( a
);
7270 bSig1
= extractFloat128Frac1( b
);
7271 bSig0
= extractFloat128Frac0( b
);
7272 bExp
= extractFloat128Exp( b
);
7273 if ( aExp
== 0x7FFF ) {
7274 if ( ( aSig0
| aSig1
)
7275 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7276 return propagateFloat128NaN(a
, b
, status
);
7280 if ( bExp
== 0x7FFF ) {
7281 if (bSig0
| bSig1
) {
7282 return propagateFloat128NaN(a
, b
, status
);
7287 if ( ( bSig0
| bSig1
) == 0 ) {
7289 float_raise(float_flag_invalid
, status
);
7290 return float128_default_nan(status
);
7292 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7295 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7296 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7298 expDiff
= aExp
- bExp
;
7299 if ( expDiff
< -1 ) return a
;
7301 aSig0
| UINT64_C(0x0001000000000000),
7303 15 - ( expDiff
< 0 ),
7308 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7309 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7310 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7312 while ( 0 < expDiff
) {
7313 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7314 q
= ( 4 < q
) ? q
- 4 : 0;
7315 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7316 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7317 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7318 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7321 if ( -64 < expDiff
) {
7322 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7323 q
= ( 4 < q
) ? q
- 4 : 0;
7325 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7327 if ( expDiff
< 0 ) {
7328 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7331 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7333 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7334 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7337 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7338 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7341 alternateASig0
= aSig0
;
7342 alternateASig1
= aSig1
;
7344 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7345 } while ( 0 <= (int64_t) aSig0
);
7347 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7348 if ( ( sigMean0
< 0 )
7349 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7350 aSig0
= alternateASig0
;
7351 aSig1
= alternateASig1
;
7353 zSign
= ( (int64_t) aSig0
< 0 );
7354 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7355 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7359 /*----------------------------------------------------------------------------
7360 | Returns the square root of the quadruple-precision floating-point value `a'.
7361 | The operation is performed according to the IEC/IEEE Standard for Binary
7362 | Floating-Point Arithmetic.
7363 *----------------------------------------------------------------------------*/
7365 float128
float128_sqrt(float128 a
, float_status
*status
)
7369 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7370 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7372 aSig1
= extractFloat128Frac1( a
);
7373 aSig0
= extractFloat128Frac0( a
);
7374 aExp
= extractFloat128Exp( a
);
7375 aSign
= extractFloat128Sign( a
);
7376 if ( aExp
== 0x7FFF ) {
7377 if (aSig0
| aSig1
) {
7378 return propagateFloat128NaN(a
, a
, status
);
7380 if ( ! aSign
) return a
;
7384 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7386 float_raise(float_flag_invalid
, status
);
7387 return float128_default_nan(status
);
7390 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7391 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7393 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7394 aSig0
|= UINT64_C(0x0001000000000000);
7395 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7396 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7397 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7398 doubleZSig0
= zSig0
<<1;
7399 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7400 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7401 while ( (int64_t) rem0
< 0 ) {
7404 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7406 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7407 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7408 if ( zSig1
== 0 ) zSig1
= 1;
7409 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7410 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7411 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7412 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7413 while ( (int64_t) rem1
< 0 ) {
7415 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7417 term2
|= doubleZSig0
;
7418 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7420 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7422 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7423 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7427 static inline FloatRelation
7428 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7429 float_status
*status
)
7433 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7434 float_raise(float_flag_invalid
, status
);
7435 return float_relation_unordered
;
7437 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7438 ( extractFloatx80Frac( a
)<<1 ) ) ||
7439 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7440 ( extractFloatx80Frac( b
)<<1 ) )) {
7442 floatx80_is_signaling_nan(a
, status
) ||
7443 floatx80_is_signaling_nan(b
, status
)) {
7444 float_raise(float_flag_invalid
, status
);
7446 return float_relation_unordered
;
7448 aSign
= extractFloatx80Sign( a
);
7449 bSign
= extractFloatx80Sign( b
);
7450 if ( aSign
!= bSign
) {
7452 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7453 ( ( a
.low
| b
.low
) == 0 ) ) {
7455 return float_relation_equal
;
7457 return 1 - (2 * aSign
);
7460 /* Normalize pseudo-denormals before comparison. */
7461 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7464 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7467 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7468 return float_relation_equal
;
7470 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7475 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7477 return floatx80_compare_internal(a
, b
, 0, status
);
7480 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7481 float_status
*status
)
7483 return floatx80_compare_internal(a
, b
, 1, status
);
7486 static inline FloatRelation
7487 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7488 float_status
*status
)
7492 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7493 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7494 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7495 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7497 float128_is_signaling_nan(a
, status
) ||
7498 float128_is_signaling_nan(b
, status
)) {
7499 float_raise(float_flag_invalid
, status
);
7501 return float_relation_unordered
;
7503 aSign
= extractFloat128Sign( a
);
7504 bSign
= extractFloat128Sign( b
);
7505 if ( aSign
!= bSign
) {
7506 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7508 return float_relation_equal
;
7510 return 1 - (2 * aSign
);
7513 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7514 return float_relation_equal
;
7516 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7521 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7523 return float128_compare_internal(a
, b
, 0, status
);
7526 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7527 float_status
*status
)
7529 return float128_compare_internal(a
, b
, 1, status
);
7532 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7538 if (floatx80_invalid_encoding(a
)) {
7539 float_raise(float_flag_invalid
, status
);
7540 return floatx80_default_nan(status
);
7542 aSig
= extractFloatx80Frac( a
);
7543 aExp
= extractFloatx80Exp( a
);
7544 aSign
= extractFloatx80Sign( a
);
7546 if ( aExp
== 0x7FFF ) {
7548 return propagateFloatx80NaN(a
, a
, status
);
7562 } else if (n
< -0x10000) {
7567 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7568 aSign
, aExp
, aSig
, 0, status
);
7571 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7575 uint64_t aSig0
, aSig1
;
7577 aSig1
= extractFloat128Frac1( a
);
7578 aSig0
= extractFloat128Frac0( a
);
7579 aExp
= extractFloat128Exp( a
);
7580 aSign
= extractFloat128Sign( a
);
7581 if ( aExp
== 0x7FFF ) {
7582 if ( aSig0
| aSig1
) {
7583 return propagateFloat128NaN(a
, a
, status
);
7588 aSig0
|= UINT64_C(0x0001000000000000);
7589 } else if (aSig0
== 0 && aSig1
== 0) {
7597 } else if (n
< -0x10000) {
7602 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7607 static void __attribute__((constructor
)) softfloat_init(void)
7609 union_float64 ua
, ub
, uc
, ur
;
7611 if (QEMU_NO_HARDFLOAT
) {
7615 * Test that the host's FMA is not obviously broken. For example,
7616 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7617 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7619 ua
.s
= 0x0020000000000001ULL
;
7620 ub
.s
= 0x3ca0000000000000ULL
;
7621 uc
.s
= 0x0020000000000000ULL
;
7622 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7623 if (ur
.s
!= 0x0020000000000001ULL
) {
7624 force_soft_fma
= true;