/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser. This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704. Funding was partially provided by the
National Science Foundation under grant MIP-9311980. The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */
#include "qemu/osdep.h"
#include <math.h>
#include "qemu/bitops.h"
#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
#include "fpu/softfloat-macros.h"

/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is not fast nor pleasant to work with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

#define GEN_INPUT_FLUSH2(name, soft_t)                             \
    static inline void name(soft_t *a, soft_t *b, float_status *s) \
    {                                                              \
        if (likely(!s->flush_inputs_to_zero)) {                    \
            return;                                                \
        }                                                          \
        soft_t ## _input_flush__nocheck(a, s);                     \
        soft_t ## _input_flush__nocheck(b, s);                     \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

#define GEN_INPUT_FLUSH3(name, soft_t)                                        \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                         \
        if (likely(!s->flush_inputs_to_zero)) {                               \
            return;                                                           \
        }                                                                     \
        soft_t ## _input_flush__nocheck(a, s);                                \
        soft_t ## _input_flush__nocheck(b, s);                                \
        soft_t ## _input_flush__nocheck(c, s);                                \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

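/*
 * For example: once a guest raises the inexact flag (say, by computing
 * 1.0 / 3.0), that flag typically stays set, so under the default
 * round-to-nearest-even mode can_use_fpu() keeps returning true and the
 * operations below may take the hardfloat fast path.
 */
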
/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero   = float_cmask(float_class_zero),
    float_cmask_normal = float_cmask(float_class_normal),
    float_cmask_inf    = float_cmask(float_class_inf),
    float_cmask_qnan   = float_cmask(float_class_qnan),
    float_cmask_snan   = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};

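/*
 * The cmask values let a set of operand classes be described by a single
 * integer. For example, the muladd helpers declared below expect masks of
 * the form
 *
 *     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 *
 * so that one test such as (ab_mask & float_cmask_anynan) covers "either
 * operand is a NaN of any kind".
 */
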
/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation. In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

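/*
 * With the binary point at bit 63, a canonical normal number keeps its
 * implicit integer bit in the msb of the (most significant) fraction word.
 * For example, 1.5 is held as cls = float_class_normal, exp = 0 and
 * frac = 0xC000000000000000 (binary 1.1 followed by zeros), regardless of
 * the source format it was unpacked from.
 */
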
/* Structure holding all of the relevant parameters for a format.
 * exp_size: the size of the exponent field
 * exp_bias: the offset applied to the exponent field
 * exp_max: the maximum normalised exponent
 * frac_size: the size of the fraction field
 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 * frac_lsb: least significant bit of fraction
 * frac_lsbm1: the bit below the least significant bit (for rounding)
 * round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 * arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size = E,                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                 \
    .exp_max = (1 << E) - 1,                         \
    .frac_size = F,                                  \
    .frac_shift = (-F - 1) & 63,                     \
    .frac_lsb = 1ull << ((-F - 1) & 63),             \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),           \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

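/*
 * For instance, FLOAT_PARAMS(8, 23) (used by float32_params below) expands to:
 *   exp_size = 8, exp_bias = 127, exp_max = 255, frac_size = 23,
 *   frac_shift = (-23 - 1) & 63 = 40, so that the 23 fraction bits, once
 *   shifted left, sit directly below DECOMPOSED_BINARY_POINT,
 *   frac_lsb = 1ull << 40, frac_lsbm1 = 1ull << 39,
 *   round_mask = (1ull << 40) - 1, roundeven_mask = (2ull << 40) - 1.
 */
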
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

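/*
 * For example, unpacking the float32 encoding of pi, 0x40490FDB, with
 * f_size = 23 and e_size = 8 yields sign = 0, exp = 0x80 = 128 (still
 * biased at this point) and frac = 0x490FDB; parts_canonicalize() below is
 * what turns these raw fields into the normalized representation.
 */
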
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize. */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

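/*
 * Packing is the exact inverse: a FloatParts64 with sign = 0, (re-biased)
 * exp = 0x3FF and frac = 0 reassembles via pack_raw64() into
 * 0x3FF0000000000000, the float64 encoding of 1.0. parts_uncanon() is
 * responsible for producing such re-biased, shifted-down fields.
 */
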
/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

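/*
 * QEMU_GENERIC dispatches on the static type of its first argument, so
 * e.g. parts_default_nan(&p, s) resolves at compile time to
 * parts64_default_nan() when p is a FloatParts64 and to
 * parts128_default_nan() when it is a FloatParts128.
 */
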
#define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
                                        int scale, int frac_size);
static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
                                         int scale, int frac_size);

#define parts_round_to_int_normal(A, R, C, F) \
    PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)

static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt);
static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
                                  int scale, float_status *s,
                                  const FloatFmt *fmt);

#define parts_round_to_int(A, R, C, S, F) \
    PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)

static int64_t parts64_float_to_sint(FloatParts64 *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s);
static int64_t parts128_float_to_sint(FloatParts128 *p, FloatRoundMode rmode,
                                      int scale, int64_t min, int64_t max,
                                      float_status *s);

#define parts_float_to_sint(P, R, Z, MN, MX, S) \
    PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)

static uint64_t parts64_float_to_uint(FloatParts64 *p, FloatRoundMode rmode,
                                      int scale, uint64_t max,
                                      float_status *s);
static uint64_t parts128_float_to_uint(FloatParts128 *p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s);

#define parts_float_to_uint(P, R, Z, M, S) \
    PARTS_GENERIC_64_128(float_to_uint, P)(P, R, Z, M, S)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)

static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)

static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)

static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)

static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward. If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}

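/*
 * As a check of the scaling above: when a->frac == b->frac we take the
 * not-less-than path, the 128-bit numerator becomes a->frac * 2^63, and
 * udiv_qrnnd() returns q = 2^63 with no remainder, i.e. exactly
 * DECOMPOSED_IMPLICIT_BIT (1.0) with the exponent left unchanged.
 */
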
static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B) FRAC_GENERIC_64_128(div, A)(A, B)

static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)

static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)

static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)

static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

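/*
 * For example, frac64_normalize() on frac = 0x0000000000000001 shifts left
 * by clz64() = 63, leaving frac = 0x8000000000000000 with the msb set, and
 * returns the shift count so the caller can lower the exponent to match;
 * an all-zero fraction returns the full width, 64.
 */
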
static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)

static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

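/*
 * The "jam" keeps shifted-out bits visible to rounding: for instance,
 * frac64_shrjam() with frac = 0x0000000000000010 and c = 5 shifts the only
 * set bit out of the word, but the sticky OR leaves frac = 1, so a later
 * rounding step still sees that low-order bits were lost.
 */
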
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)

static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)

#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef N
#undef W
#undef partsN
#undef FloatPartsN
#undef FloatPartsW

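/*
 * To make the template expansion concrete: with N = 64 and W = 128, uses
 * of partsN(addsub) and FloatPartsN inside softfloat-parts.c.inc become
 * parts64_addsub and FloatParts64; the second inclusion with N = 128 and
 * W = 256 stamps out the parts128_* flavor of the same code.
 */
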
/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

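/*
 * Worked example: float64_unpack_canonical() on 1.5 (0x3FF8000000000000)
 * extracts raw exp = 0x3FF and raw frac = 0x0008000000000000;
 * canonicalization removes the bias (exp becomes 0) and shifts the
 * fraction left by float64_params.frac_shift = 11, which together with the
 * implicit bit gives frac = 0xC000000000000000, matching the decomposed
 * layout described above.
 */
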
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

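/*
 * Rationale for the "post" checks above: a hardfloat result whose
 * magnitude is <= the minimum normal may have underflowed inexactly, which
 * we could only confirm by reading the host FP flags, so float*_gen2()
 * redoes the operation in softfloat instead. The one case that can be
 * kept is a zero produced from two zero operands, which is always exact;
 * hence the "not both zero" test.
 */
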
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

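/*
 * Illustration of the zero-product shortcut above: float32_muladd(+0, x, c)
 * with x and c zero-or-normal computes only the sign of 0 * x from the
 * operand signs and then performs a plain host addition of c, skipping
 * fmaf() and the overflow/underflow checks entirely.
 */
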
1853 float64 QEMU_FLATTEN
1854 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1855 {
1856 union_float64 ua, ub, uc, ur;
1857
1858 ua.s = xa;
1859 ub.s = xb;
1860 uc.s = xc;
1861
1862 if (unlikely(!can_use_fpu(s))) {
1863 goto soft;
1864 }
1865 if (unlikely(flags & float_muladd_halve_result)) {
1866 goto soft;
1867 }
1868
1869 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1870 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1871 goto soft;
1872 }
1873
1874 if (unlikely(force_soft_fma)) {
1875 goto soft;
1876 }
1877
1878 /*
1879 * When (a || b) == 0, there's no need to check for under/over flow,
1880 * since we know the addend is (normal || 0) and the product is 0.
1881 */
1882 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1883 union_float64 up;
1884 bool prod_sign;
1885
1886 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1887 prod_sign ^= !!(flags & float_muladd_negate_product);
1888 up.s = float64_set_sign(float64_zero, prod_sign);
1889
1890 if (flags & float_muladd_negate_c) {
1891 uc.h = -uc.h;
1892 }
1893 ur.h = up.h + uc.h;
1894 } else {
1895 union_float64 ua_orig = ua;
1896 union_float64 uc_orig = uc;
1897
1898 if (flags & float_muladd_negate_product) {
1899 ua.h = -ua.h;
1900 }
1901 if (flags & float_muladd_negate_c) {
1902 uc.h = -uc.h;
1903 }
1904
1905 ur.h = fma(ua.h, ub.h, uc.h);
1906
1907 if (unlikely(f64_is_inf(ur))) {
1908 float_raise(float_flag_overflow, s);
1909 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
1910 ua = ua_orig;
1911 uc = uc_orig;
1912 goto soft;
1913 }
1914 }
1915 if (flags & float_muladd_negate_result) {
1916 return float64_chs(ur.s);
1917 }
1918 return ur.s;
1919
1920 soft:
1921 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1922 }
1923
1924 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1925 int flags, float_status *status)
1926 {
1927 FloatParts64 pa, pb, pc, *pr;
1928
1929 bfloat16_unpack_canonical(&pa, a, status);
1930 bfloat16_unpack_canonical(&pb, b, status);
1931 bfloat16_unpack_canonical(&pc, c, status);
1932 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1933
1934 return bfloat16_round_pack_canonical(pr, status);
1935 }
1936
1937 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
1938 int flags, float_status *status)
1939 {
1940 FloatParts128 pa, pb, pc, *pr;
1941
1942 float128_unpack_canonical(&pa, a, status);
1943 float128_unpack_canonical(&pb, b, status);
1944 float128_unpack_canonical(&pc, c, status);
1945 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1946
1947 return float128_round_pack_canonical(pr, status);
1948 }
1949
1950 /*
1951 * Division
1952 */
1953
1954 float16 float16_div(float16 a, float16 b, float_status *status)
1955 {
1956 FloatParts64 pa, pb, *pr;
1957
1958 float16_unpack_canonical(&pa, a, status);
1959 float16_unpack_canonical(&pb, b, status);
1960 pr = parts_div(&pa, &pb, status);
1961
1962 return float16_round_pack_canonical(pr, status);
1963 }
1964
1965 static float32 QEMU_SOFTFLOAT_ATTR
1966 soft_f32_div(float32 a, float32 b, float_status *status)
1967 {
1968 FloatParts64 pa, pb, *pr;
1969
1970 float32_unpack_canonical(&pa, a, status);
1971 float32_unpack_canonical(&pb, b, status);
1972 pr = parts_div(&pa, &pb, status);
1973
1974 return float32_round_pack_canonical(pr, status);
1975 }
1976
1977 static float64 QEMU_SOFTFLOAT_ATTR
1978 soft_f64_div(float64 a, float64 b, float_status *status)
1979 {
1980 FloatParts64 pa, pb, *pr;
1981
1982 float64_unpack_canonical(&pa, a, status);
1983 float64_unpack_canonical(&pb, b, status);
1984 pr = parts_div(&pa, &pb, status);
1985
1986 return float64_round_pack_canonical(pr, status);
1987 }
1988
1989 static float hard_f32_div(float a, float b)
1990 {
1991 return a / b;
1992 }
1993
1994 static double hard_f64_div(double a, double b)
1995 {
1996 return a / b;
1997 }
1998
1999 static bool f32_div_pre(union_float32 a, union_float32 b)
2000 {
2001 if (QEMU_HARDFLOAT_2F32_USE_FP) {
2002 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
2003 fpclassify(b.h) == FP_NORMAL;
2004 }
2005 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
2006 }
2007
2008 static bool f64_div_pre(union_float64 a, union_float64 b)
2009 {
2010 if (QEMU_HARDFLOAT_2F64_USE_FP) {
2011 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
2012 fpclassify(b.h) == FP_NORMAL;
2013 }
2014 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
2015 }
2016
2017 static bool f32_div_post(union_float32 a, union_float32 b)
2018 {
2019 if (QEMU_HARDFLOAT_2F32_USE_FP) {
2020 return fpclassify(a.h) != FP_ZERO;
2021 }
2022 return !float32_is_zero(a.s);
2023 }
2024
2025 static bool f64_div_post(union_float64 a, union_float64 b)
2026 {
2027 if (QEMU_HARDFLOAT_2F64_USE_FP) {
2028 return fpclassify(a.h) != FP_ZERO;
2029 }
2030 return !float64_is_zero(a.s);
2031 }
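
/*
 * A minimal sketch of how the pre/post hooks are meant to compose (the
 * real float32_gen2 helper, defined earlier in this file, also flushes
 * inputs and honours QEMU_NO_HARDFLOAT): try the host FPU only when the
 * inputs are safe, and redo in softfloat when the result may have
 * underflowed to zero.
 */
static inline float32 example_div_flow(float32 a, float32 b, float_status *s)
{
union_float32 ua, ub, ur;

ua.s = a;
ub.s = b;
if (!can_use_fpu(s) || !f32_div_pre(ua, ub)) {
return soft_f32_div(a, b, s); /* fully-checked slow path */
}
ur.h = hard_f32_div(ua.h, ub.h); /* native division */
if (!f32_div_post(ur, ub)) {
return soft_f32_div(a, b, s); /* result hit zero: recompute softly */
}
return ur.s;
}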
2032
2033 float32 QEMU_FLATTEN
2034 float32_div(float32 a, float32 b, float_status *s)
2035 {
2036 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
2037 f32_div_pre, f32_div_post);
2038 }
2039
2040 float64 QEMU_FLATTEN
2041 float64_div(float64 a, float64 b, float_status *s)
2042 {
2043 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
2044 f64_div_pre, f64_div_post);
2045 }
2046
2047 bfloat16 QEMU_FLATTEN
2048 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2049 {
2050 FloatParts64 pa, pb, *pr;
2051
2052 bfloat16_unpack_canonical(&pa, a, status);
2053 bfloat16_unpack_canonical(&pb, b, status);
2054 pr = parts_div(&pa, &pb, status);
2055
2056 return bfloat16_round_pack_canonical(pr, status);
2057 }
2058
2059 float128 QEMU_FLATTEN
2060 float128_div(float128 a, float128 b, float_status *status)
2061 {
2062 FloatParts128 pa, pb, *pr;
2063
2064 float128_unpack_canonical(&pa, a, status);
2065 float128_unpack_canonical(&pb, b, status);
2066 pr = parts_div(&pa, &pb, status);
2067
2068 return float128_round_pack_canonical(pr, status);
2069 }
2070
2071 /*
2072 * Float to Float conversions
2073 *
2074 * Returns the result of converting one float format to another. The
2075 * conversion is performed according to the IEC/IEEE Standard for
2076 * Binary Floating-Point Arithmetic.
2077 *
2078 * Usually this only needs to take care of raising invalid exceptions
2079 * and handling the conversion on NaNs.
2080 */
2081
2082 static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
2083 {
2084 switch (a->cls) {
2085 case float_class_qnan:
2086 case float_class_snan:
2087 /*
2088 * There is no NaN in the destination format. Raise Invalid
2089 * and return a zero with the sign of the input NaN.
2090 */
2091 float_raise(float_flag_invalid, s);
2092 a->cls = float_class_zero;
2093 break;
2094
2095 case float_class_inf:
2096 /*
2097 * There is no Inf in the destination format. Raise Invalid
2098 * and return the maximum normal with the correct sign.
2099 */
2100 float_raise(float_flag_invalid, s);
2101 a->cls = float_class_normal;
2102 a->exp = float16_params_ahp.exp_max;
2103 a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
2104 float16_params_ahp.frac_size + 1);
2105 break;
2106
2107 case float_class_normal:
2108 case float_class_zero:
2109 break;
2110
2111 default:
2112 g_assert_not_reached();
2113 }
2114 }
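
/*
 * Illustrative only: ieee=false in the conversion helpers below selects
 * Arm's Alternative Half Precision, the format parts_float_to_ahp
 * targets, which trades the Inf/NaN encodings for extra exponent range.
 */
static inline float16 example_f32_to_ahp(float32 f, float_status *st)
{
return float32_to_float16(f, false, st);
}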
2115
2116 static void parts64_float_to_float(FloatParts64 *a, float_status *s)
2117 {
2118 if (is_nan(a->cls)) {
2119 parts_return_nan(a, s);
2120 }
2121 }
2122
2123 static void parts128_float_to_float(FloatParts128 *a, float_status *s)
2124 {
2125 if (is_nan(a->cls)) {
2126 parts_return_nan(a, s);
2127 }
2128 }
2129
2130 #define parts_float_to_float(P, S) \
2131 PARTS_GENERIC_64_128(float_to_float, P)(P, S)
2132
2133 static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
2134 float_status *s)
2135 {
2136 a->cls = b->cls;
2137 a->sign = b->sign;
2138 a->exp = b->exp;
2139
2140 if (a->cls == float_class_normal) {
2141 frac_truncjam(a, b);
2142 } else if (is_nan(a->cls)) {
2143 /* Discard the low bits of the NaN. */
2144 a->frac = b->frac_hi;
2145 parts_return_nan(a, s);
2146 }
2147 }
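
/*
 * (Illustrative note: frac_truncjam is assumed to "jam" any discarded
 * low bits into the narrowed fraction's least-significant bit, keeping
 * a sticky bit so round-and-pack still sees that the value was inexact.)
 */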
2148
2149 static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
2150 float_status *s)
2151 {
2152 a->cls = b->cls;
2153 a->sign = b->sign;
2154 a->exp = b->exp;
2155 frac_widen(a, b);
2156
2157 if (is_nan(a->cls)) {
2158 parts_return_nan(a, s);
2159 }
2160 }
2161
2162 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2163 {
2164 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2165 FloatParts64 p;
2166
2167 float16a_unpack_canonical(&p, a, s, fmt16);
2168 parts_float_to_float(&p, s);
2169 return float32_round_pack_canonical(&p, s);
2170 }
2171
2172 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2173 {
2174 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2175 FloatParts64 p;
2176
2177 float16a_unpack_canonical(&p, a, s, fmt16);
2178 parts_float_to_float(&p, s);
2179 return float64_round_pack_canonical(&p, s);
2180 }
2181
2182 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2183 {
2184 FloatParts64 p;
2185 const FloatFmt *fmt;
2186
2187 float32_unpack_canonical(&p, a, s);
2188 if (ieee) {
2189 parts_float_to_float(&p, s);
2190 fmt = &float16_params;
2191 } else {
2192 parts_float_to_ahp(&p, s);
2193 fmt = &float16_params_ahp;
2194 }
2195 return float16a_round_pack_canonical(&p, s, fmt);
2196 }
2197
2198 static float64 QEMU_SOFTFLOAT_ATTR
2199 soft_float32_to_float64(float32 a, float_status *s)
2200 {
2201 FloatParts64 p;
2202
2203 float32_unpack_canonical(&p, a, s);
2204 parts_float_to_float(&p, s);
2205 return float64_round_pack_canonical(&p, s);
2206 }
2207
2208 float64 float32_to_float64(float32 a, float_status *s)
2209 {
2210 if (likely(float32_is_normal(a))) {
2211 /* Widening is always exact: float64 strictly extends float32's exponent range and fraction width. */
2212 union_float32 uf;
2213 union_float64 ud;
2214 uf.s = a;
2215 ud.h = uf.h;
2216 return ud.s;
2217 } else if (float32_is_zero(a)) {
2218 return float64_set_sign(float64_zero, float32_is_neg(a));
2219 } else {
2220 return soft_float32_to_float64(a, s);
2221 }
2222 }
2223
2224 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2225 {
2226 FloatParts64 p;
2227 const FloatFmt *fmt;
2228
2229 float64_unpack_canonical(&p, a, s);
2230 if (ieee) {
2231 parts_float_to_float(&p, s);
2232 fmt = &float16_params;
2233 } else {
2234 parts_float_to_ahp(&p, s);
2235 fmt = &float16_params_ahp;
2236 }
2237 return float16a_round_pack_canonical(&p, s, fmt);
2238 }
2239
2240 float32 float64_to_float32(float64 a, float_status *s)
2241 {
2242 FloatParts64 p;
2243
2244 float64_unpack_canonical(&p, a, s);
2245 parts_float_to_float(&p, s);
2246 return float32_round_pack_canonical(&p, s);
2247 }
2248
2249 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2250 {
2251 FloatParts64 p;
2252
2253 bfloat16_unpack_canonical(&p, a, s);
2254 parts_float_to_float(&p, s);
2255 return float32_round_pack_canonical(&p, s);
2256 }
2257
2258 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2259 {
2260 FloatParts64 p;
2261
2262 bfloat16_unpack_canonical(&p, a, s);
2263 parts_float_to_float(&p, s);
2264 return float64_round_pack_canonical(&p, s);
2265 }
2266
2267 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2268 {
2269 FloatParts64 p;
2270
2271 float32_unpack_canonical(&p, a, s);
2272 parts_float_to_float(&p, s);
2273 return bfloat16_round_pack_canonical(&p, s);
2274 }
2275
2276 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2277 {
2278 FloatParts64 p;
2279
2280 float64_unpack_canonical(&p, a, s);
2281 parts_float_to_float(&p, s);
2282 return bfloat16_round_pack_canonical(&p, s);
2283 }
2284
2285 float32 float128_to_float32(float128 a, float_status *s)
2286 {
2287 FloatParts64 p64;
2288 FloatParts128 p128;
2289
2290 float128_unpack_canonical(&p128, a, s);
2291 parts_float_to_float_narrow(&p64, &p128, s);
2292 return float32_round_pack_canonical(&p64, s);
2293 }
2294
2295 float64 float128_to_float64(float128 a, float_status *s)
2296 {
2297 FloatParts64 p64;
2298 FloatParts128 p128;
2299
2300 float128_unpack_canonical(&p128, a, s);
2301 parts_float_to_float_narrow(&p64, &p128, s);
2302 return float64_round_pack_canonical(&p64, s);
2303 }
2304
2305 float128 float32_to_float128(float32 a, float_status *s)
2306 {
2307 FloatParts64 p64;
2308 FloatParts128 p128;
2309
2310 float32_unpack_canonical(&p64, a, s);
2311 parts_float_to_float_widen(&p128, &p64, s);
2312 return float128_round_pack_canonical(&p128, s);
2313 }
2314
2315 float128 float64_to_float128(float64 a, float_status *s)
2316 {
2317 FloatParts64 p64;
2318 FloatParts128 p128;
2319
2320 float64_unpack_canonical(&p64, a, s);
2321 parts_float_to_float_widen(&p128, &p64, s);
2322 return float128_round_pack_canonical(&p128, s);
2323 }
2324
2325 /*
2326 * Round to integral value
2327 */
2328
2329 float16 float16_round_to_int(float16 a, float_status *s)
2330 {
2331 FloatParts64 p;
2332
2333 float16_unpack_canonical(&p, a, s);
2334 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params);
2335 return float16_round_pack_canonical(&p, s);
2336 }
2337
2338 float32 float32_round_to_int(float32 a, float_status *s)
2339 {
2340 FloatParts64 p;
2341
2342 float32_unpack_canonical(&p, a, s);
2343 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params);
2344 return float32_round_pack_canonical(&p, s);
2345 }
2346
2347 float64 float64_round_to_int(float64 a, float_status *s)
2348 {
2349 FloatParts64 p;
2350
2351 float64_unpack_canonical(&p, a, s);
2352 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params);
2353 return float64_round_pack_canonical(&p, s);
2354 }
2355
2356 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2357 {
2358 FloatParts64 p;
2359
2360 bfloat16_unpack_canonical(&p, a, s);
2361 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &bfloat16_params);
2362 return bfloat16_round_pack_canonical(&p, s);
2363 }
2364
2365 float128 float128_round_to_int(float128 a, float_status *s)
2366 {
2367 FloatParts128 p;
2368
2369 float128_unpack_canonical(&p, a, s);
2370 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params);
2371 return float128_round_pack_canonical(&p, s);
2372 }
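
/*
 * Illustrative only: these helpers round with the mode already in the
 * status word, so a directed rounding such as ceil() is expressed by
 * temporarily switching the mode (the get/set_float_rounding_mode
 * helpers are assumed from the softfloat headers).
 */
static inline float64 example_ceil_f64(float64 a, float_status *st)
{
FloatRoundMode old = get_float_rounding_mode(st);
float64 r;

set_float_rounding_mode(float_round_up, st);
r = float64_round_to_int(a, st);
set_float_rounding_mode(old, st);
return r;
}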
2373
2374 /*
2375 * Floating-point to signed integer conversions
2376 */
2377
2378 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2379 float_status *s)
2380 {
2381 FloatParts64 p;
2382
2383 float16_unpack_canonical(&p, a, s);
2384 return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s);
2385 }
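
/*
 * Illustrative only: the scale argument multiplies the input by 2**scale
 * before rounding, which is how fixed-point conversion instructions
 * (e.g. Arm's float-to-fixed VCVT) can be built on these helpers. A
 * hypothetical Q16.16 conversion:
 */
static inline int32_t example_f64_to_q16_16(float64 f, float_status *st)
{
return float64_to_int32_scalbn(f, float_round_to_zero, 16, st);
}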
2386
2387 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2388 float_status *s)
2389 {
2390 FloatParts64 p;
2391
2392 float16_unpack_canonical(&p, a, s);
2393 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
2394 }
2395
2396 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2397 float_status *s)
2398 {
2399 FloatParts64 p;
2400
2401 float16_unpack_canonical(&p, a, s);
2402 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
2403 }
2404
2405 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2406 float_status *s)
2407 {
2408 FloatParts64 p;
2409
2410 float16_unpack_canonical(&p, a, s);
2411 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
2412 }
2413
2414 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2415 float_status *s)
2416 {
2417 FloatParts64 p;
2418
2419 float32_unpack_canonical(&p, a, s);
2420 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
2421 }
2422
2423 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2424 float_status *s)
2425 {
2426 FloatParts64 p;
2427
2428 float32_unpack_canonical(&p, a, s);
2429 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
2430 }
2431
2432 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2433 float_status *s)
2434 {
2435 FloatParts64 p;
2436
2437 float32_unpack_canonical(&p, a, s);
2438 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
2439 }
2440
2441 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2442 float_status *s)
2443 {
2444 FloatParts64 p;
2445
2446 float64_unpack_canonical(&p, a, s);
2447 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
2448 }
2449
2450 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2451 float_status *s)
2452 {
2453 FloatParts64 p;
2454
2455 float64_unpack_canonical(&p, a, s);
2456 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
2457 }
2458
2459 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2460 float_status *s)
2461 {
2462 FloatParts64 p;
2463
2464 float64_unpack_canonical(&p, a, s);
2465 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
2466 }
2467
2468 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2469 float_status *s)
2470 {
2471 FloatParts64 p;
2472
2473 bfloat16_unpack_canonical(&p, a, s);
2474 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
2475 }
2476
2477 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2478 float_status *s)
2479 {
2480 FloatParts64 p;
2481
2482 bfloat16_unpack_canonical(&p, a, s);
2483 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
2484 }
2485
2486 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2487 float_status *s)
2488 {
2489 FloatParts64 p;
2490
2491 bfloat16_unpack_canonical(&p, a, s);
2492 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
2493 }
2494
2495 static int32_t float128_to_int32_scalbn(float128 a, FloatRoundMode rmode,
2496 int scale, float_status *s)
2497 {
2498 FloatParts128 p;
2499
2500 float128_unpack_canonical(&p, a, s);
2501 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
2502 }
2503
2504 static int64_t float128_to_int64_scalbn(float128 a, FloatRoundMode rmode,
2505 int scale, float_status *s)
2506 {
2507 FloatParts128 p;
2508
2509 float128_unpack_canonical(&p, a, s);
2510 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
2511 }
2512
2513 int8_t float16_to_int8(float16 a, float_status *s)
2514 {
2515 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2516 }
2517
2518 int16_t float16_to_int16(float16 a, float_status *s)
2519 {
2520 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2521 }
2522
2523 int32_t float16_to_int32(float16 a, float_status *s)
2524 {
2525 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2526 }
2527
2528 int64_t float16_to_int64(float16 a, float_status *s)
2529 {
2530 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2531 }
2532
2533 int16_t float32_to_int16(float32 a, float_status *s)
2534 {
2535 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2536 }
2537
2538 int32_t float32_to_int32(float32 a, float_status *s)
2539 {
2540 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2541 }
2542
2543 int64_t float32_to_int64(float32 a, float_status *s)
2544 {
2545 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2546 }
2547
2548 int16_t float64_to_int16(float64 a, float_status *s)
2549 {
2550 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2551 }
2552
2553 int32_t float64_to_int32(float64 a, float_status *s)
2554 {
2555 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2556 }
2557
2558 int64_t float64_to_int64(float64 a, float_status *s)
2559 {
2560 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2561 }
2562
2563 int32_t float128_to_int32(float128 a, float_status *s)
2564 {
2565 return float128_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2566 }
2567
2568 int64_t float128_to_int64(float128 a, float_status *s)
2569 {
2570 return float128_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2571 }
2572
2573 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2574 {
2575 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2576 }
2577
2578 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2579 {
2580 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2581 }
2582
2583 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2584 {
2585 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2586 }
2587
2588 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2589 {
2590 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2591 }
2592
2593 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2594 {
2595 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2596 }
2597
2598 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2599 {
2600 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2601 }
2602
2603 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2604 {
2605 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2606 }
2607
2608 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2609 {
2610 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2611 }
2612
2613 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2614 {
2615 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2616 }
2617
2618 int32_t float128_to_int32_round_to_zero(float128 a, float_status *s)
2619 {
2620 return float128_to_int32_scalbn(a, float_round_to_zero, 0, s);
2621 }
2622
2623 int64_t float128_to_int64_round_to_zero(float128 a, float_status *s)
2624 {
2625 return float128_to_int64_scalbn(a, float_round_to_zero, 0, s);
2626 }
2627
2628 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2629 {
2630 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2631 }
2632
2633 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2634 {
2635 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2636 }
2637
2638 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2639 {
2640 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2641 }
2642
2643 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2644 {
2645 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2646 }
2647
2648 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2649 {
2650 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2651 }
2652
2653 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2654 {
2655 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2656 }
2657
2658 /*
2659 * Floating-point to unsigned integer conversions
2660 */
2661
2662 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2663 float_status *s)
2664 {
2665 FloatParts64 p;
2666
2667 float16_unpack_canonical(&p, a, s);
2668 return parts_float_to_uint(&p, rmode, scale, UINT8_MAX, s);
2669 }
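
/*
 * Illustrative only: out-of-range inputs raise float_flag_invalid and
 * saturate at the bound passed to parts_float_to_uint (assumed
 * behaviour of that helper), e.g.:
 */
static inline uint16_t example_saturating_u16(float_status *st)
{
float64 big = int32_to_float64(70000, st); /* exceeds UINT16_MAX */
return float64_to_uint16(big, st); /* invalid raised; returns UINT16_MAX */
}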
2670
2671 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2672 float_status *s)
2673 {
2674 FloatParts64 p;
2675
2676 float16_unpack_canonical(&p, a, s);
2677 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
2678 }
2679
2680 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2681 float_status *s)
2682 {
2683 FloatParts64 p;
2684
2685 float16_unpack_canonical(&p, a, s);
2686 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
2687 }
2688
2689 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2690 float_status *s)
2691 {
2692 FloatParts64 p;
2693
2694 float16_unpack_canonical(&p, a, s);
2695 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
2696 }
2697
2698 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2699 float_status *s)
2700 {
2701 FloatParts64 p;
2702
2703 float32_unpack_canonical(&p, a, s);
2704 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
2705 }
2706
2707 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2708 float_status *s)
2709 {
2710 FloatParts64 p;
2711
2712 float32_unpack_canonical(&p, a, s);
2713 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
2714 }
2715
2716 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2717 float_status *s)
2718 {
2719 FloatParts64 p;
2720
2721 float32_unpack_canonical(&p, a, s);
2722 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
2723 }
2724
2725 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2726 float_status *s)
2727 {
2728 FloatParts64 p;
2729
2730 float64_unpack_canonical(&p, a, s);
2731 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
2732 }
2733
2734 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2735 float_status *s)
2736 {
2737 FloatParts64 p;
2738
2739 float64_unpack_canonical(&p, a, s);
2740 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
2741 }
2742
2743 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2744 float_status *s)
2745 {
2746 FloatParts64 p;
2747
2748 float64_unpack_canonical(&p, a, s);
2749 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
2750 }
2751
2752 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2753 int scale, float_status *s)
2754 {
2755 FloatParts64 p;
2756
2757 bfloat16_unpack_canonical(&p, a, s);
2758 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
2759 }
2760
2761 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2762 int scale, float_status *s)
2763 {
2764 FloatParts64 p;
2765
2766 bfloat16_unpack_canonical(&p, a, s);
2767 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
2768 }
2769
2770 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2771 int scale, float_status *s)
2772 {
2773 FloatParts64 p;
2774
2775 bfloat16_unpack_canonical(&p, a, s);
2776 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
2777 }
2778
2779 static uint32_t float128_to_uint32_scalbn(float128 a, FloatRoundMode rmode,
2780 int scale, float_status *s)
2781 {
2782 FloatParts128 p;
2783
2784 float128_unpack_canonical(&p, a, s);
2785 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
2786 }
2787
2788 static uint64_t float128_to_uint64_scalbn(float128 a, FloatRoundMode rmode,
2789 int scale, float_status *s)
2790 {
2791 FloatParts128 p;
2792
2793 float128_unpack_canonical(&p, a, s);
2794 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
2795 }
2796
2797 uint8_t float16_to_uint8(float16 a, float_status *s)
2798 {
2799 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2800 }
2801
2802 uint16_t float16_to_uint16(float16 a, float_status *s)
2803 {
2804 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2805 }
2806
2807 uint32_t float16_to_uint32(float16 a, float_status *s)
2808 {
2809 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2810 }
2811
2812 uint64_t float16_to_uint64(float16 a, float_status *s)
2813 {
2814 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2815 }
2816
2817 uint16_t float32_to_uint16(float32 a, float_status *s)
2818 {
2819 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2820 }
2821
2822 uint32_t float32_to_uint32(float32 a, float_status *s)
2823 {
2824 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2825 }
2826
2827 uint64_t float32_to_uint64(float32 a, float_status *s)
2828 {
2829 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2830 }
2831
2832 uint16_t float64_to_uint16(float64 a, float_status *s)
2833 {
2834 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2835 }
2836
2837 uint32_t float64_to_uint32(float64 a, float_status *s)
2838 {
2839 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2840 }
2841
2842 uint64_t float64_to_uint64(float64 a, float_status *s)
2843 {
2844 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2845 }
2846
2847 uint32_t float128_to_uint32(float128 a, float_status *s)
2848 {
2849 return float128_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2850 }
2851
2852 uint64_t float128_to_uint64(float128 a, float_status *s)
2853 {
2854 return float128_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2855 }
2856
2857 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2858 {
2859 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2860 }
2861
2862 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2863 {
2864 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2865 }
2866
2867 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2868 {
2869 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2870 }
2871
2872 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2873 {
2874 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2875 }
2876
2877 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2878 {
2879 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2880 }
2881
2882 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2883 {
2884 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2885 }
2886
2887 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2888 {
2889 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2890 }
2891
2892 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2893 {
2894 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2895 }
2896
2897 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2898 {
2899 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2900 }
2901
2902 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *s)
2903 {
2904 return float128_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2905 }
2906
2907 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *s)
2908 {
2909 return float128_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2910 }
2911
2912 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2913 {
2914 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2915 }
2916
2917 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2918 {
2919 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2920 }
2921
2922 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2923 {
2924 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2925 }
2926
2927 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2928 {
2929 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2930 }
2931
2932 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2933 {
2934 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2935 }
2936
2937 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2938 {
2939 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2940 }
2941
2942 /*
2943 * Integer to float conversions
2944 *
2945 * Returns the result of converting the two's complement integer `a'
2946 * to the floating-point format. The conversion is performed according
2947 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2948 */
2949
2950 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2951 {
2952 FloatParts64 r = { .sign = false };
2953
2954 if (a == 0) {
2955 r.cls = float_class_zero;
2956 } else {
2957 uint64_t f = a;
2958 int shift;
2959
2960 r.cls = float_class_normal;
2961 if (a < 0) {
2962 f = -f;
2963 r.sign = true;
2964 }
2965 shift = clz64(f);
2966 scale = MIN(MAX(scale, -0x10000), 0x10000);
2967
2968 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2969 r.frac = f << shift;
2970 }
2971
2972 return r;
2973 }
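
/*
 * Worked example (illustrative): for a = 6 = 0b110, clz64(f) is 61, so
 * the shift parks the leading one at bit 63 and frac holds 0b110 << 61;
 * with the binary point at bit 63, as the full-width shift implies,
 * exp = DECOMPOSED_BINARY_POINT - 61 = 2 encodes 1.1b * 2^2 = 6.0.
 * The clamp on scale mirrors scalbn_decomposed below: +/-0x10000
 * already saturates every supported exponent range.
 */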
2974
2975 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2976 {
2977 FloatParts64 pa = int_to_float(a, scale, status);
2978 return float16_round_pack_canonical(&pa, status);
2979 }
2980
2981 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2982 {
2983 return int64_to_float16_scalbn(a, scale, status);
2984 }
2985
2986 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2987 {
2988 return int64_to_float16_scalbn(a, scale, status);
2989 }
2990
2991 float16 int64_to_float16(int64_t a, float_status *status)
2992 {
2993 return int64_to_float16_scalbn(a, 0, status);
2994 }
2995
2996 float16 int32_to_float16(int32_t a, float_status *status)
2997 {
2998 return int64_to_float16_scalbn(a, 0, status);
2999 }
3000
3001 float16 int16_to_float16(int16_t a, float_status *status)
3002 {
3003 return int64_to_float16_scalbn(a, 0, status);
3004 }
3005
3006 float16 int8_to_float16(int8_t a, float_status *status)
3007 {
3008 return int64_to_float16_scalbn(a, 0, status);
3009 }
3010
3011 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3012 {
3013 FloatParts64 pa = int_to_float(a, scale, status);
3014 return float32_round_pack_canonical(&pa, status);
3015 }
3016
3017 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3018 {
3019 return int64_to_float32_scalbn(a, scale, status);
3020 }
3021
3022 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3023 {
3024 return int64_to_float32_scalbn(a, scale, status);
3025 }
3026
3027 float32 int64_to_float32(int64_t a, float_status *status)
3028 {
3029 return int64_to_float32_scalbn(a, 0, status);
3030 }
3031
3032 float32 int32_to_float32(int32_t a, float_status *status)
3033 {
3034 return int64_to_float32_scalbn(a, 0, status);
3035 }
3036
3037 float32 int16_to_float32(int16_t a, float_status *status)
3038 {
3039 return int64_to_float32_scalbn(a, 0, status);
3040 }
3041
3042 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3043 {
3044 FloatParts64 pa = int_to_float(a, scale, status);
3045 return float64_round_pack_canonical(&pa, status);
3046 }
3047
3048 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3049 {
3050 return int64_to_float64_scalbn(a, scale, status);
3051 }
3052
3053 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3054 {
3055 return int64_to_float64_scalbn(a, scale, status);
3056 }
3057
3058 float64 int64_to_float64(int64_t a, float_status *status)
3059 {
3060 return int64_to_float64_scalbn(a, 0, status);
3061 }
3062
3063 float64 int32_to_float64(int32_t a, float_status *status)
3064 {
3065 return int64_to_float64_scalbn(a, 0, status);
3066 }
3067
3068 float64 int16_to_float64(int16_t a, float_status *status)
3069 {
3070 return int64_to_float64_scalbn(a, 0, status);
3071 }
3072
3073 /*
3074 * Returns the result of converting the two's complement integer `a'
3075 * to the bfloat16 format.
3076 */
3077
3078 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3079 {
3080 FloatParts64 pa = int_to_float(a, scale, status);
3081 return bfloat16_round_pack_canonical(&pa, status);
3082 }
3083
3084 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3085 {
3086 return int64_to_bfloat16_scalbn(a, scale, status);
3087 }
3088
3089 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3090 {
3091 return int64_to_bfloat16_scalbn(a, scale, status);
3092 }
3093
3094 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3095 {
3096 return int64_to_bfloat16_scalbn(a, 0, status);
3097 }
3098
3099 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3100 {
3101 return int64_to_bfloat16_scalbn(a, 0, status);
3102 }
3103
3104 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3105 {
3106 return int64_to_bfloat16_scalbn(a, 0, status);
3107 }
3108
3109 /*
3110 * Unsigned Integer to float conversions
3111 *
3112 * Returns the result of converting the unsigned integer `a' to the
3113 * floating-point format. The conversion is performed according to the
3114 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3115 */
3116
3117 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3118 {
3119 FloatParts64 r = { .sign = false };
3120 int shift;
3121
3122 if (a == 0) {
3123 r.cls = float_class_zero;
3124 } else {
3125 scale = MIN(MAX(scale, -0x10000), 0x10000);
3126 shift = clz64(a);
3127 r.cls = float_class_normal;
3128 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3129 r.frac = a << shift;
3130 }
3131
3132 return r;
3133 }
3134
3135 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3136 {
3137 FloatParts64 pa = uint_to_float(a, scale, status);
3138 return float16_round_pack_canonical(&pa, status);
3139 }
3140
3141 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3142 {
3143 return uint64_to_float16_scalbn(a, scale, status);
3144 }
3145
3146 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3147 {
3148 return uint64_to_float16_scalbn(a, scale, status);
3149 }
3150
3151 float16 uint64_to_float16(uint64_t a, float_status *status)
3152 {
3153 return uint64_to_float16_scalbn(a, 0, status);
3154 }
3155
3156 float16 uint32_to_float16(uint32_t a, float_status *status)
3157 {
3158 return uint64_to_float16_scalbn(a, 0, status);
3159 }
3160
3161 float16 uint16_to_float16(uint16_t a, float_status *status)
3162 {
3163 return uint64_to_float16_scalbn(a, 0, status);
3164 }
3165
3166 float16 uint8_to_float16(uint8_t a, float_status *status)
3167 {
3168 return uint64_to_float16_scalbn(a, 0, status);
3169 }
3170
3171 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3172 {
3173 FloatParts64 pa = uint_to_float(a, scale, status);
3174 return float32_round_pack_canonical(&pa, status);
3175 }
3176
3177 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3178 {
3179 return uint64_to_float32_scalbn(a, scale, status);
3180 }
3181
3182 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3183 {
3184 return uint64_to_float32_scalbn(a, scale, status);
3185 }
3186
3187 float32 uint64_to_float32(uint64_t a, float_status *status)
3188 {
3189 return uint64_to_float32_scalbn(a, 0, status);
3190 }
3191
3192 float32 uint32_to_float32(uint32_t a, float_status *status)
3193 {
3194 return uint64_to_float32_scalbn(a, 0, status);
3195 }
3196
3197 float32 uint16_to_float32(uint16_t a, float_status *status)
3198 {
3199 return uint64_to_float32_scalbn(a, 0, status);
3200 }
3201
3202 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3203 {
3204 FloatParts64 pa = uint_to_float(a, scale, status);
3205 return float64_round_pack_canonical(&pa, status);
3206 }
3207
3208 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3209 {
3210 return uint64_to_float64_scalbn(a, scale, status);
3211 }
3212
3213 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3214 {
3215 return uint64_to_float64_scalbn(a, scale, status);
3216 }
3217
3218 float64 uint64_to_float64(uint64_t a, float_status *status)
3219 {
3220 return uint64_to_float64_scalbn(a, 0, status);
3221 }
3222
3223 float64 uint32_to_float64(uint32_t a, float_status *status)
3224 {
3225 return uint64_to_float64_scalbn(a, 0, status);
3226 }
3227
3228 float64 uint16_to_float64(uint16_t a, float_status *status)
3229 {
3230 return uint64_to_float64_scalbn(a, 0, status);
3231 }
3232
3233 /*
3234 * Returns the result of converting the unsigned integer `a' to the
3235 * bfloat16 format.
3236 */
3237
3238 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3239 {
3240 FloatParts64 pa = uint_to_float(a, scale, status);
3241 return bfloat16_round_pack_canonical(&pa, status);
3242 }
3243
3244 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3245 {
3246 return uint64_to_bfloat16_scalbn(a, scale, status);
3247 }
3248
3249 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3250 {
3251 return uint64_to_bfloat16_scalbn(a, scale, status);
3252 }
3253
3254 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3255 {
3256 return uint64_to_bfloat16_scalbn(a, 0, status);
3257 }
3258
3259 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3260 {
3261 return uint64_to_bfloat16_scalbn(a, 0, status);
3262 }
3263
3264 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3265 {
3266 return uint64_to_bfloat16_scalbn(a, 0, status);
3267 }
3268
3269 /* Float Min/Max */
3270 /* min() and max() functions. These can't be implemented as
3271 * 'compare and pick one input' because that would mishandle
3272 * NaNs and +0 vs -0.
3273 *
3274 * minnum() and maxnum() functions. These are similar to the min()
3275 * and max() functions but if one of the arguments is a QNaN and
3276 * the other is numerical then the numerical argument is returned.
3277 * SNaNs will get quietened before being returned.
3278 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3279 * and maxNum() operations. min() and max() are the typical min/max
3280 * semantics provided by many CPUs which predate that specification.
3281 *
3282 * minnummag() and maxnummag() functions correspond to minNumMag()
3283 * and maxNumMag() from IEEE 754-2008.
3284 */
3285 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3286 bool ieee, bool ismag, float_status *s)
3287 {
3288 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3289 if (ieee) {
3290 /* Takes two floating-point values `a' and `b', one of
3291 * which is a NaN, and returns the appropriate NaN
3292 * result. If either `a' or `b' is a signaling NaN,
3293 * the invalid exception is raised.
3294 */
3295 if (is_snan(a.cls) || is_snan(b.cls)) {
3296 return *parts_pick_nan(&a, &b, s);
3297 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3298 return b;
3299 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3300 return a;
3301 }
3302 }
3303 return *parts_pick_nan(&a, &b, s);
3304 } else {
3305 int a_exp, b_exp;
3306
3307 switch (a.cls) {
3308 case float_class_normal:
3309 a_exp = a.exp;
3310 break;
3311 case float_class_inf:
3312 a_exp = INT_MAX;
3313 break;
3314 case float_class_zero:
3315 a_exp = INT_MIN;
3316 break;
3317 default:
3318 g_assert_not_reached();
3319 break;
3320 }
3321 switch (b.cls) {
3322 case float_class_normal:
3323 b_exp = b.exp;
3324 break;
3325 case float_class_inf:
3326 b_exp = INT_MAX;
3327 break;
3328 case float_class_zero:
3329 b_exp = INT_MIN;
3330 break;
3331 default:
3332 g_assert_not_reached();
3333 break;
3334 }
3335
3336 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3337 bool a_less = a_exp < b_exp;
3338 if (a_exp == b_exp) {
3339 a_less = a.frac < b.frac;
3340 }
3341 return a_less ^ ismin ? b : a;
3342 }
3343
3344 if (a.sign == b.sign) {
3345 bool a_less = a_exp < b_exp;
3346 if (a_exp == b_exp) {
3347 a_less = a.frac < b.frac;
3348 }
3349 return a.sign ^ a_less ^ ismin ? b : a;
3350 } else {
3351 return a.sign ^ ismin ? b : a;
3352 }
3353 }
3354 }
3355
3356 #define MINMAX(sz, name, ismin, isieee, ismag) \
3357 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3358 float_status *s) \
3359 { \
3360 FloatParts64 pa, pb, pr; \
3361 float ## sz ## _unpack_canonical(&pa, a, s); \
3362 float ## sz ## _unpack_canonical(&pb, b, s); \
3363 pr = minmax_floats(pa, pb, ismin, isieee, ismag, s); \
3364 return float ## sz ## _round_pack_canonical(&pr, s); \
3365 }
3366
3367 MINMAX(16, min, true, false, false)
3368 MINMAX(16, minnum, true, true, false)
3369 MINMAX(16, minnummag, true, true, true)
3370 MINMAX(16, max, false, false, false)
3371 MINMAX(16, maxnum, false, true, false)
3372 MINMAX(16, maxnummag, false, true, true)
3373
3374 MINMAX(32, min, true, false, false)
3375 MINMAX(32, minnum, true, true, false)
3376 MINMAX(32, minnummag, true, true, true)
3377 MINMAX(32, max, false, false, false)
3378 MINMAX(32, maxnum, false, true, false)
3379 MINMAX(32, maxnummag, false, true, true)
3380
3381 MINMAX(64, min, true, false, false)
3382 MINMAX(64, minnum, true, true, false)
3383 MINMAX(64, minnummag, true, true, true)
3384 MINMAX(64, max, false, false, false)
3385 MINMAX(64, maxnum, false, true, false)
3386 MINMAX(64, maxnummag, false, true, true)
3387
3388 #undef MINMAX
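
/*
 * Illustrative only: with a single quiet-NaN operand the *num variants
 * return the numerical operand, while plain min/max go through NaN
 * propagation. float32_one is assumed from the softfloat headers.
 */
static inline float32 example_minnum_qnan(float_status *st)
{
/* Returns 1.0: minnum ignores the quiet NaN operand. */
return float32_minnum(float32_default_nan(st), float32_one, st);
}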
3389
3390 #define BF16_MINMAX(name, ismin, isieee, ismag) \
3391 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3392 { \
3393 FloatParts64 pa, pb, pr; \
3394 bfloat16_unpack_canonical(&pa, a, s); \
3395 bfloat16_unpack_canonical(&pb, b, s); \
3396 pr = minmax_floats(pa, pb, ismin, isieee, ismag, s); \
3397 return bfloat16_round_pack_canonical(&pr, s); \
3398 }
3399
3400 BF16_MINMAX(min, true, false, false)
3401 BF16_MINMAX(minnum, true, true, false)
3402 BF16_MINMAX(minnummag, true, true, true)
3403 BF16_MINMAX(max, false, false, false)
3404 BF16_MINMAX(maxnum, false, true, false)
3405 BF16_MINMAX(maxnummag, false, true, true)
3406
3407 #undef BF16_MINMAX
3408
3409 /* Floating point compare */
3410 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3411 float_status *s)
3412 {
3413 if (is_nan(a.cls) || is_nan(b.cls)) {
3414 if (!is_quiet ||
3415 a.cls == float_class_snan ||
3416 b.cls == float_class_snan) {
3417 float_raise(float_flag_invalid, s);
3418 }
3419 return float_relation_unordered;
3420 }
3421
3422 if (a.cls == float_class_zero) {
3423 if (b.cls == float_class_zero) {
3424 return float_relation_equal;
3425 }
3426 return b.sign ? float_relation_greater : float_relation_less;
3427 } else if (b.cls == float_class_zero) {
3428 return a.sign ? float_relation_less : float_relation_greater;
3429 }
3430
3431 /* The only really important thing about infinity is its sign. If
3432 * both are infinities, the sign alone decides the ordering.
3433 */
3434 if (a.cls == float_class_inf) {
3435 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3436 return float_relation_equal;
3437 }
3438 return a.sign ? float_relation_less : float_relation_greater;
3439 } else if (b.cls == float_class_inf) {
3440 return b.sign ? float_relation_greater : float_relation_less;
3441 }
3442
3443 if (a.sign != b.sign) {
3444 return a.sign ? float_relation_less : float_relation_greater;
3445 }
3446
3447 if (a.exp == b.exp) {
3448 if (a.frac == b.frac) {
3449 return float_relation_equal;
3450 }
3451 if (a.sign) {
3452 return a.frac > b.frac ?
3453 float_relation_less : float_relation_greater;
3454 } else {
3455 return a.frac > b.frac ?
3456 float_relation_greater : float_relation_less;
3457 }
3458 } else {
3459 if (a.sign) {
3460 return a.exp > b.exp ? float_relation_less : float_relation_greater;
3461 } else {
3462 return a.exp > b.exp ? float_relation_greater : float_relation_less;
3463 }
3464 }
3465 }
3466
3467 #define COMPARE(name, attr, sz) \
3468 static int attr \
3469 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3470 { \
3471 FloatParts64 pa, pb; \
3472 float ## sz ## _unpack_canonical(&pa, a, s); \
3473 float ## sz ## _unpack_canonical(&pb, b, s); \
3474 return compare_floats(pa, pb, is_quiet, s); \
3475 }
3476
3477 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3478 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3479 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3480
3481 #undef COMPARE
3482
3483 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3484 {
3485 return soft_f16_compare(a, b, false, s);
3486 }
3487
3488 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3489 {
3490 return soft_f16_compare(a, b, true, s);
3491 }
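
/*
 * Illustrative only: an ordered predicate built on the compare result.
 * Unordered (any NaN operand) compares false, as IEEE 754 requires
 * for <=.
 */
static inline bool example_f64_le(float64 a, float64 b, float_status *st)
{
FloatRelation r = float64_compare(a, b, st);
return r == float_relation_less || r == float_relation_equal;
}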
3492
3493 static FloatRelation QEMU_FLATTEN
3494 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3495 {
3496 union_float32 ua, ub;
3497
3498 ua.s = xa;
3499 ub.s = xb;
3500
3501 if (QEMU_NO_HARDFLOAT) {
3502 goto soft;
3503 }
3504
3505 float32_input_flush2(&ua.s, &ub.s, s);
3506 if (isgreaterequal(ua.h, ub.h)) {
3507 if (isgreater(ua.h, ub.h)) {
3508 return float_relation_greater;
3509 }
3510 return float_relation_equal;
3511 }
3512 if (likely(isless(ua.h, ub.h))) {
3513 return float_relation_less;
3514 }
3515 /* The only condition remaining is unordered.
3516 * Fall through to set flags.
3517 */
3518 soft:
3519 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3520 }
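
/*
 * (Illustrative note: isgreaterequal/isgreater/isless are the C99 quiet
 * comparison macros, so a quiet NaN does not trap on the host here;
 * unordered operands fail all three tests and fall through to the
 * softfloat path, which raises the guest flags as required.)
 */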
3521
3522 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3523 {
3524 return f32_compare(a, b, false, s);
3525 }
3526
3527 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3528 {
3529 return f32_compare(a, b, true, s);
3530 }
3531
3532 static FloatRelation QEMU_FLATTEN
3533 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3534 {
3535 union_float64 ua, ub;
3536
3537 ua.s = xa;
3538 ub.s = xb;
3539
3540 if (QEMU_NO_HARDFLOAT) {
3541 goto soft;
3542 }
3543
3544 float64_input_flush2(&ua.s, &ub.s, s);
3545 if (isgreaterequal(ua.h, ub.h)) {
3546 if (isgreater(ua.h, ub.h)) {
3547 return float_relation_greater;
3548 }
3549 return float_relation_equal;
3550 }
3551 if (likely(isless(ua.h, ub.h))) {
3552 return float_relation_less;
3553 }
3554 /* The only condition remaining is unordered.
3555 * Fall through to set flags.
3556 */
3557 soft:
3558 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3559 }
3560
3561 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3562 {
3563 return f64_compare(a, b, false, s);
3564 }
3565
3566 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3567 {
3568 return f64_compare(a, b, true, s);
3569 }
3570
3571 static FloatRelation QEMU_FLATTEN
3572 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3573 {
3574 FloatParts64 pa, pb;
3575
3576 bfloat16_unpack_canonical(&pa, a, s);
3577 bfloat16_unpack_canonical(&pb, b, s);
3578 return compare_floats(pa, pb, is_quiet, s);
3579 }
3580
3581 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3582 {
3583 return soft_bf16_compare(a, b, false, s);
3584 }
3585
3586 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3587 {
3588 return soft_bf16_compare(a, b, true, s);
3589 }
3590
3591 /* Multiply A by 2 raised to the power N. */
3592 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3593 {
3594 if (unlikely(is_nan(a.cls))) {
3595 parts_return_nan(&a, s);
3596 }
3597 if (a.cls == float_class_normal) {
3598 /* The largest float type (even though not supported by FloatParts64)
3599 * is float128, which has a 15-bit exponent. Bounding N to 16 bits
3600 * still allows rounding to infinity, without allowing overflow
3601 * within the int32_t that backs FloatParts64.exp.
3602 */
3603 n = MIN(MAX(n, -0x10000), 0x10000);
3604 a.exp += n;
3605 }
3606 return a;
3607 }
3608
3609 float16 float16_scalbn(float16 a, int n, float_status *status)
3610 {
3611 FloatParts64 pa, pr;
3612
3613 float16_unpack_canonical(&pa, a, status);
3614 pr = scalbn_decomposed(pa, n, status);
3615 return float16_round_pack_canonical(&pr, status);
3616 }
3617
3618 float32 float32_scalbn(float32 a, int n, float_status *status)
3619 {
3620 FloatParts64 pa, pr;
3621
3622 float32_unpack_canonical(&pa, a, status);
3623 pr = scalbn_decomposed(pa, n, status);
3624 return float32_round_pack_canonical(&pr, status);
3625 }
3626
3627 float64 float64_scalbn(float64 a, int n, float_status *status)
3628 {
3629 FloatParts64 pa, pr;
3630
3631 float64_unpack_canonical(&pa, a, status);
3632 pr = scalbn_decomposed(pa, n, status);
3633 return float64_round_pack_canonical(&pr, status);
3634 }
3635
3636 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3637 {
3638 FloatParts64 pa, pr;
3639
3640 bfloat16_unpack_canonical(&pa, a, status);
3641 pr = scalbn_decomposed(pa, n, status);
3642 return bfloat16_round_pack_canonical(&pr, status);
3643 }
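
/*
 * Illustrative only: scalbn multiplies by 2**n exactly, barring
 * overflow/underflow, so this returns 1024.0 (float64_one is assumed
 * from the softfloat headers).
 */
static inline float64 example_scalbn_pow2(float_status *st)
{
return float64_scalbn(float64_one, 10, st);
}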
3644
3645 /*
3646 * Square Root
3647 *
3648 * The old softfloat code did an approximation step before zeroing in
3649 * on the final result. However, for simplicity we just compute the
3650 * square root by iterating down from the implicit bit to enough extra
3651 * bits to ensure we get a correctly rounded result.
3652 *
3653 * This does mean, however, that the calculation is slower than before,
3654 * especially for 64-bit floats.
3655 */
3656
3657 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3658 {
3659 uint64_t a_frac, r_frac, s_frac;
3660 int bit, last_bit;
3661
3662 if (is_nan(a.cls)) {
3663 parts_return_nan(&a, s);
3664 return a;
3665 }
3666 if (a.cls == float_class_zero) {
3667 return a; /* sqrt(+-0) = +-0 */
3668 }
3669 if (a.sign) {
3670 float_raise(float_flag_invalid, s);
3671 parts_default_nan(&a, s);
3672 return a;
3673 }
3674 if (a.cls == float_class_inf) {
3675 return a; /* sqrt(+inf) = +inf */
3676 }
3677
3678 assert(a.cls == float_class_normal);
3679
3680 /* We need two overflow bits at the top. Adding room for that is a
3681 * right shift. If the exponent is odd, we can discard the low bit
3682 * by multiplying the fraction by 2; that's a left shift. Combine
3683 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3684 */
3685 a_frac = a.frac >> (2 - (a.exp & 1));
3686 a.exp >>= 1;
3687
3688 /* Bit-by-bit computation of sqrt. */
3689 r_frac = 0;
3690 s_frac = 0;
3691
3692 /* Iterate from implicit bit down to the 3 extra bits to compute a
3693 * properly rounded result. Remember we've inserted two more bits
3694 * at the top, so these positions are two less.
3695 */
3696 bit = DECOMPOSED_BINARY_POINT - 2;
3697 last_bit = MAX(p->frac_shift - 4, 0);
3698 do {
3699 uint64_t q = 1ULL << bit;
3700 uint64_t t_frac = s_frac + q;
3701 if (t_frac <= a_frac) {
3702 s_frac = t_frac + q;
3703 a_frac -= t_frac;
3704 r_frac += q;
3705 }
3706 a_frac <<= 1;
3707 } while (--bit >= last_bit);
3708
3709 /* Undo the right shift done above. If there is any remaining
3710 * fraction, the result is inexact. Set the sticky bit.
3711 */
3712 a.frac = (r_frac << 2) + (a_frac != 0);
3713
3714 return a;
3715 }
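
/*
 * A standalone sketch (illustrative only) of the same digit-by-digit
 * method on a plain integer: each step tests whether setting the next
 * result bit keeps the square <= the remaining operand, just as the
 * loop above does with s_frac/r_frac.
 */
static inline uint32_t example_isqrt32(uint32_t n)
{
uint32_t res = 0;
uint32_t bit = 1u << 30; /* largest power of four that fits */

while (bit > n) {
bit >>= 2;
}
while (bit != 0) {
if (n >= res + bit) {
n -= res + bit;
res = (res >> 1) + bit;
} else {
res >>= 1;
}
bit >>= 2;
}
return res; /* floor(sqrt(n)) */
}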
3716
3717 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3718 {
3719 FloatParts64 pa, pr;
3720
3721 float16_unpack_canonical(&pa, a, status);
3722 pr = sqrt_float(pa, status, &float16_params);
3723 return float16_round_pack_canonical(&pr, status);
3724 }
3725
3726 static float32 QEMU_SOFTFLOAT_ATTR
3727 soft_f32_sqrt(float32 a, float_status *status)
3728 {
3729 FloatParts64 pa, pr;
3730
3731 float32_unpack_canonical(&pa, a, status);
3732 pr = sqrt_float(pa, status, &float32_params);
3733 return float32_round_pack_canonical(&pr, status);
3734 }
3735
3736 static float64 QEMU_SOFTFLOAT_ATTR
3737 soft_f64_sqrt(float64 a, float_status *status)
3738 {
3739 FloatParts64 pa, pr;
3740
3741 float64_unpack_canonical(&pa, a, status);
3742 pr = sqrt_float(pa, status, &float64_params);
3743 return float64_round_pack_canonical(&pr, status);
3744 }
3745
3746 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3747 {
3748 union_float32 ua, ur;
3749
3750 ua.s = xa;
3751 if (unlikely(!can_use_fpu(s))) {
3752 goto soft;
3753 }
3754
3755 float32_input_flush1(&ua.s, s);
3756 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3757 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3758 fpclassify(ua.h) == FP_ZERO) ||
3759 signbit(ua.h))) {
3760 goto soft;
3761 }
3762 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3763 float32_is_neg(ua.s))) {
3764 goto soft;
3765 }
3766 ur.h = sqrtf(ua.h);
3767 return ur.s;
3768
3769 soft:
3770 return soft_f32_sqrt(ua.s, s);
3771 }
3772
3773 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3774 {
3775 union_float64 ua, ur;
3776
3777 ua.s = xa;
3778 if (unlikely(!can_use_fpu(s))) {
3779 goto soft;
3780 }
3781
3782 float64_input_flush1(&ua.s, s);
3783 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3784 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3785 fpclassify(ua.h) == FP_ZERO) ||
3786 signbit(ua.h))) {
3787 goto soft;
3788 }
3789 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3790 float64_is_neg(ua.s))) {
3791 goto soft;
3792 }
3793 ur.h = sqrt(ua.h);
3794 return ur.s;
3795
3796 soft:
3797 return soft_f64_sqrt(ua.s, s);
3798 }
3799
3800 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3801 {
3802 FloatParts64 pa, pr;
3803
3804 bfloat16_unpack_canonical(&pa, a, status);
3805 pr = sqrt_float(pa, status, &bfloat16_params);
3806 return bfloat16_round_pack_canonical(&pr, status);
3807 }
3808
3809 /*----------------------------------------------------------------------------
3810 | The pattern for a default generated NaN.
3811 *----------------------------------------------------------------------------*/
3812
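/*
 * (Illustrative note: the default NaN bit pattern is target-dependent;
 * parts_default_nan consults the float_status, which is why even this
 * "constant" takes a status argument.)
 */
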
3813 float16 float16_default_nan(float_status *status)
3814 {
3815 FloatParts64 p;
3816
3817 parts_default_nan(&p, status);
3818 p.frac >>= float16_params.frac_shift;
3819 return float16_pack_raw(&p);
3820 }
3821
3822 float32 float32_default_nan(float_status *status)
3823 {
3824 FloatParts64 p;
3825
3826 parts_default_nan(&p, status);
3827 p.frac >>= float32_params.frac_shift;
3828 return float32_pack_raw(&p);
3829 }
3830
3831 float64 float64_default_nan(float_status *status)
3832 {
3833 FloatParts64 p;
3834
3835 parts_default_nan(&p, status);
3836 p.frac >>= float64_params.frac_shift;
3837 return float64_pack_raw(&p);
3838 }
3839
3840 float128 float128_default_nan(float_status *status)
3841 {
3842 FloatParts128 p;
3843
3844 parts_default_nan(&p, status);
3845 frac_shr(&p, float128_params.frac_shift);
3846 return float128_pack_raw(&p);
3847 }
3848
3849 bfloat16 bfloat16_default_nan(float_status *status)
3850 {
3851 FloatParts64 p;
3852
3853 parts_default_nan(&p, status);
3854 p.frac >>= bfloat16_params.frac_shift;
3855 return bfloat16_pack_raw(&p);
3856 }
3857
3858 /*----------------------------------------------------------------------------
3859 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3860 *----------------------------------------------------------------------------*/
3861
3862 float16 float16_silence_nan(float16 a, float_status *status)
3863 {
3864 FloatParts64 p;
3865
3866 float16_unpack_raw(&p, a);
3867 p.frac <<= float16_params.frac_shift;
3868 parts_silence_nan(&p, status);
3869 p.frac >>= float16_params.frac_shift;
3870 return float16_pack_raw(&p);
3871 }
3872
3873 float32 float32_silence_nan(float32 a, float_status *status)
3874 {
3875 FloatParts64 p;
3876
3877 float32_unpack_raw(&p, a);
3878 p.frac <<= float32_params.frac_shift;
3879 parts_silence_nan(&p, status);
3880 p.frac >>= float32_params.frac_shift;
3881 return float32_pack_raw(&p);
3882 }
3883
3884 float64 float64_silence_nan(float64 a, float_status *status)
3885 {
3886 FloatParts64 p;
3887
3888 float64_unpack_raw(&p, a);
3889 p.frac <<= float64_params.frac_shift;
3890 parts_silence_nan(&p, status);
3891 p.frac >>= float64_params.frac_shift;
3892 return float64_pack_raw(&p);
3893 }
3894
3895 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3896 {
3897 FloatParts64 p;
3898
3899 bfloat16_unpack_raw(&p, a);
3900 p.frac <<= bfloat16_params.frac_shift;
3901 parts_silence_nan(&p, status);
3902 p.frac >>= bfloat16_params.frac_shift;
3903 return bfloat16_pack_raw(&p);
3904 }
3905
3906 float128 float128_silence_nan(float128 a, float_status *status)
3907 {
3908 FloatParts128 p;
3909
3910 float128_unpack_raw(&p, a);
3911 frac_shl(&p, float128_params.frac_shift);
3912 parts_silence_nan(&p, status);
3913 frac_shr(&p, float128_params.frac_shift);
3914 return float128_pack_raw(&p);
3915 }
3916
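/*
 * Worked example, assuming a target where the quiet bit is the most
 * significant fraction bit (the IEEE 754-2008 convention): the float32
 * signalling NaN 0x7F800001 is returned by float32_silence_nan() as the
 * quiet NaN 0x7FC00001. The shifts around parts_silence_nan() merely
 * move the fraction between the packed layout and the left-aligned
 * canonical FloatParts layout.
 */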
3917 /*----------------------------------------------------------------------------
3918 | If `a' is denormal and we are in flush-to-zero mode then set the
3919 | input-denormal exception and return zero. Otherwise just return the value.
3920 *----------------------------------------------------------------------------*/
3921
3922 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3923 {
3924 if (p.exp == 0 && p.frac != 0) {
3925 float_raise(float_flag_input_denormal, status);
3926 return true;
3927 }
3928
3929 return false;
3930 }
3931
3932 float16 float16_squash_input_denormal(float16 a, float_status *status)
3933 {
3934 if (status->flush_inputs_to_zero) {
3935 FloatParts64 p;
3936
3937 float16_unpack_raw(&p, a);
3938 if (parts_squash_denormal(p, status)) {
3939 return float16_set_sign(float16_zero, p.sign);
3940 }
3941 }
3942 return a;
3943 }
3944
3945 float32 float32_squash_input_denormal(float32 a, float_status *status)
3946 {
3947 if (status->flush_inputs_to_zero) {
3948 FloatParts64 p;
3949
3950 float32_unpack_raw(&p, a);
3951 if (parts_squash_denormal(p, status)) {
3952 return float32_set_sign(float32_zero, p.sign);
3953 }
3954 }
3955 return a;
3956 }
3957
3958 float64 float64_squash_input_denormal(float64 a, float_status *status)
3959 {
3960 if (status->flush_inputs_to_zero) {
3961 FloatParts64 p;
3962
3963 float64_unpack_raw(&p, a);
3964 if (parts_squash_denormal(p, status)) {
3965 return float64_set_sign(float64_zero, p.sign);
3966 }
3967 }
3968 return a;
3969 }
3970
3971 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3972 {
3973 if (status->flush_inputs_to_zero) {
3974 FloatParts64 p;
3975
3976 bfloat16_unpack_raw(&p, a);
3977 if (parts_squash_denormal(p, status)) {
3978 return bfloat16_set_sign(bfloat16_zero, p.sign);
3979 }
3980 }
3981 return a;
3982 }
3983
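/*
 * Usage sketch: with status->flush_inputs_to_zero set, the smallest
 * positive float32 denormal squashes to +0 and raises input_denormal:
 *
 *     float32 x = make_float32(0x00000001);
 *     x = float32_squash_input_denormal(x, status);
 *     // x is now float32_zero; float_flag_input_denormal is raised
 */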
3984 /*----------------------------------------------------------------------------
3985 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3986 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3987 | input. If `zSign' is 1, the input is negated before being converted to an
3988 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3989 | is simply rounded to an integer, with the inexact exception raised if the
3990 | input cannot be represented exactly as an integer. However, if the fixed-
3991 | point input is too large, the invalid exception is raised and the largest
3992 | positive or negative integer is returned.
3993 *----------------------------------------------------------------------------*/
3994
3995 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3996 float_status *status)
3997 {
3998 int8_t roundingMode;
3999 bool roundNearestEven;
4000 int8_t roundIncrement, roundBits;
4001 int32_t z;
4002
4003 roundingMode = status->float_rounding_mode;
4004 roundNearestEven = ( roundingMode == float_round_nearest_even );
4005 switch (roundingMode) {
4006 case float_round_nearest_even:
4007 case float_round_ties_away:
4008 roundIncrement = 0x40;
4009 break;
4010 case float_round_to_zero:
4011 roundIncrement = 0;
4012 break;
4013 case float_round_up:
4014 roundIncrement = zSign ? 0 : 0x7f;
4015 break;
4016 case float_round_down:
4017 roundIncrement = zSign ? 0x7f : 0;
4018 break;
4019 case float_round_to_odd:
4020 roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4021 break;
4022 default:
4023 abort();
4024 }
4025 roundBits = absZ & 0x7F;
4026 absZ = ( absZ + roundIncrement )>>7;
4027 if (!(roundBits ^ 0x40) && roundNearestEven) {
4028 absZ &= ~1;
4029 }
4030 z = absZ;
4031 if ( zSign ) z = - z;
4032 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4033 float_raise(float_flag_invalid, status);
4034 return zSign ? INT32_MIN : INT32_MAX;
4035 }
4036 if (roundBits) {
4037 float_raise(float_flag_inexact, status);
4038 }
4039 return z;
4040
4041 }
4042
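/*
 * Worked example for round-to-nearest-even: the value 2.5 arrives as
 * absZ = 2.5 * 2^7 = 0x140, so roundBits = 0x140 & 0x7F = 0x40, an
 * exact tie. (0x140 + 0x40) >> 7 = 3, and because roundBits == 0x40
 * with roundNearestEven set, the low bit is cleared, giving the even
 * neighbour 2. Since roundBits is nonzero, inexact is raised.
 */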
4043 /*----------------------------------------------------------------------------
4044 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4045 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4046 | and returns the properly rounded 64-bit integer corresponding to the input.
4047 | If `zSign' is 1, the input is negated before being converted to an integer.
4048 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4049 | the inexact exception raised if the input cannot be represented exactly as
4050 | an integer. However, if the fixed-point input is too large, the invalid
4051 | exception is raised and the largest positive or negative integer is
4052 | returned.
4053 *----------------------------------------------------------------------------*/
4054
4055 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4056 float_status *status)
4057 {
4058 int8_t roundingMode;
4059 bool roundNearestEven, increment;
4060 int64_t z;
4061
4062 roundingMode = status->float_rounding_mode;
4063 roundNearestEven = ( roundingMode == float_round_nearest_even );
4064 switch (roundingMode) {
4065 case float_round_nearest_even:
4066 case float_round_ties_away:
4067 increment = ((int64_t) absZ1 < 0);
4068 break;
4069 case float_round_to_zero:
4070 increment = 0;
4071 break;
4072 case float_round_up:
4073 increment = !zSign && absZ1;
4074 break;
4075 case float_round_down:
4076 increment = zSign && absZ1;
4077 break;
4078 case float_round_to_odd:
4079 increment = !(absZ0 & 1) && absZ1;
4080 break;
4081 default:
4082 abort();
4083 }
4084 if ( increment ) {
4085 ++absZ0;
4086 if ( absZ0 == 0 ) goto overflow;
4087 if (!(absZ1 << 1) && roundNearestEven) {
4088 absZ0 &= ~1;
4089 }
4090 }
4091 z = absZ0;
4092 if ( zSign ) z = - z;
4093 if ( z && ( ( z < 0 ) ^ zSign ) ) {
4094 overflow:
4095 float_raise(float_flag_invalid, status);
4096 return zSign ? INT64_MIN : INT64_MAX;
4097 }
4098 if (absZ1) {
4099 float_raise(float_flag_inexact, status);
4100 }
4101 return z;
4102
4103 }
4104
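/*
 * Because the binary point sits between the two input words, the sign
 * bit of absZ1 is exactly the 0.5 bit. Worked example: 2.5 arrives as
 * absZ0 = 2, absZ1 = 0x8000000000000000; nearest-even increments to 3,
 * sees absZ1 << 1 == 0 (an exact tie) and clears the low bit, giving
 * the even neighbour 2, with inexact raised because absZ1 is nonzero.
 */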
4105 /*----------------------------------------------------------------------------
4106 | Normalizes the subnormal single-precision floating-point value represented
4107 | by the denormalized significand `aSig'. The normalized exponent and
4108 | significand are stored at the locations pointed to by `zExpPtr' and
4109 | `zSigPtr', respectively.
4110 *----------------------------------------------------------------------------*/
4111
4112 static void
4113 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4114 {
4115 int8_t shiftCount;
4116
4117 shiftCount = clz32(aSig) - 8;
4118 *zSigPtr = aSig<<shiftCount;
4119 *zExpPtr = 1 - shiftCount;
4120
4121 }
4122
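/*
 * Example: the smallest float32 denormal has aSig = 1, so
 * shiftCount = clz32(1) - 8 = 23, giving *zSigPtr = 0x00800000 (the
 * implicit-bit position) and *zExpPtr = 1 - 23 = -22.
 */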
4123 /*----------------------------------------------------------------------------
4124 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4125 | and significand `zSig', and returns the proper single-precision floating-
4126 | point value corresponding to the abstract input. Ordinarily, the abstract
4127 | value is simply rounded and packed into the single-precision format, with
4128 | the inexact exception raised if the abstract input cannot be represented
4129 | exactly. However, if the abstract value is too large, the overflow and
4130 | inexact exceptions are raised and an infinity or maximal finite value is
4131 | returned. If the abstract value is too small, the input value is rounded to
4132 | a subnormal number, and the underflow and inexact exceptions are raised if
4133 | the abstract input cannot be represented exactly as a subnormal single-
4134 | precision floating-point number.
4135 | The input significand `zSig' has its binary point between bits 30
4136 | and 29, which is 7 bits to the left of the usual location. This shifted
4137 | significand must be normalized or smaller. If `zSig' is not normalized,
4138 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4139 | and it must not require rounding. In the usual case that `zSig' is
4140 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4141 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4142 | Binary Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4144
4145 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4146 float_status *status)
4147 {
4148 int8_t roundingMode;
4149 bool roundNearestEven;
4150 int8_t roundIncrement, roundBits;
4151 bool isTiny;
4152
4153 roundingMode = status->float_rounding_mode;
4154 roundNearestEven = ( roundingMode == float_round_nearest_even );
4155 switch (roundingMode) {
4156 case float_round_nearest_even:
4157 case float_round_ties_away:
4158 roundIncrement = 0x40;
4159 break;
4160 case float_round_to_zero:
4161 roundIncrement = 0;
4162 break;
4163 case float_round_up:
4164 roundIncrement = zSign ? 0 : 0x7f;
4165 break;
4166 case float_round_down:
4167 roundIncrement = zSign ? 0x7f : 0;
4168 break;
4169 case float_round_to_odd:
4170 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4171 break;
4172 default:
4173 abort();
4174 break;
4175 }
4176 roundBits = zSig & 0x7F;
4177 if ( 0xFD <= (uint16_t) zExp ) {
4178 if ( ( 0xFD < zExp )
4179 || ( ( zExp == 0xFD )
4180 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4181 ) {
4182 bool overflow_to_inf = roundingMode != float_round_to_odd &&
4183 roundIncrement != 0;
4184 float_raise(float_flag_overflow | float_flag_inexact, status);
4185 return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4186 }
4187 if ( zExp < 0 ) {
4188 if (status->flush_to_zero) {
4189 float_raise(float_flag_output_denormal, status);
4190 return packFloat32(zSign, 0, 0);
4191 }
4192 isTiny = status->tininess_before_rounding
4193 || (zExp < -1)
4194 || (zSig + roundIncrement < 0x80000000);
4195 shift32RightJamming( zSig, - zExp, &zSig );
4196 zExp = 0;
4197 roundBits = zSig & 0x7F;
4198 if (isTiny && roundBits) {
4199 float_raise(float_flag_underflow, status);
4200 }
4201 if (roundingMode == float_round_to_odd) {
4202 /*
4203 * For round-to-odd case, the roundIncrement depends on
4204 * zSig which just changed.
4205 */
4206 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4207 }
4208 }
4209 }
4210 if (roundBits) {
4211 float_raise(float_flag_inexact, status);
4212 }
4213 zSig = ( zSig + roundIncrement )>>7;
4214 if (!(roundBits ^ 0x40) && roundNearestEven) {
4215 zSig &= ~1;
4216 }
4217 if ( zSig == 0 ) zExp = 0;
4218 return packFloat32( zSign, zExp, zSig );
4219
4220 }
4221
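/*
 * The overflow return above packs either an infinity or the largest
 * finite number with a single expression: -!overflow_to_inf is 0 when
 * overflowing to infinity and 0xFFFFFFFF otherwise. Because packFloat32
 * adds its fields, 0x7F800000 + 0xFFFFFFFF wraps to 0x7F7FFFFF, the
 * encoding of FLT_MAX.
 */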
4222 /*----------------------------------------------------------------------------
4223 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4224 | and significand `zSig', and returns the proper single-precision floating-
4225 | point value corresponding to the abstract input. This routine is just like
4226 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4227 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4228 | floating-point exponent.
4229 *----------------------------------------------------------------------------*/
4230
4231 static float32
4232 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4233 float_status *status)
4234 {
4235 int8_t shiftCount;
4236
4237 shiftCount = clz32(zSig) - 1;
4238 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4239 status);
4240
4241 }
4242
4243 /*----------------------------------------------------------------------------
4244 | Normalizes the subnormal double-precision floating-point value represented
4245 | by the denormalized significand `aSig'. The normalized exponent and
4246 | significand are stored at the locations pointed to by `zExpPtr' and
4247 | `zSigPtr', respectively.
4248 *----------------------------------------------------------------------------*/
4249
4250 static void
4251 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4252 {
4253 int8_t shiftCount;
4254
4255 shiftCount = clz64(aSig) - 11;
4256 *zSigPtr = aSig<<shiftCount;
4257 *zExpPtr = 1 - shiftCount;
4258
4259 }
4260
4261 /*----------------------------------------------------------------------------
4262 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4263 | double-precision floating-point value, returning the result. After being
4264 | shifted into the proper positions, the three fields are simply added
4265 | together to form the result. This means that any integer portion of `zSig'
4266 | will be added into the exponent. Since a properly normalized significand
4267 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4268 | than the desired result exponent whenever `zSig' is a complete, normalized
4269 | significand.
4270 *----------------------------------------------------------------------------*/
4271
4272 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4273 {
4274
4275 return make_float64(
4276 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4277
4278 }
4279
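/*
 * Worked example: packing 1.0. With zSign = 0, a complete normalized
 * significand zSig = 0x0010000000000000 (integer bit set) and
 * zExp = 0x3FE (one less than the true biased exponent 0x3FF), the sum
 * is 0x3FE0000000000000 + 0x0010000000000000 = 0x3FF0000000000000, the
 * double-precision encoding of 1.0: the integer bit of zSig carries
 * into the exponent field exactly as described above.
 */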
4280 /*----------------------------------------------------------------------------
4281 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4282 | and significand `zSig', and returns the proper double-precision floating-
4283 | point value corresponding to the abstract input. Ordinarily, the abstract
4284 | value is simply rounded and packed into the double-precision format, with
4285 | the inexact exception raised if the abstract input cannot be represented
4286 | exactly. However, if the abstract value is too large, the overflow and
4287 | inexact exceptions are raised and an infinity or maximal finite value is
4288 | returned. If the abstract value is too small, the input value is rounded to
4289 | a subnormal number, and the underflow and inexact exceptions are raised if
4290 | the abstract input cannot be represented exactly as a subnormal double-
4291 | precision floating-point number.
4292 | The input significand `zSig' has its binary point between bits 62
4293 | and 61, which is 10 bits to the left of the usual location. This shifted
4294 | significand must be normalized or smaller. If `zSig' is not normalized,
4295 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4296 | and it must not require rounding. In the usual case that `zSig' is
4297 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4298 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4299 | Binary Floating-Point Arithmetic.
4300 *----------------------------------------------------------------------------*/
4301
4302 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4303 float_status *status)
4304 {
4305 int8_t roundingMode;
4306 bool roundNearestEven;
4307 int roundIncrement, roundBits;
4308 bool isTiny;
4309
4310 roundingMode = status->float_rounding_mode;
4311 roundNearestEven = ( roundingMode == float_round_nearest_even );
4312 switch (roundingMode) {
4313 case float_round_nearest_even:
4314 case float_round_ties_away:
4315 roundIncrement = 0x200;
4316 break;
4317 case float_round_to_zero:
4318 roundIncrement = 0;
4319 break;
4320 case float_round_up:
4321 roundIncrement = zSign ? 0 : 0x3ff;
4322 break;
4323 case float_round_down:
4324 roundIncrement = zSign ? 0x3ff : 0;
4325 break;
4326 case float_round_to_odd:
4327 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4328 break;
4329 default:
4330 abort();
4331 }
4332 roundBits = zSig & 0x3FF;
4333 if ( 0x7FD <= (uint16_t) zExp ) {
4334 if ( ( 0x7FD < zExp )
4335 || ( ( zExp == 0x7FD )
4336 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4337 ) {
4338 bool overflow_to_inf = roundingMode != float_round_to_odd &&
4339 roundIncrement != 0;
4340 float_raise(float_flag_overflow | float_flag_inexact, status);
4341 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4342 }
4343 if ( zExp < 0 ) {
4344 if (status->flush_to_zero) {
4345 float_raise(float_flag_output_denormal, status);
4346 return packFloat64(zSign, 0, 0);
4347 }
4348 isTiny = status->tininess_before_rounding
4349 || (zExp < -1)
4350 || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4351 shift64RightJamming( zSig, - zExp, &zSig );
4352 zExp = 0;
4353 roundBits = zSig & 0x3FF;
4354 if (isTiny && roundBits) {
4355 float_raise(float_flag_underflow, status);
4356 }
4357 if (roundingMode == float_round_to_odd) {
4358 /*
4359 * For round-to-odd case, the roundIncrement depends on
4360 * zSig which just changed.
4361 */
4362 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4363 }
4364 }
4365 }
4366 if (roundBits) {
4367 float_raise(float_flag_inexact, status);
4368 }
4369 zSig = ( zSig + roundIncrement )>>10;
4370 if (!(roundBits ^ 0x200) && roundNearestEven) {
4371 zSig &= ~1;
4372 }
4373 if ( zSig == 0 ) zExp = 0;
4374 return packFloat64( zSign, zExp, zSig );
4375
4376 }
4377
4378 /*----------------------------------------------------------------------------
4379 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4380 | and significand `zSig', and returns the proper double-precision floating-
4381 | point value corresponding to the abstract input. This routine is just like
4382 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4383 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4384 | floating-point exponent.
4385 *----------------------------------------------------------------------------*/
4386
4387 static float64
4388 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4389 float_status *status)
4390 {
4391 int8_t shiftCount;
4392
4393 shiftCount = clz64(zSig) - 1;
4394 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4395 status);
4396
4397 }
4398
4399 /*----------------------------------------------------------------------------
4400 | Normalizes the subnormal extended double-precision floating-point value
4401 | represented by the denormalized significand `aSig'. The normalized exponent
4402 | and significand are stored at the locations pointed to by `zExpPtr' and
4403 | `zSigPtr', respectively.
4404 *----------------------------------------------------------------------------*/
4405
4406 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4407 uint64_t *zSigPtr)
4408 {
4409 int8_t shiftCount;
4410
4411 shiftCount = clz64(aSig);
4412 *zSigPtr = aSig<<shiftCount;
4413 *zExpPtr = 1 - shiftCount;
4414 }
4415
4416 /*----------------------------------------------------------------------------
4417 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4418 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4419 | and returns the proper extended double-precision floating-point value
4420 | corresponding to the abstract input. Ordinarily, the abstract value is
4421 | rounded and packed into the extended double-precision format, with the
4422 | inexact exception raised if the abstract input cannot be represented
4423 | exactly. However, if the abstract value is too large, the overflow and
4424 | inexact exceptions are raised and an infinity or maximal finite value is
4425 | returned. If the abstract value is too small, the input value is rounded to
4426 | a subnormal number, and the underflow and inexact exceptions are raised if
4427 | the abstract input cannot be represented exactly as a subnormal extended
4428 | double-precision floating-point number.
4429 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4430 | number of bits as single or double precision, respectively. Otherwise, the
4431 | result is rounded to the full precision of the extended double-precision
4432 | format.
4433 | The input significand must be normalized or smaller. If the input
4434 | significand is not normalized, `zExp' must be 0; in that case, the result
4435 | returned is a subnormal number, and it must not require rounding. The
4436 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic.
4438 *----------------------------------------------------------------------------*/
4439
4440 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4441 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4442 float_status *status)
4443 {
4444 int8_t roundingMode;
4445 bool roundNearestEven, increment, isTiny;
4446 int64_t roundIncrement, roundMask, roundBits;
4447
4448 roundingMode = status->float_rounding_mode;
4449 roundNearestEven = ( roundingMode == float_round_nearest_even );
4450 if ( roundingPrecision == 80 ) goto precision80;
4451 if ( roundingPrecision == 64 ) {
4452 roundIncrement = UINT64_C(0x0000000000000400);
4453 roundMask = UINT64_C(0x00000000000007FF);
4454 }
4455 else if ( roundingPrecision == 32 ) {
4456 roundIncrement = UINT64_C(0x0000008000000000);
4457 roundMask = UINT64_C(0x000000FFFFFFFFFF);
4458 }
4459 else {
4460 goto precision80;
4461 }
4462 zSig0 |= ( zSig1 != 0 );
4463 switch (roundingMode) {
4464 case float_round_nearest_even:
4465 case float_round_ties_away:
4466 break;
4467 case float_round_to_zero:
4468 roundIncrement = 0;
4469 break;
4470 case float_round_up:
4471 roundIncrement = zSign ? 0 : roundMask;
4472 break;
4473 case float_round_down:
4474 roundIncrement = zSign ? roundMask : 0;
4475 break;
4476 default:
4477 abort();
4478 }
4479 roundBits = zSig0 & roundMask;
4480 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4481 if ( ( 0x7FFE < zExp )
4482 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4483 ) {
4484 goto overflow;
4485 }
4486 if ( zExp <= 0 ) {
4487 if (status->flush_to_zero) {
4488 float_raise(float_flag_output_denormal, status);
4489 return packFloatx80(zSign, 0, 0);
4490 }
4491 isTiny = status->tininess_before_rounding
4492 || (zExp < 0 )
4493 || (zSig0 <= zSig0 + roundIncrement);
4494 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4495 zExp = 0;
4496 roundBits = zSig0 & roundMask;
4497 if (isTiny && roundBits) {
4498 float_raise(float_flag_underflow, status);
4499 }
4500 if (roundBits) {
4501 float_raise(float_flag_inexact, status);
4502 }
4503 zSig0 += roundIncrement;
4504 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4505 roundIncrement = roundMask + 1;
4506 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4507 roundMask |= roundIncrement;
4508 }
4509 zSig0 &= ~ roundMask;
4510 return packFloatx80( zSign, zExp, zSig0 );
4511 }
4512 }
4513 if (roundBits) {
4514 float_raise(float_flag_inexact, status);
4515 }
4516 zSig0 += roundIncrement;
4517 if ( zSig0 < roundIncrement ) {
4518 ++zExp;
4519 zSig0 = UINT64_C(0x8000000000000000);
4520 }
4521 roundIncrement = roundMask + 1;
4522 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4523 roundMask |= roundIncrement;
4524 }
4525 zSig0 &= ~ roundMask;
4526 if ( zSig0 == 0 ) zExp = 0;
4527 return packFloatx80( zSign, zExp, zSig0 );
4528 precision80:
4529 switch (roundingMode) {
4530 case float_round_nearest_even:
4531 case float_round_ties_away:
4532 increment = ((int64_t)zSig1 < 0);
4533 break;
4534 case float_round_to_zero:
4535 increment = 0;
4536 break;
4537 case float_round_up:
4538 increment = !zSign && zSig1;
4539 break;
4540 case float_round_down:
4541 increment = zSign && zSig1;
4542 break;
4543 default:
4544 abort();
4545 }
4546 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4547 if ( ( 0x7FFE < zExp )
4548 || ( ( zExp == 0x7FFE )
4549 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4550 && increment
4551 )
4552 ) {
4553 roundMask = 0;
4554 overflow:
4555 float_raise(float_flag_overflow | float_flag_inexact, status);
4556 if ( ( roundingMode == float_round_to_zero )
4557 || ( zSign && ( roundingMode == float_round_up ) )
4558 || ( ! zSign && ( roundingMode == float_round_down ) )
4559 ) {
4560 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4561 }
4562 return packFloatx80(zSign,
4563 floatx80_infinity_high,
4564 floatx80_infinity_low);
4565 }
4566 if ( zExp <= 0 ) {
4567 isTiny = status->tininess_before_rounding
4568 || (zExp < 0)
4569 || !increment
4570 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4571 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4572 zExp = 0;
4573 if (isTiny && zSig1) {
4574 float_raise(float_flag_underflow, status);
4575 }
4576 if (zSig1) {
4577 float_raise(float_flag_inexact, status);
4578 }
4579 switch (roundingMode) {
4580 case float_round_nearest_even:
4581 case float_round_ties_away:
4582 increment = ((int64_t)zSig1 < 0);
4583 break;
4584 case float_round_to_zero:
4585 increment = 0;
4586 break;
4587 case float_round_up:
4588 increment = !zSign && zSig1;
4589 break;
4590 case float_round_down:
4591 increment = zSign && zSig1;
4592 break;
4593 default:
4594 abort();
4595 }
4596 if ( increment ) {
4597 ++zSig0;
4598 if (!(zSig1 << 1) && roundNearestEven) {
4599 zSig0 &= ~1;
4600 }
4601 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4602 }
4603 return packFloatx80( zSign, zExp, zSig0 );
4604 }
4605 }
4606 if (zSig1) {
4607 float_raise(float_flag_inexact, status);
4608 }
4609 if ( increment ) {
4610 ++zSig0;
4611 if ( zSig0 == 0 ) {
4612 ++zExp;
4613 zSig0 = UINT64_C(0x8000000000000000);
4614 }
4615 else {
4616 if (!(zSig1 << 1) && roundNearestEven) {
4617 zSig0 &= ~1;
4618 }
4619 }
4620 }
4621 else {
4622 if ( zSig0 == 0 ) zExp = 0;
4623 }
4624 return packFloatx80( zSign, zExp, zSig0 );
4625
4626 }
4627
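/*
 * The two reduced-precision modes above only change which significand
 * bits survive rounding: roundMask 0x7FF discards the low 11 bits,
 * keeping 53 bits as in double precision, while roundMask
 * 0xFFFFFFFFFF discards the low 40 bits, keeping 24 bits as in single
 * precision. The exponent keeps the full extended-precision range in
 * both cases.
 */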
4628 /*----------------------------------------------------------------------------
4629 | Takes an abstract floating-point value having sign `zSign', exponent
4630 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4631 | and returns the proper extended double-precision floating-point value
4632 | corresponding to the abstract input. This routine is just like
4633 | `roundAndPackFloatx80' except that the input significand does not have to be
4634 | normalized.
4635 *----------------------------------------------------------------------------*/
4636
4637 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4638 bool zSign, int32_t zExp,
4639 uint64_t zSig0, uint64_t zSig1,
4640 float_status *status)
4641 {
4642 int8_t shiftCount;
4643
4644 if ( zSig0 == 0 ) {
4645 zSig0 = zSig1;
4646 zSig1 = 0;
4647 zExp -= 64;
4648 }
4649 shiftCount = clz64(zSig0);
4650 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4651 zExp -= shiftCount;
4652 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4653 zSig0, zSig1, status);
4654
4655 }
4656
4657 /*----------------------------------------------------------------------------
4658 | Returns the least-significant 64 fraction bits of the quadruple-precision
4659 | floating-point value `a'.
4660 *----------------------------------------------------------------------------*/
4661
4662 static inline uint64_t extractFloat128Frac1( float128 a )
4663 {
4664
4665 return a.low;
4666
4667 }
4668
4669 /*----------------------------------------------------------------------------
4670 | Returns the most-significant 48 fraction bits of the quadruple-precision
4671 | floating-point value `a'.
4672 *----------------------------------------------------------------------------*/
4673
4674 static inline uint64_t extractFloat128Frac0( float128 a )
4675 {
4676
4677 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4678
4679 }
4680
4681 /*----------------------------------------------------------------------------
4682 | Returns the exponent bits of the quadruple-precision floating-point value
4683 | `a'.
4684 *----------------------------------------------------------------------------*/
4685
4686 static inline int32_t extractFloat128Exp( float128 a )
4687 {
4688
4689 return ( a.high>>48 ) & 0x7FFF;
4690
4691 }
4692
4693 /*----------------------------------------------------------------------------
4694 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4695 *----------------------------------------------------------------------------*/
4696
4697 static inline bool extractFloat128Sign(float128 a)
4698 {
4699 return a.high >> 63;
4700 }
4701
4702 /*----------------------------------------------------------------------------
4703 | Normalizes the subnormal quadruple-precision floating-point value
4704 | represented by the denormalized significand formed by the concatenation of
4705 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4706 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4707 | significand are stored at the location pointed to by `zSig0Ptr', and the
4708 | least significant 64 bits of the normalized significand are stored at the
4709 | location pointed to by `zSig1Ptr'.
4710 *----------------------------------------------------------------------------*/
4711
4712 static void
4713 normalizeFloat128Subnormal(
4714 uint64_t aSig0,
4715 uint64_t aSig1,
4716 int32_t *zExpPtr,
4717 uint64_t *zSig0Ptr,
4718 uint64_t *zSig1Ptr
4719 )
4720 {
4721 int8_t shiftCount;
4722
4723 if ( aSig0 == 0 ) {
4724 shiftCount = clz64(aSig1) - 15;
4725 if ( shiftCount < 0 ) {
4726 *zSig0Ptr = aSig1>>( - shiftCount );
4727 *zSig1Ptr = aSig1<<( shiftCount & 63 );
4728 }
4729 else {
4730 *zSig0Ptr = aSig1<<shiftCount;
4731 *zSig1Ptr = 0;
4732 }
4733 *zExpPtr = - shiftCount - 63;
4734 }
4735 else {
4736 shiftCount = clz64(aSig0) - 15;
4737 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4738 *zExpPtr = 1 - shiftCount;
4739 }
4740
4741 }
4742
4743 /*----------------------------------------------------------------------------
4744 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4745 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4746 | floating-point value, returning the result. After being shifted into the
4747 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4748 | added together to form the most significant 32 bits of the result. This
4749 | means that any integer portion of `zSig0' will be added into the exponent.
4750 | Since a properly normalized significand will have an integer portion equal
4751 | to 1, the `zExp' input should be 1 less than the desired result exponent
4752 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4753 | significand.
4754 *----------------------------------------------------------------------------*/
4755
4756 static inline float128
4757 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4758 {
4759 float128 z;
4760
4761 z.low = zSig1;
4762 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4763 return z;
4764 }
4765
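/*
 * As with packFloat64, a normalized input uses a zExp one less than the
 * true exponent: packing 1.0 with zExp = 0x3FFE and
 * zSig0 = 0x0001000000000000 (integer bit at bit 48) sums to
 * z.high = 0x3FFF000000000000, the quadruple-precision encoding of 1.0.
 */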
4766 /*----------------------------------------------------------------------------
4767 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4768 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4769 | and `zSig2', and returns the proper quadruple-precision floating-point value
4770 | corresponding to the abstract input. Ordinarily, the abstract value is
4771 | simply rounded and packed into the quadruple-precision format, with the
4772 | inexact exception raised if the abstract input cannot be represented
4773 | exactly. However, if the abstract value is too large, the overflow and
4774 | inexact exceptions are raised and an infinity or maximal finite value is
4775 | returned. If the abstract value is too small, the input value is rounded to
4776 | a subnormal number, and the underflow and inexact exceptions are raised if
4777 | the abstract input cannot be represented exactly as a subnormal quadruple-
4778 | precision floating-point number.
4779 | The input significand must be normalized or smaller. If the input
4780 | significand is not normalized, `zExp' must be 0; in that case, the result
4781 | returned is a subnormal number, and it must not require rounding. In the
4782 | usual case that the input significand is normalized, `zExp' must be 1 less
4783 | than the ``true'' floating-point exponent. The handling of underflow and
4784 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4785 *----------------------------------------------------------------------------*/
4786
4787 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4788 uint64_t zSig0, uint64_t zSig1,
4789 uint64_t zSig2, float_status *status)
4790 {
4791 int8_t roundingMode;
4792 bool roundNearestEven, increment, isTiny;
4793
4794 roundingMode = status->float_rounding_mode;
4795 roundNearestEven = ( roundingMode == float_round_nearest_even );
4796 switch (roundingMode) {
4797 case float_round_nearest_even:
4798 case float_round_ties_away:
4799 increment = ((int64_t)zSig2 < 0);
4800 break;
4801 case float_round_to_zero:
4802 increment = 0;
4803 break;
4804 case float_round_up:
4805 increment = !zSign && zSig2;
4806 break;
4807 case float_round_down:
4808 increment = zSign && zSig2;
4809 break;
4810 case float_round_to_odd:
4811 increment = !(zSig1 & 0x1) && zSig2;
4812 break;
4813 default:
4814 abort();
4815 }
4816 if ( 0x7FFD <= (uint32_t) zExp ) {
4817 if ( ( 0x7FFD < zExp )
4818 || ( ( zExp == 0x7FFD )
4819 && eq128(
4820 UINT64_C(0x0001FFFFFFFFFFFF),
4821 UINT64_C(0xFFFFFFFFFFFFFFFF),
4822 zSig0,
4823 zSig1
4824 )
4825 && increment
4826 )
4827 ) {
4828 float_raise(float_flag_overflow | float_flag_inexact, status);
4829 if ( ( roundingMode == float_round_to_zero )
4830 || ( zSign && ( roundingMode == float_round_up ) )
4831 || ( ! zSign && ( roundingMode == float_round_down ) )
4832 || (roundingMode == float_round_to_odd)
4833 ) {
4834 return
4835 packFloat128(
4836 zSign,
4837 0x7FFE,
4838 UINT64_C(0x0000FFFFFFFFFFFF),
4839 UINT64_C(0xFFFFFFFFFFFFFFFF)
4840 );
4841 }
4842 return packFloat128( zSign, 0x7FFF, 0, 0 );
4843 }
4844 if ( zExp < 0 ) {
4845 if (status->flush_to_zero) {
4846 float_raise(float_flag_output_denormal, status);
4847 return packFloat128(zSign, 0, 0, 0);
4848 }
4849 isTiny = status->tininess_before_rounding
4850 || (zExp < -1)
4851 || !increment
4852 || lt128(zSig0, zSig1,
4853 UINT64_C(0x0001FFFFFFFFFFFF),
4854 UINT64_C(0xFFFFFFFFFFFFFFFF));
4855 shift128ExtraRightJamming(
4856 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4857 zExp = 0;
4858 if (isTiny && zSig2) {
4859 float_raise(float_flag_underflow, status);
4860 }
4861 switch (roundingMode) {
4862 case float_round_nearest_even:
4863 case float_round_ties_away:
4864 increment = ((int64_t)zSig2 < 0);
4865 break;
4866 case float_round_to_zero:
4867 increment = 0;
4868 break;
4869 case float_round_up:
4870 increment = !zSign && zSig2;
4871 break;
4872 case float_round_down:
4873 increment = zSign && zSig2;
4874 break;
4875 case float_round_to_odd:
4876 increment = !(zSig1 & 0x1) && zSig2;
4877 break;
4878 default:
4879 abort();
4880 }
4881 }
4882 }
4883 if (zSig2) {
4884 float_raise(float_flag_inexact, status);
4885 }
4886 if ( increment ) {
4887 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4888 if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4889 zSig1 &= ~1;
4890 }
4891 }
4892 else {
4893 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4894 }
4895 return packFloat128( zSign, zExp, zSig0, zSig1 );
4896
4897 }
4898
4899 /*----------------------------------------------------------------------------
4900 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4901 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4902 | returns the proper quadruple-precision floating-point value corresponding
4903 | to the abstract input. This routine is just like `roundAndPackFloat128'
4904 | except that the input significand has fewer bits and does not have to be
4905 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4906 | point exponent.
4907 *----------------------------------------------------------------------------*/
4908
4909 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4910 uint64_t zSig0, uint64_t zSig1,
4911 float_status *status)
4912 {
4913 int8_t shiftCount;
4914 uint64_t zSig2;
4915
4916 if ( zSig0 == 0 ) {
4917 zSig0 = zSig1;
4918 zSig1 = 0;
4919 zExp -= 64;
4920 }
4921 shiftCount = clz64(zSig0) - 15;
4922 if ( 0 <= shiftCount ) {
4923 zSig2 = 0;
4924 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4925 }
4926 else {
4927 shift128ExtraRightJamming(
4928 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4929 }
4930 zExp -= shiftCount;
4931 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4932
4933 }
4934
4935
4936 /*----------------------------------------------------------------------------
4937 | Returns the result of converting the 32-bit two's complement integer `a'
4938 | to the extended double-precision floating-point format. The conversion
4939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4940 | Arithmetic.
4941 *----------------------------------------------------------------------------*/
4942
4943 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4944 {
4945 bool zSign;
4946 uint32_t absA;
4947 int8_t shiftCount;
4948 uint64_t zSig;
4949
4950 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4951 zSign = ( a < 0 );
4952 absA = zSign ? - a : a;
4953 shiftCount = clz32(absA) + 32;
4954 zSig = absA;
4955 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4956
4957 }
4958
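/*
 * Example: a = 1 gives shiftCount = clz32(1) + 32 = 63, so the result
 * is packFloatx80(0, 0x403E - 63 = 0x3FFF, 1ULL << 63): the exponent at
 * the bias with the explicit integer bit set, i.e. extended-precision
 * 1.0. Unlike float32/float64, floatx80 stores its integer bit
 * explicitly, hence the shift all the way up to bit 63.
 */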
4959 /*----------------------------------------------------------------------------
4960 | Returns the result of converting the 32-bit two's complement integer `a' to
4961 | the quadruple-precision floating-point format. The conversion is performed
4962 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4963 *----------------------------------------------------------------------------*/
4964
4965 float128 int32_to_float128(int32_t a, float_status *status)
4966 {
4967 bool zSign;
4968 uint32_t absA;
4969 int8_t shiftCount;
4970 uint64_t zSig0;
4971
4972 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4973 zSign = ( a < 0 );
4974 absA = zSign ? - a : a;
4975 shiftCount = clz32(absA) + 17;
4976 zSig0 = absA;
4977 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4978
4979 }
4980
4981 /*----------------------------------------------------------------------------
4982 | Returns the result of converting the 64-bit two's complement integer `a'
4983 | to the extended double-precision floating-point format. The conversion
4984 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4985 | Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4989 {
4990 bool zSign;
4991 uint64_t absA;
4992 int8_t shiftCount;
4993
4994 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4995 zSign = ( a < 0 );
4996 absA = zSign ? - a : a;
4997 shiftCount = clz64(absA);
4998 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4999
5000 }
5001
5002 /*----------------------------------------------------------------------------
5003 | Returns the result of converting the 64-bit two's complement integer `a' to
5004 | the quadruple-precision floating-point format. The conversion is performed
5005 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5006 *----------------------------------------------------------------------------*/
5007
5008 float128 int64_to_float128(int64_t a, float_status *status)
5009 {
5010 bool zSign;
5011 uint64_t absA;
5012 int8_t shiftCount;
5013 int32_t zExp;
5014 uint64_t zSig0, zSig1;
5015
5016 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5017 zSign = ( a < 0 );
5018 absA = zSign ? - a : a;
5019 shiftCount = clz64(absA) + 49;
5020 zExp = 0x406E - shiftCount;
5021 if ( 64 <= shiftCount ) {
5022 zSig1 = 0;
5023 zSig0 = absA;
5024 shiftCount -= 64;
5025 }
5026 else {
5027 zSig1 = absA;
5028 zSig0 = 0;
5029 }
5030 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5031 return packFloat128( zSign, zExp, zSig0, zSig1 );
5032
5033 }
5034
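/*
 * Example: a = 1 gives shiftCount = clz64(1) + 49 = 112 and
 * zExp = 0x406E - 112 = 0x3FFE. Since 112 >= 64, the significand starts
 * in zSig0 and the remaining left shift of 48 lands the integer bit at
 * bit 48 of zSig0, so packFloat128 yields high = 0x3FFF000000000000,
 * i.e. quadruple-precision 1.0.
 */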
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the 64-bit unsigned integer `a'
5037 | to the quadruple-precision floating-point format. The conversion is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5040
5041 float128 uint64_to_float128(uint64_t a, float_status *status)
5042 {
5043 if (a == 0) {
5044 return float128_zero;
5045 }
5046 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5047 }
5048
5049 /*----------------------------------------------------------------------------
5050 | Returns the result of converting the single-precision floating-point value
5051 | `a' to the extended double-precision floating-point format. The conversion
5052 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5053 | Arithmetic.
5054 *----------------------------------------------------------------------------*/
5055
5056 floatx80 float32_to_floatx80(float32 a, float_status *status)
5057 {
5058 bool aSign;
5059 int aExp;
5060 uint32_t aSig;
5061
5062 a = float32_squash_input_denormal(a, status);
5063 aSig = extractFloat32Frac( a );
5064 aExp = extractFloat32Exp( a );
5065 aSign = extractFloat32Sign( a );
5066 if ( aExp == 0xFF ) {
5067 if (aSig) {
5068 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5069 status);
5070 return floatx80_silence_nan(res, status);
5071 }
5072 return packFloatx80(aSign,
5073 floatx80_infinity_high,
5074 floatx80_infinity_low);
5075 }
5076 if ( aExp == 0 ) {
5077 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5078 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5079 }
5080 aSig |= 0x00800000;
5081 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5082
5083 }
5084
5085 /*----------------------------------------------------------------------------
5086 | Returns the remainder of the single-precision floating-point value `a'
5087 | with respect to the corresponding value `b'. The operation is performed
5088 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5089 *----------------------------------------------------------------------------*/
5090
5091 float32 float32_rem(float32 a, float32 b, float_status *status)
5092 {
5093 bool aSign, zSign;
5094 int aExp, bExp, expDiff;
5095 uint32_t aSig, bSig;
5096 uint32_t q;
5097 uint64_t aSig64, bSig64, q64;
5098 uint32_t alternateASig;
5099 int32_t sigMean;
5100 a = float32_squash_input_denormal(a, status);
5101 b = float32_squash_input_denormal(b, status);
5102
5103 aSig = extractFloat32Frac( a );
5104 aExp = extractFloat32Exp( a );
5105 aSign = extractFloat32Sign( a );
5106 bSig = extractFloat32Frac( b );
5107 bExp = extractFloat32Exp( b );
5108 if ( aExp == 0xFF ) {
5109 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5110 return propagateFloat32NaN(a, b, status);
5111 }
5112 float_raise(float_flag_invalid, status);
5113 return float32_default_nan(status);
5114 }
5115 if ( bExp == 0xFF ) {
5116 if (bSig) {
5117 return propagateFloat32NaN(a, b, status);
5118 }
5119 return a;
5120 }
5121 if ( bExp == 0 ) {
5122 if ( bSig == 0 ) {
5123 float_raise(float_flag_invalid, status);
5124 return float32_default_nan(status);
5125 }
5126 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5127 }
5128 if ( aExp == 0 ) {
5129 if ( aSig == 0 ) return a;
5130 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5131 }
5132 expDiff = aExp - bExp;
5133 aSig |= 0x00800000;
5134 bSig |= 0x00800000;
5135 if ( expDiff < 32 ) {
5136 aSig <<= 8;
5137 bSig <<= 8;
5138 if ( expDiff < 0 ) {
5139 if ( expDiff < -1 ) return a;
5140 aSig >>= 1;
5141 }
5142 q = ( bSig <= aSig );
5143 if ( q ) aSig -= bSig;
5144 if ( 0 < expDiff ) {
5145 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5146 q >>= 32 - expDiff;
5147 bSig >>= 2;
5148 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5149 }
5150 else {
5151 aSig >>= 2;
5152 bSig >>= 2;
5153 }
5154 }
5155 else {
5156 if ( bSig <= aSig ) aSig -= bSig;
5157 aSig64 = ( (uint64_t) aSig )<<40;
5158 bSig64 = ( (uint64_t) bSig )<<40;
5159 expDiff -= 64;
5160 while ( 0 < expDiff ) {
5161 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5162 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5163 aSig64 = - ( ( bSig * q64 )<<38 );
5164 expDiff -= 62;
5165 }
5166 expDiff += 64;
5167 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5168 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5169 q = q64>>( 64 - expDiff );
5170 bSig <<= 6;
5171 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5172 }
5173 do {
5174 alternateASig = aSig;
5175 ++q;
5176 aSig -= bSig;
5177 } while ( 0 <= (int32_t) aSig );
5178 sigMean = aSig + alternateASig;
5179 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5180 aSig = alternateASig;
5181 }
5182 zSign = ( (int32_t) aSig < 0 );
5183 if ( zSign ) aSig = - aSig;
5184 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5185 }
5186
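/*
 * Note the IEEE remainder differs from C's fmod: the quotient is
 * rounded to the nearest integer (ties to even), so the remainder can
 * be negative even for positive operands. For example,
 * float32_rem(5.0, 3.0) returns -1.0, because 5/3 rounds to 2 and
 * 5 - 2*3 = -1, whereas fmodf(5.0f, 3.0f) returns 2.0.
 */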
5187
5188
5189 /*----------------------------------------------------------------------------
5190 | Returns the binary exponential of the single-precision floating-point value
5191 | `a'. The operation is performed according to the IEC/IEEE Standard for
5192 | Binary Floating-Point Arithmetic.
5193 |
5194 | Uses the following identities:
5195 |
5196 | 1. -------------------------------------------------------------------------
5197 | x x*ln(2)
5198 | 2 = e
5199 |
5200 | 2. -------------------------------------------------------------------------
5201 | 2 3 4 5 n
5202 | x x x x x x x
5203 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5204 | 1! 2! 3! 4! 5! n!
5205 *----------------------------------------------------------------------------*/
5206
5207 static const float64 float32_exp2_coefficients[15] =
5208 {
5209 const_float64( 0x3ff0000000000000ll ), /* 1 */
5210 const_float64( 0x3fe0000000000000ll ), /* 2 */
5211 const_float64( 0x3fc5555555555555ll ), /* 3 */
5212 const_float64( 0x3fa5555555555555ll ), /* 4 */
5213 const_float64( 0x3f81111111111111ll ), /* 5 */
5214 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
5215 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
5216 const_float64( 0x3efa01a01a01a01all ), /* 8 */
5217 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
5218 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5219 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5220 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5221 const_float64( 0x3de6124613a86d09ll ), /* 13 */
5222 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5223 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5224 };
5225
5226 float32 float32_exp2(float32 a, float_status *status)
5227 {
5228 bool aSign;
5229 int aExp;
5230 uint32_t aSig;
5231 float64 r, x, xn;
5232 int i;
5233 a = float32_squash_input_denormal(a, status);
5234
5235 aSig = extractFloat32Frac( a );
5236 aExp = extractFloat32Exp( a );
5237 aSign = extractFloat32Sign( a );
5238
5239 if ( aExp == 0xFF) {
5240 if (aSig) {
5241 return propagateFloat32NaN(a, float32_zero, status);
5242 }
5243 return (aSign) ? float32_zero : a;
5244 }
5245 if (aExp == 0) {
5246 if (aSig == 0) return float32_one;
5247 }
5248
5249 float_raise(float_flag_inexact, status);
5250
5251 /* ******************************* */
5252 /* using float64 for approximation */
5253 /* ******************************* */
5254 x = float32_to_float64(a, status);
5255 x = float64_mul(x, float64_ln2, status);
5256
5257 xn = x;
5258 r = float64_one;
5259 for (i = 0 ; i < 15 ; i++) {
5260 float64 f;
5261
5262 f = float64_mul(xn, float32_exp2_coefficients[i], status);
5263 r = float64_add(r, f, status);
5264
5265 xn = float64_mul(xn, x, status);
5266 }
5267
5268 return float64_to_float32(r, status);
5269 }
5270
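/*
 * The loop above evaluates the truncated Maclaurin series directly:
 * coefficient i holds 1/(i+1)! and xn steps through x^(i+1), so on exit
 *
 *              15
 *         x   ---    n
 *    r ~ e  = \     x / n!       with x = a * ln(2),
 *             /
 *             ---
 *             n=0
 *
 * computed in float64 to leave rounding headroom over float32's 24-bit
 * significand.
 */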
5271 /*----------------------------------------------------------------------------
5272 | Returns the binary log of the single-precision floating-point value `a'.
5273 | The operation is performed according to the IEC/IEEE Standard for Binary
5274 | Floating-Point Arithmetic.
5275 *----------------------------------------------------------------------------*/
5276 float32 float32_log2(float32 a, float_status *status)
5277 {
5278 bool aSign, zSign;
5279 int aExp;
5280 uint32_t aSig, zSig, i;
5281
5282 a = float32_squash_input_denormal(a, status);
5283 aSig = extractFloat32Frac( a );
5284 aExp = extractFloat32Exp( a );
5285 aSign = extractFloat32Sign( a );
5286
5287 if ( aExp == 0 ) {
5288 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5289 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5290 }
5291 if ( aSign ) {
5292 float_raise(float_flag_invalid, status);
5293 return float32_default_nan(status);
5294 }
5295 if ( aExp == 0xFF ) {
5296 if (aSig) {
5297 return propagateFloat32NaN(a, float32_zero, status);
5298 }
5299 return a;
5300 }
5301
5302 aExp -= 0x7F;
5303 aSig |= 0x00800000;
5304 zSign = aExp < 0;
5305 zSig = aExp << 23;
5306
5307 for (i = 1 << 22; i > 0; i >>= 1) {
5308 aSig = ( (uint64_t)aSig * aSig ) >> 23;
5309 if ( aSig & 0x01000000 ) {
5310 aSig >>= 1;
5311 zSig |= i;
5312 }
5313 }
5314
5315 if ( zSign )
5316 zSig = -zSig;
5317
5318 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5319 }
5320
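/*
 * This is the classic shift-and-square binary logarithm: squaring the
 * significand doubles its log2, so each iteration extracts one more
 * fraction bit. Worked example for a significand of 1.5:
 *
 *     1.5^2   = 2.25  >= 2  ->  bit 1, continue with 2.25/2 = 1.125
 *     1.125^2 = 1.266 <  2  ->  bit 0
 *     1.266^2 = 1.602 <  2  ->  bit 0
 *     1.602^2 = 2.567 >= 2  ->  bit 1, ...
 *
 * matching log2(1.5) = 0.1001... in binary (~0.585).
 */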
5321 /*----------------------------------------------------------------------------
5322 | Returns the result of converting the double-precision floating-point value
5323 | `a' to the extended double-precision floating-point format. The conversion
5324 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5325 | Arithmetic.
5326 *----------------------------------------------------------------------------*/
5327
5328 floatx80 float64_to_floatx80(float64 a, float_status *status)
5329 {
5330 bool aSign;
5331 int aExp;
5332 uint64_t aSig;
5333
5334 a = float64_squash_input_denormal(a, status);
5335 aSig = extractFloat64Frac( a );
5336 aExp = extractFloat64Exp( a );
5337 aSign = extractFloat64Sign( a );
5338 if ( aExp == 0x7FF ) {
5339 if (aSig) {
5340 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5341 status);
5342 return floatx80_silence_nan(res, status);
5343 }
5344 return packFloatx80(aSign,
5345 floatx80_infinity_high,
5346 floatx80_infinity_low);
5347 }
5348 if ( aExp == 0 ) {
5349 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5350 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5351 }
5352 return
5353 packFloatx80(
5354 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5355
5356 }
5357
5358 /*----------------------------------------------------------------------------
5359 | Returns the remainder of the double-precision floating-point value `a'
5360 | with respect to the corresponding value `b'. The operation is performed
5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5363
5364 float64 float64_rem(float64 a, float64 b, float_status *status)
5365 {
5366 bool aSign, zSign;
5367 int aExp, bExp, expDiff;
5368 uint64_t aSig, bSig;
5369 uint64_t q, alternateASig;
5370 int64_t sigMean;
5371
5372 a = float64_squash_input_denormal(a, status);
5373 b = float64_squash_input_denormal(b, status);
5374 aSig = extractFloat64Frac( a );
5375 aExp = extractFloat64Exp( a );
5376 aSign = extractFloat64Sign( a );
5377 bSig = extractFloat64Frac( b );
5378 bExp = extractFloat64Exp( b );
5379 if ( aExp == 0x7FF ) {
5380 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5381 return propagateFloat64NaN(a, b, status);
5382 }
5383 float_raise(float_flag_invalid, status);
5384 return float64_default_nan(status);
5385 }
5386 if ( bExp == 0x7FF ) {
5387 if (bSig) {
5388 return propagateFloat64NaN(a, b, status);
5389 }
5390 return a;
5391 }
5392 if ( bExp == 0 ) {
5393 if ( bSig == 0 ) {
5394 float_raise(float_flag_invalid, status);
5395 return float64_default_nan(status);
5396 }
5397 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5398 }
5399 if ( aExp == 0 ) {
5400 if ( aSig == 0 ) return a;
5401 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5402 }
5403 expDiff = aExp - bExp;
5404 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5405 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5406 if ( expDiff < 0 ) {
5407 if ( expDiff < -1 ) return a;
5408 aSig >>= 1;
5409 }
5410 q = ( bSig <= aSig );
5411 if ( q ) aSig -= bSig;
5412 expDiff -= 64;
5413 while ( 0 < expDiff ) {
5414 q = estimateDiv128To64( aSig, 0, bSig );
5415 q = ( 2 < q ) ? q - 2 : 0;
5416 aSig = - ( ( bSig>>2 ) * q );
5417 expDiff -= 62;
5418 }
5419 expDiff += 64;
5420 if ( 0 < expDiff ) {
5421 q = estimateDiv128To64( aSig, 0, bSig );
5422 q = ( 2 < q ) ? q - 2 : 0;
5423 q >>= 64 - expDiff;
5424 bSig >>= 2;
5425 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5426 }
5427 else {
5428 aSig >>= 2;
5429 bSig >>= 2;
5430 }
5431 do {
5432 alternateASig = aSig;
5433 ++q;
5434 aSig -= bSig;
5435 } while ( 0 <= (int64_t) aSig );
5436 sigMean = aSig + alternateASig;
5437 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5438 aSig = alternateASig;
5439 }
5440 zSign = ( (int64_t) aSig < 0 );
5441 if ( zSign ) aSig = - aSig;
5442 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5443
5444 }
5445
5446 /*----------------------------------------------------------------------------
5447 | Returns the binary log of the double-precision floating-point value `a'.
5448 | The operation is performed according to the IEC/IEEE Standard for Binary
5449 | Floating-Point Arithmetic.
5450 *----------------------------------------------------------------------------*/
5451 float64 float64_log2(float64 a, float_status *status)
5452 {
5453 bool aSign, zSign;
5454 int aExp;
5455 uint64_t aSig, aSig0, aSig1, zSig, i;
5456 a = float64_squash_input_denormal(a, status);
5457
5458 aSig = extractFloat64Frac( a );
5459 aExp = extractFloat64Exp( a );
5460 aSign = extractFloat64Sign( a );
5461
5462 if ( aExp == 0 ) {
5463 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5464 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5465 }
5466 if ( aSign ) {
5467 float_raise(float_flag_invalid, status);
5468 return float64_default_nan(status);
5469 }
5470 if ( aExp == 0x7FF ) {
5471 if (aSig) {
5472 return propagateFloat64NaN(a, float64_zero, status);
5473 }
5474 return a;
5475 }
5476
5477 aExp -= 0x3FF;
5478 aSig |= UINT64_C(0x0010000000000000);
5479 zSign = aExp < 0;
5480 zSig = (uint64_t)aExp << 52;
5481 for (i = 1LL << 51; i > 0; i >>= 1) {
5482 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5483 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5484 if ( aSig & UINT64_C(0x0020000000000000) ) {
5485 aSig >>= 1;
5486 zSig |= i;
5487 }
5488 }
5489
5490 if ( zSign )
5491 zSig = -zSig;
5492 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5493 }
5494
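/*----------------------------------------------------------------------------
| Editor's addition: the loop above extracts the fraction bits of log2 by
| repeated squaring. For a significand m in [1,2), squaring doubles its
| logarithm, so log2(m) >= 1/2 exactly when m*m >= 2; each iteration emits
| one result bit and renormalizes. A sketch of the same idea in plain C
| doubles (hypothetical helper; bit-exactness with the code above is not
| implied):
*----------------------------------------------------------------------------*/

static double __attribute__((unused)) example_log2_by_squaring(double m, int bits)
{
    /* Precondition: 1.0 <= m < 2.0, so log2(m) lies in [0, 1). */
    double frac = 0.0;
    for (double bit = 0.5; bits-- > 0; bit *= 0.5) {
        m *= m;               /* log2(m) doubles */
        if (m >= 2.0) {
            m *= 0.5;         /* remove the integer bit of the new log... */
            frac += bit;      /* ...and record it in the result */
        }
    }
    return frac;
}
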
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of converting the extended double-precision floating-
5497 | point value `a' to the 32-bit two's complement integer format. The
5498 | conversion is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic---which means in particular that the conversion
5500 | is rounded according to the current rounding mode. If `a' is a NaN, the
5501 | largest positive integer is returned. Otherwise, if the conversion
5502 | overflows, the largest integer with the same sign as `a' is returned.
5503 *----------------------------------------------------------------------------*/
5504
5505 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5506 {
5507 bool aSign;
5508 int32_t aExp, shiftCount;
5509 uint64_t aSig;
5510
5511 if (floatx80_invalid_encoding(a)) {
5512 float_raise(float_flag_invalid, status);
5513         return INT32_MIN;
5514 }
5515 aSig = extractFloatx80Frac( a );
5516 aExp = extractFloatx80Exp( a );
5517 aSign = extractFloatx80Sign( a );
5518 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5519 shiftCount = 0x4037 - aExp;
5520 if ( shiftCount <= 0 ) shiftCount = 1;
5521 shift64RightJamming( aSig, shiftCount, &aSig );
5522 return roundAndPackInt32(aSign, aSig, status);
5523
5524 }
5525
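/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file.
| The conversion honours the rounding mode in effect, so the same input can
| produce different integers. The helper name is hypothetical.
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_to_int32(void)
{
    float_status st = { 0 };  /* nearest-even */
    floatx80 x = make_floatx80(0x4000, 0xA000000000000000ULL);  /* 2.5 */
    int32_t z = floatx80_to_int32(x, &st);
    /* z == 2: the tie goes to the even integer, and inexact is raised. */
    st.float_rounding_mode = float_round_up;
    z = floatx80_to_int32(x, &st);
    /* z == 3 when rounding toward +infinity. */
    (void)z;
}
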
5526 /*----------------------------------------------------------------------------
5527 | Returns the result of converting the extended double-precision floating-
5528 | point value `a' to the 32-bit two's complement integer format. The
5529 | conversion is performed according to the IEC/IEEE Standard for Binary
5530 | Floating-Point Arithmetic, except that the conversion is always rounded
5531 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5532 | Otherwise, if the conversion overflows, the largest integer with the same
5533 | sign as `a' is returned.
5534 *----------------------------------------------------------------------------*/
5535
5536 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5537 {
5538 bool aSign;
5539 int32_t aExp, shiftCount;
5540 uint64_t aSig, savedASig;
5541 int32_t z;
5542
5543 if (floatx80_invalid_encoding(a)) {
5544 float_raise(float_flag_invalid, status);
5545         return INT32_MIN;
5546 }
5547 aSig = extractFloatx80Frac( a );
5548 aExp = extractFloatx80Exp( a );
5549 aSign = extractFloatx80Sign( a );
5550 if ( 0x401E < aExp ) {
5551 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5552 goto invalid;
5553 }
5554 else if ( aExp < 0x3FFF ) {
5555 if (aExp || aSig) {
5556 float_raise(float_flag_inexact, status);
5557 }
5558 return 0;
5559 }
5560 shiftCount = 0x403E - aExp;
5561 savedASig = aSig;
5562 aSig >>= shiftCount;
5563 z = aSig;
5564 if ( aSign ) z = - z;
5565 if ( ( z < 0 ) ^ aSign ) {
5566 invalid:
5567 float_raise(float_flag_invalid, status);
5568         return aSign ? INT32_MIN : INT32_MAX;
5569 }
5570 if ( ( aSig<<shiftCount ) != savedASig ) {
5571 float_raise(float_flag_inexact, status);
5572 }
5573 return z;
5574
5575 }
5576
5577 /*----------------------------------------------------------------------------
5578 | Returns the result of converting the extended double-precision floating-
5579 | point value `a' to the 64-bit two's complement integer format. The
5580 | conversion is performed according to the IEC/IEEE Standard for Binary
5581 | Floating-Point Arithmetic---which means in particular that the conversion
5582 | is rounded according to the current rounding mode. If `a' is a NaN,
5583 | the largest positive integer is returned. Otherwise, if the conversion
5584 | overflows, the largest integer with the same sign as `a' is returned.
5585 *----------------------------------------------------------------------------*/
5586
5587 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5588 {
5589 bool aSign;
5590 int32_t aExp, shiftCount;
5591 uint64_t aSig, aSigExtra;
5592
5593 if (floatx80_invalid_encoding(a)) {
5594 float_raise(float_flag_invalid, status);
5595         return INT64_MIN;
5596 }
5597 aSig = extractFloatx80Frac( a );
5598 aExp = extractFloatx80Exp( a );
5599 aSign = extractFloatx80Sign( a );
5600 shiftCount = 0x403E - aExp;
5601 if ( shiftCount <= 0 ) {
5602 if ( shiftCount ) {
5603 float_raise(float_flag_invalid, status);
5604 if (!aSign || floatx80_is_any_nan(a)) {
5605 return INT64_MAX;
5606 }
5607 return INT64_MIN;
5608 }
5609 aSigExtra = 0;
5610 }
5611 else {
5612 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5613 }
5614 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5615
5616 }
5617
5618 /*----------------------------------------------------------------------------
5619 | Returns the result of converting the extended double-precision floating-
5620 | point value `a' to the 64-bit two's complement integer format. The
5621 | conversion is performed according to the IEC/IEEE Standard for Binary
5622 | Floating-Point Arithmetic, except that the conversion is always rounded
5623 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5624 | Otherwise, if the conversion overflows, the largest integer with the same
5625 | sign as `a' is returned.
5626 *----------------------------------------------------------------------------*/
5627
5628 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5629 {
5630 bool aSign;
5631 int32_t aExp, shiftCount;
5632 uint64_t aSig;
5633 int64_t z;
5634
5635 if (floatx80_invalid_encoding(a)) {
5636 float_raise(float_flag_invalid, status);
5637         return INT64_MIN;
5638 }
5639 aSig = extractFloatx80Frac( a );
5640 aExp = extractFloatx80Exp( a );
5641 aSign = extractFloatx80Sign( a );
5642 shiftCount = aExp - 0x403E;
5643 if ( 0 <= shiftCount ) {
5644 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5645 if ( ( a.high != 0xC03E ) || aSig ) {
5646 float_raise(float_flag_invalid, status);
5647 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5648 return INT64_MAX;
5649 }
5650 }
5651 return INT64_MIN;
5652 }
5653 else if ( aExp < 0x3FFF ) {
5654 if (aExp | aSig) {
5655 float_raise(float_flag_inexact, status);
5656 }
5657 return 0;
5658 }
5659 z = aSig>>( - shiftCount );
5660 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5661 float_raise(float_flag_inexact, status);
5662 }
5663 if ( aSign ) z = - z;
5664 return z;
5665
5666 }
5667
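/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file.
| Truncation toward zero matches the behaviour of a C integer cast,
| independent of the rounding mode in float_status.
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_to_int64_rtz(void)
{
    float_status st = { 0 };
    floatx80 x = make_floatx80(0xC000, 0xA000000000000000ULL);  /* -2.5 */
    int64_t z = floatx80_to_int64_round_to_zero(x, &st);
    /* z == -2, and inexact is raised because the 0.5 was discarded. */
    (void)z;
}
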
5668 /*----------------------------------------------------------------------------
5669 | Returns the result of converting the extended double-precision floating-
5670 | point value `a' to the single-precision floating-point format. The
5671 | conversion is performed according to the IEC/IEEE Standard for Binary
5672 | Floating-Point Arithmetic.
5673 *----------------------------------------------------------------------------*/
5674
5675 float32 floatx80_to_float32(floatx80 a, float_status *status)
5676 {
5677 bool aSign;
5678 int32_t aExp;
5679 uint64_t aSig;
5680
5681 if (floatx80_invalid_encoding(a)) {
5682 float_raise(float_flag_invalid, status);
5683 return float32_default_nan(status);
5684 }
5685 aSig = extractFloatx80Frac( a );
5686 aExp = extractFloatx80Exp( a );
5687 aSign = extractFloatx80Sign( a );
5688 if ( aExp == 0x7FFF ) {
5689 if ( (uint64_t) ( aSig<<1 ) ) {
5690 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5691 status);
5692 return float32_silence_nan(res, status);
5693 }
5694 return packFloat32( aSign, 0xFF, 0 );
5695 }
5696 shift64RightJamming( aSig, 33, &aSig );
5697 if ( aExp || aSig ) aExp -= 0x3F81;
5698 return roundAndPackFloat32(aSign, aExp, aSig, status);
5699
5700 }
5701
5702 /*----------------------------------------------------------------------------
5703 | Returns the result of converting the extended double-precision floating-
5704 | point value `a' to the double-precision floating-point format. The
5705 | conversion is performed according to the IEC/IEEE Standard for Binary
5706 | Floating-Point Arithmetic.
5707 *----------------------------------------------------------------------------*/
5708
5709 float64 floatx80_to_float64(floatx80 a, float_status *status)
5710 {
5711 bool aSign;
5712 int32_t aExp;
5713 uint64_t aSig, zSig;
5714
5715 if (floatx80_invalid_encoding(a)) {
5716 float_raise(float_flag_invalid, status);
5717 return float64_default_nan(status);
5718 }
5719 aSig = extractFloatx80Frac( a );
5720 aExp = extractFloatx80Exp( a );
5721 aSign = extractFloatx80Sign( a );
5722 if ( aExp == 0x7FFF ) {
5723 if ( (uint64_t) ( aSig<<1 ) ) {
5724 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5725 status);
5726 return float64_silence_nan(res, status);
5727 }
5728 return packFloat64( aSign, 0x7FF, 0 );
5729 }
5730 shift64RightJamming( aSig, 1, &zSig );
5731 if ( aExp || aSig ) aExp -= 0x3C01;
5732 return roundAndPackFloat64(aSign, aExp, zSig, status);
5733
5734 }
5735
5736 /*----------------------------------------------------------------------------
5737 | Returns the result of converting the extended double-precision floating-
5738 | point value `a' to the quadruple-precision floating-point format. The
5739 | conversion is performed according to the IEC/IEEE Standard for Binary
5740 | Floating-Point Arithmetic.
5741 *----------------------------------------------------------------------------*/
5742
5743 float128 floatx80_to_float128(floatx80 a, float_status *status)
5744 {
5745 bool aSign;
5746 int aExp;
5747 uint64_t aSig, zSig0, zSig1;
5748
5749 if (floatx80_invalid_encoding(a)) {
5750 float_raise(float_flag_invalid, status);
5751 return float128_default_nan(status);
5752 }
5753 aSig = extractFloatx80Frac( a );
5754 aExp = extractFloatx80Exp( a );
5755 aSign = extractFloatx80Sign( a );
5756 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5757 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5758 status);
5759 return float128_silence_nan(res, status);
5760 }
5761 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5762 return packFloat128( aSign, aExp, zSig0, zSig1 );
5763
5764 }
5765
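/*----------------------------------------------------------------------------
| Editor's note (explanatory addition): unlike the float32 and float64
| conversions above, this widening never rounds. floatx80 carries 63
| fraction bits after the explicit integer bit and a 15-bit exponent;
| float128 provides 112 fraction bits with the same exponent range, so
| every finite floatx80 value is exactly representable and the function
| packs directly without a roundAndPack step.
*----------------------------------------------------------------------------*/
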
5766 /*----------------------------------------------------------------------------
5767 | Rounds the extended double-precision floating-point value `a'
5768 | to the precision provided by floatx80_rounding_precision and returns the
5769 | result as an extended double-precision floating-point value.
5770 | The operation is performed according to the IEC/IEEE Standard for Binary
5771 | Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/
5773
5774 floatx80 floatx80_round(floatx80 a, float_status *status)
5775 {
5776 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5777 extractFloatx80Sign(a),
5778 extractFloatx80Exp(a),
5779 extractFloatx80Frac(a), 0, status);
5780 }
5781
5782 /*----------------------------------------------------------------------------
5783 | Rounds the extended double-precision floating-point value `a' to an integer,
5784 | and returns the result as an extended double-precision floating-point
5785 | value. The operation is performed according to the IEC/IEEE Standard for
5786 | Binary Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
5788
5789 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5790 {
5791 bool aSign;
5792 int32_t aExp;
5793 uint64_t lastBitMask, roundBitsMask;
5794 floatx80 z;
5795
5796 if (floatx80_invalid_encoding(a)) {
5797 float_raise(float_flag_invalid, status);
5798 return floatx80_default_nan(status);
5799 }
5800 aExp = extractFloatx80Exp( a );
5801 if ( 0x403E <= aExp ) {
5802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5803 return propagateFloatx80NaN(a, a, status);
5804 }
5805 return a;
5806 }
5807 if ( aExp < 0x3FFF ) {
5808 if ( ( aExp == 0 )
5809 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5810 return a;
5811 }
5812 float_raise(float_flag_inexact, status);
5813 aSign = extractFloatx80Sign( a );
5814 switch (status->float_rounding_mode) {
5815 case float_round_nearest_even:
5816 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5817 ) {
5818 return
5819 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5820 }
5821 break;
5822 case float_round_ties_away:
5823 if (aExp == 0x3FFE) {
5824 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5825 }
5826 break;
5827 case float_round_down:
5828 return
5829 aSign ?
5830 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5831 : packFloatx80( 0, 0, 0 );
5832 case float_round_up:
5833 return
5834 aSign ? packFloatx80( 1, 0, 0 )
5835 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5836
5837 case float_round_to_zero:
5838 break;
5839 default:
5840 g_assert_not_reached();
5841 }
5842 return packFloatx80( aSign, 0, 0 );
5843 }
5844 lastBitMask = 1;
5845 lastBitMask <<= 0x403E - aExp;
5846 roundBitsMask = lastBitMask - 1;
5847 z = a;
5848 switch (status->float_rounding_mode) {
5849 case float_round_nearest_even:
5850 z.low += lastBitMask>>1;
5851 if ((z.low & roundBitsMask) == 0) {
5852 z.low &= ~lastBitMask;
5853 }
5854 break;
5855 case float_round_ties_away:
5856 z.low += lastBitMask >> 1;
5857 break;
5858 case float_round_to_zero:
5859 break;
5860 case float_round_up:
5861 if (!extractFloatx80Sign(z)) {
5862 z.low += roundBitsMask;
5863 }
5864 break;
5865 case float_round_down:
5866 if (extractFloatx80Sign(z)) {
5867 z.low += roundBitsMask;
5868 }
5869 break;
5870 default:
5871         g_assert_not_reached();
5872 }
5873 z.low &= ~ roundBitsMask;
5874 if ( z.low == 0 ) {
5875 ++z.high;
5876 z.low = UINT64_C(0x8000000000000000);
5877 }
5878 if (z.low != a.low) {
5879 float_raise(float_flag_inexact, status);
5880 }
5881 return z;
5882
5883 }
5884
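/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file.
| The helper name is hypothetical; only make_floatx80 and the function
| above are assumed.
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_round_to_int(void)
{
    float_status st = { 0 };  /* nearest-even */
    floatx80 x = make_floatx80(0x4000, 0xA000000000000000ULL);  /* 2.5 */
    floatx80 z = floatx80_round_to_int(x, &st);
    /* z is 2.0 (exponent 0x4000, significand 0x8000000000000000): the tie
     * goes to the even neighbour, and inexact is raised because the result
     * differs from the input. */
    (void)z;
}
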
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of adding the absolute values of the extended double-
5887 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5888 | negated before being returned. `zSign' is ignored if the result is a NaN.
5889 | The addition is performed according to the IEC/IEEE Standard for Binary
5890 | Floating-Point Arithmetic.
5891 *----------------------------------------------------------------------------*/
5892
5893 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5894 float_status *status)
5895 {
5896 int32_t aExp, bExp, zExp;
5897 uint64_t aSig, bSig, zSig0, zSig1;
5898 int32_t expDiff;
5899
5900 aSig = extractFloatx80Frac( a );
5901 aExp = extractFloatx80Exp( a );
5902 bSig = extractFloatx80Frac( b );
5903 bExp = extractFloatx80Exp( b );
5904 expDiff = aExp - bExp;
5905 if ( 0 < expDiff ) {
5906 if ( aExp == 0x7FFF ) {
5907 if ((uint64_t)(aSig << 1)) {
5908 return propagateFloatx80NaN(a, b, status);
5909 }
5910 return a;
5911 }
5912 if ( bExp == 0 ) --expDiff;
5913 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5914 zExp = aExp;
5915 }
5916 else if ( expDiff < 0 ) {
5917 if ( bExp == 0x7FFF ) {
5918 if ((uint64_t)(bSig << 1)) {
5919 return propagateFloatx80NaN(a, b, status);
5920 }
5921 return packFloatx80(zSign,
5922 floatx80_infinity_high,
5923 floatx80_infinity_low);
5924 }
5925 if ( aExp == 0 ) ++expDiff;
5926 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5927 zExp = bExp;
5928 }
5929 else {
5930 if ( aExp == 0x7FFF ) {
5931 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5932 return propagateFloatx80NaN(a, b, status);
5933 }
5934 return a;
5935 }
5936 zSig1 = 0;
5937 zSig0 = aSig + bSig;
5938 if ( aExp == 0 ) {
5939 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5940 /* At least one of the values is a pseudo-denormal,
5941 * and there is a carry out of the result. */
5942 zExp = 1;
5943 goto shiftRight1;
5944 }
5945 if (zSig0 == 0) {
5946 return packFloatx80(zSign, 0, 0);
5947 }
5948 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5949 goto roundAndPack;
5950 }
5951 zExp = aExp;
5952 goto shiftRight1;
5953 }
5954 zSig0 = aSig + bSig;
5955 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5956 shiftRight1:
5957 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5958 zSig0 |= UINT64_C(0x8000000000000000);
5959 ++zExp;
5960 roundAndPack:
5961 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5962 zSign, zExp, zSig0, zSig1, status);
5963 }
5964
5965 /*----------------------------------------------------------------------------
5966 | Returns the result of subtracting the absolute values of the extended
5967 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5968 | difference is negated before being returned. `zSign' is ignored if the
5969 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5970 | Standard for Binary Floating-Point Arithmetic.
5971 *----------------------------------------------------------------------------*/
5972
5973 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5974 float_status *status)
5975 {
5976 int32_t aExp, bExp, zExp;
5977 uint64_t aSig, bSig, zSig0, zSig1;
5978 int32_t expDiff;
5979
5980 aSig = extractFloatx80Frac( a );
5981 aExp = extractFloatx80Exp( a );
5982 bSig = extractFloatx80Frac( b );
5983 bExp = extractFloatx80Exp( b );
5984 expDiff = aExp - bExp;
5985 if ( 0 < expDiff ) goto aExpBigger;
5986 if ( expDiff < 0 ) goto bExpBigger;
5987 if ( aExp == 0x7FFF ) {
5988 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5989 return propagateFloatx80NaN(a, b, status);
5990 }
5991 float_raise(float_flag_invalid, status);
5992 return floatx80_default_nan(status);
5993 }
5994 if ( aExp == 0 ) {
5995 aExp = 1;
5996 bExp = 1;
5997 }
5998 zSig1 = 0;
5999 if ( bSig < aSig ) goto aBigger;
6000 if ( aSig < bSig ) goto bBigger;
6001 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6002 bExpBigger:
6003 if ( bExp == 0x7FFF ) {
6004 if ((uint64_t)(bSig << 1)) {
6005 return propagateFloatx80NaN(a, b, status);
6006 }
6007 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6008 floatx80_infinity_low);
6009 }
6010 if ( aExp == 0 ) ++expDiff;
6011 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6012 bBigger:
6013 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6014 zExp = bExp;
6015 zSign ^= 1;
6016 goto normalizeRoundAndPack;
6017 aExpBigger:
6018 if ( aExp == 0x7FFF ) {
6019 if ((uint64_t)(aSig << 1)) {
6020 return propagateFloatx80NaN(a, b, status);
6021 }
6022 return a;
6023 }
6024 if ( bExp == 0 ) --expDiff;
6025 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6026 aBigger:
6027 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6028 zExp = aExp;
6029 normalizeRoundAndPack:
6030 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6031 zSign, zExp, zSig0, zSig1, status);
6032 }
6033
6034 /*----------------------------------------------------------------------------
6035 | Returns the result of adding the extended double-precision floating-point
6036 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6037 | Standard for Binary Floating-Point Arithmetic.
6038 *----------------------------------------------------------------------------*/
6039
6040 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6041 {
6042 bool aSign, bSign;
6043
6044 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6045 float_raise(float_flag_invalid, status);
6046 return floatx80_default_nan(status);
6047 }
6048 aSign = extractFloatx80Sign( a );
6049 bSign = extractFloatx80Sign( b );
6050 if ( aSign == bSign ) {
6051 return addFloatx80Sigs(a, b, aSign, status);
6052 }
6053 else {
6054 return subFloatx80Sigs(a, b, aSign, status);
6055 }
6056
6057 }
6058
6059 /*----------------------------------------------------------------------------
6060 | Returns the result of subtracting the extended double-precision floating-
6061 | point values `a' and `b'. The operation is performed according to the
6062 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6064
6065 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6066 {
6067 bool aSign, bSign;
6068
6069 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6070 float_raise(float_flag_invalid, status);
6071 return floatx80_default_nan(status);
6072 }
6073 aSign = extractFloatx80Sign( a );
6074 bSign = extractFloatx80Sign( b );
6075 if ( aSign == bSign ) {
6076 return subFloatx80Sigs(a, b, aSign, status);
6077 }
6078 else {
6079 return addFloatx80Sigs(a, b, aSign, status);
6080 }
6081
6082 }
6083
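/*----------------------------------------------------------------------------
| Editor's note (explanatory addition): floatx80_add and floatx80_sub both
| reduce to the two magnitude helpers above. a + b adds magnitudes when the
| operand signs agree and subtracts them when they differ; a - b is the
| mirror image. One visible consequence of subFloatx80Sigs: an exact
| cancellation such as 1.0 + (-1.0) yields +0 in every rounding mode except
| round-down, where it yields -0.
*----------------------------------------------------------------------------*/
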
6084 /*----------------------------------------------------------------------------
6085 | Returns the result of multiplying the extended double-precision floating-
6086 | point values `a' and `b'. The operation is performed according to the
6087 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6088 *----------------------------------------------------------------------------*/
6089
6090 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6091 {
6092 bool aSign, bSign, zSign;
6093 int32_t aExp, bExp, zExp;
6094 uint64_t aSig, bSig, zSig0, zSig1;
6095
6096 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6097 float_raise(float_flag_invalid, status);
6098 return floatx80_default_nan(status);
6099 }
6100 aSig = extractFloatx80Frac( a );
6101 aExp = extractFloatx80Exp( a );
6102 aSign = extractFloatx80Sign( a );
6103 bSig = extractFloatx80Frac( b );
6104 bExp = extractFloatx80Exp( b );
6105 bSign = extractFloatx80Sign( b );
6106 zSign = aSign ^ bSign;
6107 if ( aExp == 0x7FFF ) {
6108 if ( (uint64_t) ( aSig<<1 )
6109 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6110 return propagateFloatx80NaN(a, b, status);
6111 }
6112 if ( ( bExp | bSig ) == 0 ) goto invalid;
6113 return packFloatx80(zSign, floatx80_infinity_high,
6114 floatx80_infinity_low);
6115 }
6116 if ( bExp == 0x7FFF ) {
6117 if ((uint64_t)(bSig << 1)) {
6118 return propagateFloatx80NaN(a, b, status);
6119 }
6120 if ( ( aExp | aSig ) == 0 ) {
6121 invalid:
6122 float_raise(float_flag_invalid, status);
6123 return floatx80_default_nan(status);
6124 }
6125 return packFloatx80(zSign, floatx80_infinity_high,
6126 floatx80_infinity_low);
6127 }
6128 if ( aExp == 0 ) {
6129 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6130 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6131 }
6132 if ( bExp == 0 ) {
6133 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6134 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6135 }
6136 zExp = aExp + bExp - 0x3FFE;
6137 mul64To128( aSig, bSig, &zSig0, &zSig1 );
6138 if ( 0 < (int64_t) zSig0 ) {
6139 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6140 --zExp;
6141 }
6142 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6143 zSign, zExp, zSig0, zSig1, status);
6144 }
6145
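/*----------------------------------------------------------------------------
| Editor's note (explanatory addition): the exponent arithmetic above works
| as follows. Each stored exponent e represents a scale of 2**(e - 0x3FFF),
| and the 128-bit product of two significands in [2**63, 2**64) lands in
| [2**126, 2**128), i.e. represents a value in [1, 4). zExp = aExp + bExp
| - 0x3FFE removes the doubled bias and adds one for the product's extra
| integer bit; the 0 < (int64_t) zSig0 test detects products in [1, 2),
| which need a single left shift and a matching exponent decrement.
*----------------------------------------------------------------------------*/
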
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of dividing the extended double-precision floating-point
6148 | value `a' by the corresponding value `b'. The operation is performed
6149 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6150 *----------------------------------------------------------------------------*/
6151
6152 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6153 {
6154 bool aSign, bSign, zSign;
6155 int32_t aExp, bExp, zExp;
6156 uint64_t aSig, bSig, zSig0, zSig1;
6157 uint64_t rem0, rem1, rem2, term0, term1, term2;
6158
6159 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6160 float_raise(float_flag_invalid, status);
6161 return floatx80_default_nan(status);
6162 }
6163 aSig = extractFloatx80Frac( a );
6164 aExp = extractFloatx80Exp( a );
6165 aSign = extractFloatx80Sign( a );
6166 bSig = extractFloatx80Frac( b );
6167 bExp = extractFloatx80Exp( b );
6168 bSign = extractFloatx80Sign( b );
6169 zSign = aSign ^ bSign;
6170 if ( aExp == 0x7FFF ) {
6171 if ((uint64_t)(aSig << 1)) {
6172 return propagateFloatx80NaN(a, b, status);
6173 }
6174 if ( bExp == 0x7FFF ) {
6175 if ((uint64_t)(bSig << 1)) {
6176 return propagateFloatx80NaN(a, b, status);
6177 }
6178 goto invalid;
6179 }
6180 return packFloatx80(zSign, floatx80_infinity_high,
6181 floatx80_infinity_low);
6182 }
6183 if ( bExp == 0x7FFF ) {
6184 if ((uint64_t)(bSig << 1)) {
6185 return propagateFloatx80NaN(a, b, status);
6186 }
6187 return packFloatx80( zSign, 0, 0 );
6188 }
6189 if ( bExp == 0 ) {
6190 if ( bSig == 0 ) {
6191 if ( ( aExp | aSig ) == 0 ) {
6192 invalid:
6193 float_raise(float_flag_invalid, status);
6194 return floatx80_default_nan(status);
6195 }
6196 float_raise(float_flag_divbyzero, status);
6197 return packFloatx80(zSign, floatx80_infinity_high,
6198 floatx80_infinity_low);
6199 }
6200 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6201 }
6202 if ( aExp == 0 ) {
6203 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6204 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6205 }
6206 zExp = aExp - bExp + 0x3FFE;
6207 rem1 = 0;
6208 if ( bSig <= aSig ) {
6209 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6210 ++zExp;
6211 }
6212 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6213 mul64To128( bSig, zSig0, &term0, &term1 );
6214 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6215 while ( (int64_t) rem0 < 0 ) {
6216 --zSig0;
6217 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6218 }
6219 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6220 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6221 mul64To128( bSig, zSig1, &term1, &term2 );
6222 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6223 while ( (int64_t) rem1 < 0 ) {
6224 --zSig1;
6225 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6226 }
6227 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6228 }
6229 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6230 zSign, zExp, zSig0, zSig1, status);
6231 }
6232
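/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file.
| Division by zero is only invalid for 0/0; any other finite dividend
| raises divbyzero and returns a correctly signed infinity.
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_div_by_zero(void)
{
    float_status st = { 0 };
    floatx80 one   = make_floatx80(0x3FFF, 0x8000000000000000ULL);  /*  1.0 */
    floatx80 mzero = make_floatx80(0x8000, 0);                      /* -0.0 */
    floatx80 z = floatx80_div(one, mzero, &st);
    /* z is -inf and float_flag_divbyzero is set; 0/0 would instead raise
     * invalid and return the default NaN. */
    (void)z;
}
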
6233 /*----------------------------------------------------------------------------
6234 | Returns the remainder of the extended double-precision floating-point value
6235 | `a' with respect to the corresponding value `b'. The operation is performed
6236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic
6237 | if 'mod' is false; if 'mod' is true, the remainder is computed with the
6238 | quotient truncated toward zero instead. '*quotient' is set to the low
6239 | 64 bits of the absolute value of the integer quotient.
6240 *----------------------------------------------------------------------------*/
6241
6242 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6243 float_status *status)
6244 {
6245 bool aSign, zSign;
6246 int32_t aExp, bExp, expDiff, aExpOrig;
6247 uint64_t aSig0, aSig1, bSig;
6248 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6249
6250 *quotient = 0;
6251 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6252 float_raise(float_flag_invalid, status);
6253 return floatx80_default_nan(status);
6254 }
6255 aSig0 = extractFloatx80Frac( a );
6256 aExpOrig = aExp = extractFloatx80Exp( a );
6257 aSign = extractFloatx80Sign( a );
6258 bSig = extractFloatx80Frac( b );
6259 bExp = extractFloatx80Exp( b );
6260 if ( aExp == 0x7FFF ) {
6261 if ( (uint64_t) ( aSig0<<1 )
6262 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6263 return propagateFloatx80NaN(a, b, status);
6264 }
6265 goto invalid;
6266 }
6267 if ( bExp == 0x7FFF ) {
6268 if ((uint64_t)(bSig << 1)) {
6269 return propagateFloatx80NaN(a, b, status);
6270 }
6271 if (aExp == 0 && aSig0 >> 63) {
6272 /*
6273 * Pseudo-denormal argument must be returned in normalized
6274 * form.
6275 */
6276 return packFloatx80(aSign, 1, aSig0);
6277 }
6278 return a;
6279 }
6280 if ( bExp == 0 ) {
6281 if ( bSig == 0 ) {
6282 invalid:
6283 float_raise(float_flag_invalid, status);
6284 return floatx80_default_nan(status);
6285 }
6286 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6287 }
6288 if ( aExp == 0 ) {
6289 if ( aSig0 == 0 ) return a;
6290 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6291 }
6292 zSign = aSign;
6293 expDiff = aExp - bExp;
6294 aSig1 = 0;
6295 if ( expDiff < 0 ) {
6296 if ( mod || expDiff < -1 ) {
6297 if (aExp == 1 && aExpOrig == 0) {
6298 /*
6299 * Pseudo-denormal argument must be returned in
6300 * normalized form.
6301 */
6302 return packFloatx80(aSign, aExp, aSig0);
6303 }
6304 return a;
6305 }
6306 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6307 expDiff = 0;
6308 }
6309 *quotient = q = ( bSig <= aSig0 );
6310 if ( q ) aSig0 -= bSig;
6311 expDiff -= 64;
6312 while ( 0 < expDiff ) {
6313 q = estimateDiv128To64( aSig0, aSig1, bSig );
6314 q = ( 2 < q ) ? q - 2 : 0;
6315 mul64To128( bSig, q, &term0, &term1 );
6316 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6317 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6318 expDiff -= 62;
6319 *quotient <<= 62;
6320 *quotient += q;
6321 }
6322 expDiff += 64;
6323 if ( 0 < expDiff ) {
6324 q = estimateDiv128To64( aSig0, aSig1, bSig );
6325 q = ( 2 < q ) ? q - 2 : 0;
6326 q >>= 64 - expDiff;
6327 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6328 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6329 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6330 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6331 ++q;
6332 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6333 }
6334 if (expDiff < 64) {
6335 *quotient <<= expDiff;
6336 } else {
6337 *quotient = 0;
6338 }
6339 *quotient += q;
6340 }
6341 else {
6342 term1 = 0;
6343 term0 = bSig;
6344 }
6345 if (!mod) {
6346 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6347 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6348 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6349 && ( q & 1 ) )
6350 ) {
6351 aSig0 = alternateASig0;
6352 aSig1 = alternateASig1;
6353 zSign = ! zSign;
6354 ++*quotient;
6355 }
6356 }
6357 return
6358 normalizeRoundAndPackFloatx80(
6359 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6360
6361 }
6362
6363 /*----------------------------------------------------------------------------
6364 | Returns the remainder of the extended double-precision floating-point value
6365 | `a' with respect to the corresponding value `b'. The operation is performed
6366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6367 *----------------------------------------------------------------------------*/
6368
6369 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6370 {
6371 uint64_t quotient;
6372 return floatx80_modrem(a, b, false, &quotient, status);
6373 }
6374
6375 /*----------------------------------------------------------------------------
6376 | Returns the remainder of the extended double-precision floating-point value
6377 | `a' with respect to the corresponding value `b', with the quotient truncated
6378 | toward zero.
6379 *----------------------------------------------------------------------------*/
6380
6381 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6382 {
6383 uint64_t quotient;
6384 return floatx80_modrem(a, b, true, &quotient, status);
6385 }
6386
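/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file,
| contrasting the two wrappers. floatx80_rem rounds the quotient to
| nearest, floatx80_mod truncates it toward zero (fmod semantics).
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_rem_vs_mod(void)
{
    float_status st = { 0 };
    floatx80 five  = make_floatx80(0x4001, 0xA000000000000000ULL);  /* 5.0 */
    floatx80 three = make_floatx80(0x4000, 0xC000000000000000ULL);  /* 3.0 */
    floatx80 r = floatx80_rem(five, three, &st);  /* q = 2 -> r = -1.0 */
    floatx80 m = floatx80_mod(five, three, &st);  /* q = 1 -> m =  2.0 */
    (void)r;
    (void)m;
}
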
6387 /*----------------------------------------------------------------------------
6388 | Returns the square root of the extended double-precision floating-point
6389 | value `a'. The operation is performed according to the IEC/IEEE Standard
6390 | for Binary Floating-Point Arithmetic.
6391 *----------------------------------------------------------------------------*/
6392
6393 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6394 {
6395 bool aSign;
6396 int32_t aExp, zExp;
6397 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6398 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6399
6400 if (floatx80_invalid_encoding(a)) {
6401 float_raise(float_flag_invalid, status);
6402 return floatx80_default_nan(status);
6403 }
6404 aSig0 = extractFloatx80Frac( a );
6405 aExp = extractFloatx80Exp( a );
6406 aSign = extractFloatx80Sign( a );
6407 if ( aExp == 0x7FFF ) {
6408 if ((uint64_t)(aSig0 << 1)) {
6409 return propagateFloatx80NaN(a, a, status);
6410 }
6411 if ( ! aSign ) return a;
6412 goto invalid;
6413 }
6414 if ( aSign ) {
6415 if ( ( aExp | aSig0 ) == 0 ) return a;
6416 invalid:
6417 float_raise(float_flag_invalid, status);
6418 return floatx80_default_nan(status);
6419 }
6420 if ( aExp == 0 ) {
6421 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6422 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6423 }
6424 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6425 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6426 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6427 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6428 doubleZSig0 = zSig0<<1;
6429 mul64To128( zSig0, zSig0, &term0, &term1 );
6430 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6431 while ( (int64_t) rem0 < 0 ) {
6432 --zSig0;
6433 doubleZSig0 -= 2;
6434 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6435 }
6436 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6437 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6438 if ( zSig1 == 0 ) zSig1 = 1;
6439 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6440 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6441 mul64To128( zSig1, zSig1, &term2, &term3 );
6442 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6443 while ( (int64_t) rem1 < 0 ) {
6444 --zSig1;
6445 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6446 term3 |= 1;
6447 term2 |= doubleZSig0;
6448 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6449 }
6450 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6451 }
6452 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6453 zSig0 |= doubleZSig0;
6454 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6455 0, zExp, zSig0, zSig1, status);
6456 }
6457
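/*----------------------------------------------------------------------------
| Editor's note (explanatory addition): the special cases above implement
| the IEEE rules for square root: sqrt(+inf) returns +inf, sqrt(-0) returns
| -0 itself (the `( aExp | aSig0 ) == 0' early return), and the square root
| of any other negative value raises invalid and returns the default NaN.
*----------------------------------------------------------------------------*/
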
6458 /*----------------------------------------------------------------------------
6459 | Returns the result of converting the quadruple-precision floating-point
6460 | value `a' to the extended double-precision floating-point format. The
6461 | conversion is performed according to the IEC/IEEE Standard for Binary
6462 | Floating-Point Arithmetic.
6463 *----------------------------------------------------------------------------*/
6464
6465 floatx80 float128_to_floatx80(float128 a, float_status *status)
6466 {
6467 bool aSign;
6468 int32_t aExp;
6469 uint64_t aSig0, aSig1;
6470
6471 aSig1 = extractFloat128Frac1( a );
6472 aSig0 = extractFloat128Frac0( a );
6473 aExp = extractFloat128Exp( a );
6474 aSign = extractFloat128Sign( a );
6475 if ( aExp == 0x7FFF ) {
6476 if ( aSig0 | aSig1 ) {
6477 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6478 status);
6479 return floatx80_silence_nan(res, status);
6480 }
6481 return packFloatx80(aSign, floatx80_infinity_high,
6482 floatx80_infinity_low);
6483 }
6484 if ( aExp == 0 ) {
6485 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6486 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6487 }
6488 else {
6489 aSig0 |= UINT64_C(0x0001000000000000);
6490 }
6491 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6492 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6493
6494 }
6495
6496 /*----------------------------------------------------------------------------
6497 | Returns the remainder of the quadruple-precision floating-point value `a'
6498 | with respect to the corresponding value `b'. The operation is performed
6499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6501
6502 float128 float128_rem(float128 a, float128 b, float_status *status)
6503 {
6504 bool aSign, zSign;
6505 int32_t aExp, bExp, expDiff;
6506 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6507 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6508 int64_t sigMean0;
6509
6510 aSig1 = extractFloat128Frac1( a );
6511 aSig0 = extractFloat128Frac0( a );
6512 aExp = extractFloat128Exp( a );
6513 aSign = extractFloat128Sign( a );
6514 bSig1 = extractFloat128Frac1( b );
6515 bSig0 = extractFloat128Frac0( b );
6516 bExp = extractFloat128Exp( b );
6517 if ( aExp == 0x7FFF ) {
6518 if ( ( aSig0 | aSig1 )
6519 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6520 return propagateFloat128NaN(a, b, status);
6521 }
6522 goto invalid;
6523 }
6524 if ( bExp == 0x7FFF ) {
6525 if (bSig0 | bSig1) {
6526 return propagateFloat128NaN(a, b, status);
6527 }
6528 return a;
6529 }
6530 if ( bExp == 0 ) {
6531 if ( ( bSig0 | bSig1 ) == 0 ) {
6532 invalid:
6533 float_raise(float_flag_invalid, status);
6534 return float128_default_nan(status);
6535 }
6536 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6537 }
6538 if ( aExp == 0 ) {
6539 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6540 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6541 }
6542 expDiff = aExp - bExp;
6543 if ( expDiff < -1 ) return a;
6544 shortShift128Left(
6545 aSig0 | UINT64_C(0x0001000000000000),
6546 aSig1,
6547 15 - ( expDiff < 0 ),
6548 &aSig0,
6549 &aSig1
6550 );
6551 shortShift128Left(
6552 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6553 q = le128( bSig0, bSig1, aSig0, aSig1 );
6554 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6555 expDiff -= 64;
6556 while ( 0 < expDiff ) {
6557 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6558 q = ( 4 < q ) ? q - 4 : 0;
6559 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6560 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6561 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6562 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6563 expDiff -= 61;
6564 }
6565 if ( -64 < expDiff ) {
6566 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6567 q = ( 4 < q ) ? q - 4 : 0;
6568 q >>= - expDiff;
6569 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6570 expDiff += 52;
6571 if ( expDiff < 0 ) {
6572 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6573 }
6574 else {
6575 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6576 }
6577 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6578 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6579 }
6580 else {
6581 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6582 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6583 }
6584 do {
6585 alternateASig0 = aSig0;
6586 alternateASig1 = aSig1;
6587 ++q;
6588 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6589 } while ( 0 <= (int64_t) aSig0 );
6590 add128(
6591 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6592 if ( ( sigMean0 < 0 )
6593 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6594 aSig0 = alternateASig0;
6595 aSig1 = alternateASig1;
6596 }
6597 zSign = ( (int64_t) aSig0 < 0 );
6598 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6599 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6600 status);
6601 }
6602
6603 /*----------------------------------------------------------------------------
6604 | Returns the square root of the quadruple-precision floating-point value `a'.
6605 | The operation is performed according to the IEC/IEEE Standard for Binary
6606 | Floating-Point Arithmetic.
6607 *----------------------------------------------------------------------------*/
6608
6609 float128 float128_sqrt(float128 a, float_status *status)
6610 {
6611 bool aSign;
6612 int32_t aExp, zExp;
6613 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6614 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6615
6616 aSig1 = extractFloat128Frac1( a );
6617 aSig0 = extractFloat128Frac0( a );
6618 aExp = extractFloat128Exp( a );
6619 aSign = extractFloat128Sign( a );
6620 if ( aExp == 0x7FFF ) {
6621 if (aSig0 | aSig1) {
6622 return propagateFloat128NaN(a, a, status);
6623 }
6624 if ( ! aSign ) return a;
6625 goto invalid;
6626 }
6627 if ( aSign ) {
6628 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6629 invalid:
6630 float_raise(float_flag_invalid, status);
6631 return float128_default_nan(status);
6632 }
6633 if ( aExp == 0 ) {
6634 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6635 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6636 }
6637 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6638 aSig0 |= UINT64_C(0x0001000000000000);
6639 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6640 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6641 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6642 doubleZSig0 = zSig0<<1;
6643 mul64To128( zSig0, zSig0, &term0, &term1 );
6644 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6645 while ( (int64_t) rem0 < 0 ) {
6646 --zSig0;
6647 doubleZSig0 -= 2;
6648 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6649 }
6650 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6651 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6652 if ( zSig1 == 0 ) zSig1 = 1;
6653 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6654 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6655 mul64To128( zSig1, zSig1, &term2, &term3 );
6656 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6657 while ( (int64_t) rem1 < 0 ) {
6658 --zSig1;
6659 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6660 term3 |= 1;
6661 term2 |= doubleZSig0;
6662 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6663 }
6664 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6665 }
6666 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6667 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6668
6669 }
6670
6671 static inline FloatRelation
6672 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
6673 float_status *status)
6674 {
6675 bool aSign, bSign;
6676
6677 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6678 float_raise(float_flag_invalid, status);
6679 return float_relation_unordered;
6680 }
6681 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6682 ( extractFloatx80Frac( a )<<1 ) ) ||
6683 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6684 ( extractFloatx80Frac( b )<<1 ) )) {
6685 if (!is_quiet ||
6686 floatx80_is_signaling_nan(a, status) ||
6687 floatx80_is_signaling_nan(b, status)) {
6688 float_raise(float_flag_invalid, status);
6689 }
6690 return float_relation_unordered;
6691 }
6692 aSign = extractFloatx80Sign( a );
6693 bSign = extractFloatx80Sign( b );
6694 if ( aSign != bSign ) {
6695
6696 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6697 ( ( a.low | b.low ) == 0 ) ) {
6698 /* zero case */
6699 return float_relation_equal;
6700 } else {
6701 return 1 - (2 * aSign);
6702 }
6703 } else {
6704 /* Normalize pseudo-denormals before comparison. */
6705 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
6706 ++a.high;
6707 }
6708 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
6709 ++b.high;
6710 }
6711 if (a.low == b.low && a.high == b.high) {
6712 return float_relation_equal;
6713 } else {
6714 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6715 }
6716 }
6717 }
6718
6719 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6720 {
6721 return floatx80_compare_internal(a, b, 0, status);
6722 }
6723
6724 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
6725 float_status *status)
6726 {
6727 return floatx80_compare_internal(a, b, 1, status);
6728 }
6729
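/*----------------------------------------------------------------------------
| Editor's addition: an illustrative sketch, not part of the original file.
| The two wrappers differ only in NaN signalling: the ordered variant
| raises invalid for any NaN operand, the quiet variant only for signalling
| NaNs.
*----------------------------------------------------------------------------*/

static void __attribute__((unused)) example_floatx80_compare(void)
{
    float_status st = { 0 };
    floatx80 one  = make_floatx80(0x3FFF, 0x8000000000000000ULL);  /* 1.0 */
    floatx80 qnan = floatx80_default_nan(&st);   /* a quiet NaN */
    FloatRelation r = floatx80_compare(one, qnan, &st);
    /* invalid is raised; r == float_relation_unordered */
    FloatRelation q = floatx80_compare_quiet(one, qnan, &st);
    /* no flag this time, but q == float_relation_unordered as well */
    (void)r;
    (void)q;
}
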
6730 static inline FloatRelation
6731 float128_compare_internal(float128 a, float128 b, bool is_quiet,
6732 float_status *status)
6733 {
6734 bool aSign, bSign;
6735
6736 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6737 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6738 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6739 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6740 if (!is_quiet ||
6741 float128_is_signaling_nan(a, status) ||
6742 float128_is_signaling_nan(b, status)) {
6743 float_raise(float_flag_invalid, status);
6744 }
6745 return float_relation_unordered;
6746 }
6747 aSign = extractFloat128Sign( a );
6748 bSign = extractFloat128Sign( b );
6749 if ( aSign != bSign ) {
6750 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6751 /* zero case */
6752 return float_relation_equal;
6753 } else {
6754 return 1 - (2 * aSign);
6755 }
6756 } else {
6757 if (a.low == b.low && a.high == b.high) {
6758 return float_relation_equal;
6759 } else {
6760 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6761 }
6762 }
6763 }
6764
6765 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
6766 {
6767 return float128_compare_internal(a, b, 0, status);
6768 }
6769
6770 FloatRelation float128_compare_quiet(float128 a, float128 b,
6771 float_status *status)
6772 {
6773 return float128_compare_internal(a, b, 1, status);
6774 }
6775
6776 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6777 {
6778 bool aSign;
6779 int32_t aExp;
6780 uint64_t aSig;
6781
6782 if (floatx80_invalid_encoding(a)) {
6783 float_raise(float_flag_invalid, status);
6784 return floatx80_default_nan(status);
6785 }
6786 aSig = extractFloatx80Frac( a );
6787 aExp = extractFloatx80Exp( a );
6788 aSign = extractFloatx80Sign( a );
6789
6790 if ( aExp == 0x7FFF ) {
6791 if ( aSig<<1 ) {
6792 return propagateFloatx80NaN(a, a, status);
6793 }
6794 return a;
6795 }
6796
6797 if (aExp == 0) {
6798 if (aSig == 0) {
6799 return a;
6800 }
6801 aExp++;
6802 }
6803
6804 if (n > 0x10000) {
6805 n = 0x10000;
6806 } else if (n < -0x10000) {
6807 n = -0x10000;
6808 }
6809
6810 aExp += n;
6811 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6812 aSign, aExp, aSig, 0, status);
6813 }
6814
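/*----------------------------------------------------------------------------
| Editor's note (explanatory addition): scalbn computes a * 2**n with a
| single final rounding. Clamping n to +/- 0x10000 is harmless because the
| floatx80 exponent only spans roughly +/- 16384: any |n| at the clamp
| already guarantees the same overflow or underflow outcome from
| normalizeRoundAndPackFloatx80. The aExp++ in the subnormal case adjusts
| for the exponent convention of denormals before renormalizing.
*----------------------------------------------------------------------------*/
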
6815 float128 float128_scalbn(float128 a, int n, float_status *status)
6816 {
6817 bool aSign;
6818 int32_t aExp;
6819 uint64_t aSig0, aSig1;
6820
6821 aSig1 = extractFloat128Frac1( a );
6822 aSig0 = extractFloat128Frac0( a );
6823 aExp = extractFloat128Exp( a );
6824 aSign = extractFloat128Sign( a );
6825 if ( aExp == 0x7FFF ) {
6826 if ( aSig0 | aSig1 ) {
6827 return propagateFloat128NaN(a, a, status);
6828 }
6829 return a;
6830 }
6831 if (aExp != 0) {
6832 aSig0 |= UINT64_C(0x0001000000000000);
6833 } else if (aSig0 == 0 && aSig1 == 0) {
6834 return a;
6835 } else {
6836 aExp++;
6837 }
6838
6839 if (n > 0x10000) {
6840 n = 0x10000;
6841 } else if (n < -0x10000) {
6842 n = -0x10000;
6843 }
6844
6845 aExp += n - 1;
6846 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6847 , status);
6848
6849 }
6850
6851 static void __attribute__((constructor)) softfloat_init(void)
6852 {
6853 union_float64 ua, ub, uc, ur;
6854
6855 if (QEMU_NO_HARDFLOAT) {
6856 return;
6857 }
6858 /*
6859 * Test that the host's FMA is not obviously broken. For example,
6860 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
6861 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
6862 */
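    /*
     * Editor's note on the probe values (explanatory addition):
     * ua = (1 + 2**-52) * 2**-1021, ub = 2**-53, uc = 2**-1021. The exact
     * result uc + ua * ub = 2**-1021 + 2**-1074 + 2**-1126 lies just over
     * half an ulp (2**-1073) above uc, so a correctly rounded fused
     * multiply-add must round up to 0x0020000000000001. An fma() that
     * rounds the product separately loses the 2**-1126 sticky term and
     * then rounds an exact tie to even, giving 0x0020000000000000.
     */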
6863 ua.s = 0x0020000000000001ULL;
6864 ub.s = 0x3ca0000000000000ULL;
6865 uc.s = 0x0020000000000000ULL;
6866 ur.h = fma(ua.h, ub.h, uc.h);
6867 if (ur.s != 0x0020000000000001ULL) {
6868 force_soft_fma = true;
6869 }
6870 }