]> git.proxmox.com Git - mirror_qemu.git/blob - fpu/softfloat.c
softfloat: Use mulu64 for mul64To128
[mirror_qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Emit a helper that flushes a denormal input to a correctly-signed zero
 * and raises float_flag_input_denormal.  No flush_inputs_to_zero check is
 * performed here (hence __NOCHECK); callers below wrap this with the check.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/*
 * One-operand flush: a no-op unless the status flag flush_inputs_to_zero
 * is set, in which case the denormal check/flush above is applied to *a.
 */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/* Two-operand variant of the above: flushes both *a and *b when enabled. */
#define GEN_INPUT_FLUSH2(name, soft_t)                             \
    static inline void name(soft_t *a, soft_t *b, float_status *s) \
    {                                                              \
        if (likely(!s->flush_inputs_to_zero)) {                    \
            return;                                                \
        }                                                          \
        soft_t ## _input_flush__nocheck(a, s);                     \
        soft_t ## _input_flush__nocheck(b, s);                     \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/* Three-operand variant (for fused multiply-add style callers). */
#define GEN_INPUT_FLUSH3(name, soft_t)                                        \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                         \
        if (likely(!s->flush_inputs_to_zero)) {                               \
            return;                                                           \
        }                                                                     \
        soft_t ## _input_flush__nocheck(a, s);                                \
        soft_t ## _input_flush__nocheck(b, s);                                \
        soft_t ## _input_flush__nocheck(c, s);                                \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 *
 * As configured, only the double-precision (F64) checks use fpclassify,
 * and only on x86_64; everywhere else the softfloat primitives are used.
 * (Presumably tuned by benchmarking — see the isinf note below.)
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
217
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/* noinline keeps the (cold) softfloat fallback out of the hardfloat path */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/*
 * Two views of the same bits: 's' is the softfloat bit-pattern type,
 * 'h' the corresponding host FP type used by the hardfloat paths.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicate over two operands (the pre/post checks passed to *_gen2). */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand operation, in softfloat and in host-FP form. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic two-operand float32 wrapper: compute with the host FPU when
 * safe, otherwise fall back to the softfloat implementation.
 *
 * @hard/@soft: host-FP and softfloat implementations of the operation.
 * @pre:  must hold on the (possibly flushed) inputs for the host result
 *        to be trusted.
 * @post: consulted when the host result magnitude is <= FLT_MIN; if it
 *        holds, the operation is redone in softfloat so that the exact
 *        flags (e.g. underflow) are computed.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Inputs are flushed first, so the soft path below sees them too. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Overflowed to infinity; inexact is already set (can_use_fpu). */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result at or below the smallest normal: recompute in softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/*
 * Generic two-operand float64 wrapper; double-precision twin of
 * float32_gen2 above.  See that function for the @pre/@post contract.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Inputs are flushed first, so the soft path below sees them too. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Overflowed to infinity; inexact is already set (can_use_fpu). */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result at or below the smallest normal: recompute in softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Single-bit mask for a FloatClass value, for set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Common combinations. */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* qnan and snan are the two largest FloatClass values. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation. In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

/* 128-bit fraction: two words, most significant first. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* 256-bit fraction: four words, most significant first. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
549
/* Structure holding all of the relevant parameters for a format.
 * exp_size: the size of the exponent field
 * exp_bias: the offset applied to the exponent field
 * exp_max: the maximum normalised exponent
 * frac_size: the size of the fraction field
 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 * frac_lsb: least significant bit of fraction
 * frac_lsbm1: the bit below the least significant bit (for rounding)
 * round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 * arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
/*
 * Note: (-F - 1) & 63 equals (63 - F) mod 64, i.e. the left shift that
 * aligns an F-bit fraction with DECOMPOSED_BINARY_POINT in the most
 * significant 64-bit word.
 */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size = E,                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                 \
    .exp_max = (1 << E) - 1,                         \
    .frac_size = F,                                  \
    .frac_shift = (-F - 1) & 63,                     \
    .frac_lsb = 1ull << ((-F - 1) & 63),             \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),           \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
587
/* IEEE half precision: 5-bit exponent, 10-bit fraction. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same layout, with ARM Alternative Half Precision handling enabled. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: 8-bit exponent, 7-bit fraction. */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

/* IEEE single precision: 8-bit exponent, 23-bit fraction. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* IEEE double precision: 11-bit exponent, 52-bit fraction. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* IEEE quad precision: 15-bit exponent, 112-bit fraction. */
static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
612
613 /* Unpack a float to parts, but do not canonicalize. */
614 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
615 {
616 const int f_size = fmt->frac_size;
617 const int e_size = fmt->exp_size;
618
619 *r = (FloatParts64) {
620 .cls = float_class_unclassified,
621 .sign = extract64(raw, f_size + e_size, 1),
622 .exp = extract64(raw, f_size, e_size),
623 .frac = extract64(raw, 0, f_size)
624 };
625 }
626
/* Per-format raw-unpack wrappers; all 64-bit-or-smaller formats share
 * unpack_raw64 with their respective parameter tables. */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
646
647 static void float128_unpack_raw(FloatParts128 *p, float128 f)
648 {
649 const int f_size = float128_params.frac_size - 64;
650 const int e_size = float128_params.exp_size;
651
652 *p = (FloatParts128) {
653 .cls = float_class_unclassified,
654 .sign = extract64(f.high, f_size + e_size, 1),
655 .exp = extract64(f.high, f_size, e_size),
656 .frac_hi = extract64(f.high, 0, f_size),
657 .frac_lo = f.low,
658 };
659 }
660
661 /* Pack a float from parts, but do not canonicalize. */
662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
663 {
664 const int f_size = fmt->frac_size;
665 const int e_size = fmt->exp_size;
666 uint64_t ret;
667
668 ret = (uint64_t)p->sign << (f_size + e_size);
669 ret = deposit64(ret, f_size, e_size, p->exp);
670 ret = deposit64(ret, 0, f_size, p->frac);
671 return ret;
672 }
673
/* Per-format raw-pack wrappers around pack_raw64. */
static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
693
694 static float128 float128_pack_raw(const FloatParts128 *p)
695 {
696 const int f_size = float128_params.frac_size - 64;
697 const int e_size = float128_params.exp_size;
698 uint64_t hi;
699
700 hi = (uint64_t)p->sign << (f_size + e_size);
701 hi = deposit64(hi, f_size, e_size, p->exp);
702 hi = deposit64(hi, 0, f_size, p->frac_hi);
703 return make_float128(hi, p->frac_lo);
704 }
705
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine: (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output. These details are target-
712 | specific.
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
715
/*
 * Size-generic dispatch: select parts64_/parts128_(/parts256_) NAME
 * based on the static type of P, via QEMU_GENERIC.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
805
/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* Dispatch to the frac64_/frac128_(/frac256_) variant by P's type. */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)
816
/* r = a + b; return the carry out of the most significant word. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    /* Add low to high, propagating the carry between words. */
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)
841
/* r = a + c for a 64-bit addend; return the carry out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* The low-word carry (0 or 1) is folded into the high-word add. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
854
/* Set every fraction bit (-1 assigned to uint64_t is all ones). */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
866
867 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
868 {
869 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
870 }
871
/* Three-way compare: high words first, then low words on a tie. */
static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
885
/* Zero the entire fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
897
/* Return true iff the entire fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    /* OR both words so a single compare covers 128 bits. */
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
909
/* Widening multiply: r (double-width result) = a * b. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    /* 64x64 -> 128 via the host-utils primitive. */
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    /* 128x128 -> 256; result words are most-significant first. */
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)
922
/* Two's-complement negation of the full fraction, in place. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    /* Compute 0 - a, with the borrow rippling from low to high word. */
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)
945
/*
 * Shift the fraction of A left until its most significant bit is set,
 * returning the number of bit positions shifted.  A zero fraction is
 * left unchanged and returns the full width (64/128/256), which callers
 * use to detect a zero significand.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        if (shl) {
            int shr = 64 - shl;
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        /* High word empty: the low word supplies the entire result. */
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl, shr;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        /* Drop empty leading words, counting 64 bits for each one. */
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    /* Funnel-shift the four words left by the remaining 1..63 bits. */
    shr = -shl & 63;
    a0 = (a0 << shl) | (a1 >> shr);
    a1 = (a1 << shl) | (a2 >> shr);
    a2 = (a2 << shl) | (a3 >> shr);
    a3 = (a3 << shl);

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)
1024
/*
 * Shift the fraction of A left by C bits.  For the 64-bit variant,
 * C must be in [0, 63] — larger shifts are undefined behavior in C.
 */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

/* 128-bit variant defers to the softfloat-macros helper. */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
1036
/*
 * Shift the fraction of A right by C bits, discarding shifted-out bits.
 * For the 64-bit variant, C must be in [0, 63] — larger shifts are UB.
 */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
1048
/*
 * Shift the fraction of A right by C bits, "jamming": any nonzero bits
 * shifted out are OR-ed back into the least significant bit of the
 * result, so that later rounding can still see the value was inexact.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;
    int invc;

    if (unlikely(c == 0)) {
        /* No shift requested: leave A untouched. */
        return;
    } else if (likely(c < 64)) {
        /* Sub-word shift: fall through to the funnel shift below. */
    } else if (likely(c < 256)) {
        /* Move whole 64-bit words first, collecting lost bits in sticky. */
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        /* Shift of 256 or more: everything is shifted out. */
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    /* Funnel-shift the four words right by the remaining 1..63 bits. */
    invc = -c & 63;
    sticky |= a3 << invc;
    a3 = (a3 >> c) | (a2 << invc);
    a2 = (a2 >> c) | (a1 << invc);
    a1 = (a1 >> c) | (a0 << invc);
    a0 = (a0 >> c);

 done:
    /* Jam the sticky indication into the low bit. */
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
1104
1105 static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
1106 {
1107 return usub64_overflow(a->frac, b->frac, &r->frac);
1108 }
1109
1110 static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
1111 {
1112 bool c = 0;
1113 r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
1114 r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
1115 return c;
1116 }
1117
1118 static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
1119 {
1120 bool c = 0;
1121 r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
1122 r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
1123 r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
1124 r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
1125 return c;
1126 }
1127
1128 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)
1129
/*
 * Narrow the double-width fraction of A into R, jamming: any nonzero
 * discarded low bits are OR-ed into the least significant kept bit.
 */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)
1142
/*
 * Widen the fraction of A into the double-width R, placing the value
 * in the high part and zeroing the new low bits.
 */
static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
1158
/*
 * Instantiate the size-generic routines from softfloat-parts.c.inc and
 * softfloat-parts-addsub.c.inc for each fraction width.  N is the
 * working width and W the double-wide width; partsN(foo) expands to
 * e.g. parts64_foo for N == 64.
 */
#define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN glue(FloatParts,N)
#define FloatPartsW glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 256

/* At 256 bits only the add/sub helpers are instantiated. */
#include "softfloat-parts-addsub.c.inc"

#undef N
#undef W
#undef partsN
#undef FloatPartsN
#undef FloatPartsW
1188
1189 /*
1190 * Pack/unpack routines with a specific FloatFmt.
1191 */
1192
/*
 * Unpack the raw float16 F into canonical parts *P, using PARAMS to
 * describe the layout (callers pass either the IEEE float16 parameters
 * or the Arm alternative-half-precision variant).
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

/* IEEE float16 unpack: the common case of the above. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

/* Unpack a raw bfloat16 into canonical parts *P. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}
1212
/*
 * Round the canonical parts *P according to PARAMS and repack them
 * into a raw float16, raising flags in S as needed.
 */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

/* IEEE float16 round-and-pack: the common case of the above. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Round canonical parts *P and repack as a raw bfloat16. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}
1233
/* Unpack a raw float32 into canonical parts *P. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

/* Round canonical parts *P and repack as a raw float32. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}
1247
/* Unpack a raw float64 into canonical parts *P. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

/* Round canonical parts *P and repack as a raw float64. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}
1261
/* Unpack a raw float128 into canonical parts *P. */
static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

/* Round canonical parts *P and repack as a raw float128. */
static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}
1275
1276 /*
1277 * Addition and subtraction
1278 */
1279
1280 static float16 QEMU_FLATTEN
1281 float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
1282 {
1283 FloatParts64 pa, pb, *pr;
1284
1285 float16_unpack_canonical(&pa, a, status);
1286 float16_unpack_canonical(&pb, b, status);
1287 pr = parts_addsub(&pa, &pb, status, subtract);
1288
1289 return float16_round_pack_canonical(pr, status);
1290 }
1291
1292 float16 float16_add(float16 a, float16 b, float_status *status)
1293 {
1294 return float16_addsub(a, b, status, false);
1295 }
1296
1297 float16 float16_sub(float16 a, float16 b, float_status *status)
1298 {
1299 return float16_addsub(a, b, status, true);
1300 }
1301
1302 static float32 QEMU_SOFTFLOAT_ATTR
1303 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
1304 {
1305 FloatParts64 pa, pb, *pr;
1306
1307 float32_unpack_canonical(&pa, a, status);
1308 float32_unpack_canonical(&pb, b, status);
1309 pr = parts_addsub(&pa, &pb, status, subtract);
1310
1311 return float32_round_pack_canonical(pr, status);
1312 }
1313
1314 static float32 soft_f32_add(float32 a, float32 b, float_status *status)
1315 {
1316 return soft_f32_addsub(a, b, status, false);
1317 }
1318
1319 static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1320 {
1321 return soft_f32_addsub(a, b, status, true);
1322 }
1323
1324 static float64 QEMU_SOFTFLOAT_ATTR
1325 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
1326 {
1327 FloatParts64 pa, pb, *pr;
1328
1329 float64_unpack_canonical(&pa, a, status);
1330 float64_unpack_canonical(&pb, b, status);
1331 pr = parts_addsub(&pa, &pb, status, subtract);
1332
1333 return float64_round_pack_canonical(pr, status);
1334 }
1335
1336 static float64 soft_f64_add(float64 a, float64 b, float_status *status)
1337 {
1338 return soft_f64_addsub(a, b, status, false);
1339 }
1340
1341 static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1342 {
1343 return soft_f64_addsub(a, b, status, true);
1344 }
1345
/* Hardfloat kernels: perform the operation on the host FPU. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1365
1366 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1367 {
1368 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1369 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1370 }
1371 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1372 }
1373
1374 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1375 {
1376 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1377 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1378 } else {
1379 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1380 }
1381 }
1382
/*
 * Hardfloat dispatch for add/sub: try the host-FPU path via
 * float32_gen2/float64_gen2, falling back to the soft implementation
 * when the pre/post checks reject the inputs or result.
 */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
1396
/* Public entry points: add/sub with hardfloat acceleration. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1420
1421 static bfloat16 QEMU_FLATTEN
1422 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
1423 {
1424 FloatParts64 pa, pb, *pr;
1425
1426 bfloat16_unpack_canonical(&pa, a, status);
1427 bfloat16_unpack_canonical(&pb, b, status);
1428 pr = parts_addsub(&pa, &pb, status, subtract);
1429
1430 return bfloat16_round_pack_canonical(pr, status);
1431 }
1432
1433 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1434 {
1435 return bfloat16_addsub(a, b, status, false);
1436 }
1437
1438 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1439 {
1440 return bfloat16_addsub(a, b, status, true);
1441 }
1442
1443 static float128 QEMU_FLATTEN
1444 float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
1445 {
1446 FloatParts128 pa, pb, *pr;
1447
1448 float128_unpack_canonical(&pa, a, status);
1449 float128_unpack_canonical(&pb, b, status);
1450 pr = parts_addsub(&pa, &pb, status, subtract);
1451
1452 return float128_round_pack_canonical(pr, status);
1453 }
1454
1455 float128 float128_add(float128 a, float128 b, float_status *status)
1456 {
1457 return float128_addsub(a, b, status, false);
1458 }
1459
1460 float128 float128_sub(float128 a, float128 b, float_status *status)
1461 {
1462 return float128_addsub(a, b, status, true);
1463 }
1464
1465 /*
1466 * Multiplication
1467 */
1468
1469 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1470 {
1471 FloatParts64 pa, pb, *pr;
1472
1473 float16_unpack_canonical(&pa, a, status);
1474 float16_unpack_canonical(&pb, b, status);
1475 pr = parts_mul(&pa, &pb, status);
1476
1477 return float16_round_pack_canonical(pr, status);
1478 }
1479
1480 static float32 QEMU_SOFTFLOAT_ATTR
1481 soft_f32_mul(float32 a, float32 b, float_status *status)
1482 {
1483 FloatParts64 pa, pb, *pr;
1484
1485 float32_unpack_canonical(&pa, a, status);
1486 float32_unpack_canonical(&pb, b, status);
1487 pr = parts_mul(&pa, &pb, status);
1488
1489 return float32_round_pack_canonical(pr, status);
1490 }
1491
1492 static float64 QEMU_SOFTFLOAT_ATTR
1493 soft_f64_mul(float64 a, float64 b, float_status *status)
1494 {
1495 FloatParts64 pa, pb, *pr;
1496
1497 float64_unpack_canonical(&pa, a, status);
1498 float64_unpack_canonical(&pb, b, status);
1499 pr = parts_mul(&pa, &pb, status);
1500
1501 return float64_round_pack_canonical(pr, status);
1502 }
1503
/* Hardfloat multiply kernels: perform the operation on the host FPU. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1513
/*
 * Public multiply entry points: hardfloat fast path with softfloat
 * fallback, reusing the add/sub pre/post operand checks.
 */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1527
1528 bfloat16 QEMU_FLATTEN
1529 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1530 {
1531 FloatParts64 pa, pb, *pr;
1532
1533 bfloat16_unpack_canonical(&pa, a, status);
1534 bfloat16_unpack_canonical(&pb, b, status);
1535 pr = parts_mul(&pa, &pb, status);
1536
1537 return bfloat16_round_pack_canonical(pr, status);
1538 }
1539
1540 float128 QEMU_FLATTEN
1541 float128_mul(float128 a, float128 b, float_status *status)
1542 {
1543 FloatParts128 pa, pb, *pr;
1544
1545 float128_unpack_canonical(&pa, a, status);
1546 float128_unpack_canonical(&pb, b, status);
1547 pr = parts_mul(&pa, &pb, status);
1548
1549 return float128_round_pack_canonical(pr, status);
1550 }
1551
1552 /*
1553 * Fused multiply-add
1554 */
1555
1556 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1557 int flags, float_status *status)
1558 {
1559 FloatParts64 pa, pb, pc, *pr;
1560
1561 float16_unpack_canonical(&pa, a, status);
1562 float16_unpack_canonical(&pb, b, status);
1563 float16_unpack_canonical(&pc, c, status);
1564 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1565
1566 return float16_round_pack_canonical(pr, status);
1567 }
1568
1569 static float32 QEMU_SOFTFLOAT_ATTR
1570 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1571 float_status *status)
1572 {
1573 FloatParts64 pa, pb, pc, *pr;
1574
1575 float32_unpack_canonical(&pa, a, status);
1576 float32_unpack_canonical(&pb, b, status);
1577 float32_unpack_canonical(&pc, c, status);
1578 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1579
1580 return float32_round_pack_canonical(pr, status);
1581 }
1582
1583 static float64 QEMU_SOFTFLOAT_ATTR
1584 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1585 float_status *status)
1586 {
1587 FloatParts64 pa, pb, pc, *pr;
1588
1589 float64_unpack_canonical(&pa, a, status);
1590 float64_unpack_canonical(&pb, b, status);
1591 float64_unpack_canonical(&pc, c, status);
1592 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1593
1594 return float64_round_pack_canonical(pr, status);
1595 }
1596
/* Test knob: when set, skip the host-fma fast path entirely. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The host FPU can only be used under compatible rounding/flag state. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The halve-result variant has no host equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* All three inputs must be zero-or-normal for the fast path. */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Compute the signed zero product explicitly, then add C. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /*
             * Result at or below the smallest normal: it may have
             * underflowed, so redo in softfloat with the original
             * (un-negated) inputs to get the flags right.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1669
1670 float64 QEMU_FLATTEN
1671 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1672 {
1673 union_float64 ua, ub, uc, ur;
1674
1675 ua.s = xa;
1676 ub.s = xb;
1677 uc.s = xc;
1678
1679 if (unlikely(!can_use_fpu(s))) {
1680 goto soft;
1681 }
1682 if (unlikely(flags & float_muladd_halve_result)) {
1683 goto soft;
1684 }
1685
1686 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1687 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1688 goto soft;
1689 }
1690
1691 if (unlikely(force_soft_fma)) {
1692 goto soft;
1693 }
1694
1695 /*
1696 * When (a || b) == 0, there's no need to check for under/over flow,
1697 * since we know the addend is (normal || 0) and the product is 0.
1698 */
1699 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1700 union_float64 up;
1701 bool prod_sign;
1702
1703 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1704 prod_sign ^= !!(flags & float_muladd_negate_product);
1705 up.s = float64_set_sign(float64_zero, prod_sign);
1706
1707 if (flags & float_muladd_negate_c) {
1708 uc.h = -uc.h;
1709 }
1710 ur.h = up.h + uc.h;
1711 } else {
1712 union_float64 ua_orig = ua;
1713 union_float64 uc_orig = uc;
1714
1715 if (flags & float_muladd_negate_product) {
1716 ua.h = -ua.h;
1717 }
1718 if (flags & float_muladd_negate_c) {
1719 uc.h = -uc.h;
1720 }
1721
1722 ur.h = fma(ua.h, ub.h, uc.h);
1723
1724 if (unlikely(f64_is_inf(ur))) {
1725 float_raise(float_flag_overflow, s);
1726 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1727 ua = ua_orig;
1728 uc = uc_orig;
1729 goto soft;
1730 }
1731 }
1732 if (flags & float_muladd_negate_result) {
1733 return float64_chs(ur.s);
1734 }
1735 return ur.s;
1736
1737 soft:
1738 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1739 }
1740
1741 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1742 int flags, float_status *status)
1743 {
1744 FloatParts64 pa, pb, pc, *pr;
1745
1746 bfloat16_unpack_canonical(&pa, a, status);
1747 bfloat16_unpack_canonical(&pb, b, status);
1748 bfloat16_unpack_canonical(&pc, c, status);
1749 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1750
1751 return bfloat16_round_pack_canonical(pr, status);
1752 }
1753
1754 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
1755 int flags, float_status *status)
1756 {
1757 FloatParts128 pa, pb, pc, *pr;
1758
1759 float128_unpack_canonical(&pa, a, status);
1760 float128_unpack_canonical(&pb, b, status);
1761 float128_unpack_canonical(&pc, c, status);
1762 pr = parts_muladd(&pa, &pb, &pc, flags, status);
1763
1764 return float128_round_pack_canonical(pr, status);
1765 }
1766
1767 /*
1768 * Returns the result of dividing the floating-point value `a' by the
1769 * corresponding value `b'. The operation is performed according to
1770 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1771 */
1772
/*
 * Divide the canonical parts A by B, returning the quotient parts.
 * Normal/normal does the real division; all other class combinations
 * are resolved per IEEE 754 (NaN propagation, invalid, div-by-zero).
 */
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}
1838
1839 float16 float16_div(float16 a, float16 b, float_status *status)
1840 {
1841 FloatParts64 pa, pb, pr;
1842
1843 float16_unpack_canonical(&pa, a, status);
1844 float16_unpack_canonical(&pb, b, status);
1845 pr = div_floats(pa, pb, status);
1846
1847 return float16_round_pack_canonical(&pr, status);
1848 }
1849
1850 static float32 QEMU_SOFTFLOAT_ATTR
1851 soft_f32_div(float32 a, float32 b, float_status *status)
1852 {
1853 FloatParts64 pa, pb, pr;
1854
1855 float32_unpack_canonical(&pa, a, status);
1856 float32_unpack_canonical(&pb, b, status);
1857 pr = div_floats(pa, pb, status);
1858
1859 return float32_round_pack_canonical(&pr, status);
1860 }
1861
1862 static float64 QEMU_SOFTFLOAT_ATTR
1863 soft_f64_div(float64 a, float64 b, float_status *status)
1864 {
1865 FloatParts64 pa, pb, pr;
1866
1867 float64_unpack_canonical(&pa, a, status);
1868 float64_unpack_canonical(&pb, b, status);
1869 pr = div_floats(pa, pb, status);
1870
1871 return float64_round_pack_canonical(&pr, status);
1872 }
1873
/* Hardfloat division kernels: perform the operation on the host FPU. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1883
1884 static bool f32_div_pre(union_float32 a, union_float32 b)
1885 {
1886 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1887 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1888 fpclassify(b.h) == FP_NORMAL;
1889 }
1890 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1891 }
1892
1893 static bool f64_div_pre(union_float64 a, union_float64 b)
1894 {
1895 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1896 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1897 fpclassify(b.h) == FP_NORMAL;
1898 }
1899 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1900 }
1901
/*
 * Post-check for the hardfloat division fast path: a zero quotient is
 * punted to softfloat (only the first argument, the result, is used).
 */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}
1917
/* Public division entry points: hardfloat fast path with soft fallback. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
1931
1932 /*
1933 * Returns the result of dividing the bfloat16
1934 * value `a' by the corresponding value `b'.
1935 */
1936
1937 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1938 {
1939 FloatParts64 pa, pb, pr;
1940
1941 bfloat16_unpack_canonical(&pa, a, status);
1942 bfloat16_unpack_canonical(&pb, b, status);
1943 pr = div_floats(pa, pb, status);
1944
1945 return bfloat16_round_pack_canonical(&pr, status);
1946 }
1947
1948 /*
1949 * Float to Float conversions
1950 *
1951 * Returns the result of converting one float format to another. The
1952 * conversion is performed according to the IEC/IEEE Standard for
1953 * Binary Floating-Point Arithmetic.
1954 *
1955 * The float_to_float helper only needs to take care of raising
1956 * invalid exceptions and handling the conversion on NaNs.
1957 */
1958
/*
 * Adjust canonical parts A for conversion into the format DSTF:
 * handle NaN propagation, and for the Arm alternative-half-precision
 * target (which has neither NaN nor Inf encodings) substitute the
 * mandated replacement values while raising Invalid.
 */
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        /* IEEE target: canonicalize/silence the NaN as required. */
        parts_return_nan(&a, s);
    }
    return a;
}
1993
1994 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1995 {
1996 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1997 FloatParts64 pa, pr;
1998
1999 float16a_unpack_canonical(&pa, a, s, fmt16);
2000 pr = float_to_float(pa, &float32_params, s);
2001 return float32_round_pack_canonical(&pr, s);
2002 }
2003
2004 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2005 {
2006 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2007 FloatParts64 pa, pr;
2008
2009 float16a_unpack_canonical(&pa, a, s, fmt16);
2010 pr = float_to_float(pa, &float64_params, s);
2011 return float64_round_pack_canonical(&pr, s);
2012 }
2013
2014 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2015 {
2016 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2017 FloatParts64 pa, pr;
2018
2019 float32_unpack_canonical(&pa, a, s);
2020 pr = float_to_float(pa, fmt16, s);
2021 return float16a_round_pack_canonical(&pr, s, fmt16);
2022 }
2023
2024 static float64 QEMU_SOFTFLOAT_ATTR
2025 soft_float32_to_float64(float32 a, float_status *s)
2026 {
2027 FloatParts64 pa, pr;
2028
2029 float32_unpack_canonical(&pa, a, s);
2030 pr = float_to_float(pa, &float64_params, s);
2031 return float64_round_pack_canonical(&pr, s);
2032 }
2033
2034 float64 float32_to_float64(float32 a, float_status *s)
2035 {
2036 if (likely(float32_is_normal(a))) {
2037 /* Widening conversion can never produce inexact results. */
2038 union_float32 uf;
2039 union_float64 ud;
2040 uf.s = a;
2041 ud.h = uf.h;
2042 return ud.s;
2043 } else if (float32_is_zero(a)) {
2044 return float64_set_sign(float64_zero, float32_is_neg(a));
2045 } else {
2046 return soft_float32_to_float64(a, s);
2047 }
2048 }
2049
2050 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2051 {
2052 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2053 FloatParts64 pa, pr;
2054
2055 float64_unpack_canonical(&pa, a, s);
2056 pr = float_to_float(pa, fmt16, s);
2057 return float16a_round_pack_canonical(&pr, s, fmt16);
2058 }
2059
/*
 * Remaining float <-> float conversions.  Each unpacks the source to
 * the canonical 64-bit decomposed form, adjusts it to the destination
 * format, then rounds and repacks; narrowing conversions may round and
 * therefore may set exception flags in *s.
 */

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}
2104
2105 /*
2106 * Rounds the floating-point value `a' to an integer, and returns the
2107 * result as a floating-point value. The operation is performed
2108 * according to the IEC/IEEE Standard for Binary Floating-Point
2109 * Arithmetic.
2110 */
2111
/*
 * Round the decomposed value @a to an integral value, using rounding
 * mode @rmode (which may differ from s->float_rounding_mode).
 * @scale is a scalbn-style exponent adjustment applied before
 * rounding; it is clamped to +/-0x10000 so the addition to a.exp
 * stays in a safe range.  Raises float_flag_inexact whenever any
 * fractional bits are discarded.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        /* NaN in, NaN out (possibly quietened/defaulted by *s rules). */
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /*
                 * Rounds to magnitude 1 only when strictly above 1/2
                 * (exp == -1 with fraction above the implicit bit);
                 * exactly 1/2 ties to the even value, 0.
                 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* 1/2 and above round away from zero, to magnitude 1. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                /* Any non-zero fraction rounds to the odd value 1. */
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Mixed integer/fraction: bit weight of integer 1 in frac. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Suppress the increment for the exact tie-to-even case. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Increment only when the result LSB would be even. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                /* Fractional bits present: result will be inexact. */
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the fraction: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2213
/* Round to integral, using the rounding mode held in the float_status. */

float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}
2254
2255 /*
2256 * Returns the result of converting the floating-point value `a' to
2257 * the two's complement integer format. The conversion is performed
2258 * according to the IEC/IEEE Standard for Binary Floating-Point
2259 * Arithmetic---which means in particular that the conversion is
2260 * rounded according to the current rounding mode. If `a' is a NaN,
2261 * the largest positive integer is returned. Otherwise, if the
2262 * conversion overflows, the largest integer with the same sign as `a'
2263 * is returned.
2264 */
2265
/*
 * Round @in as round_to_int() does, then convert to a signed integer
 * saturated to [@min, @max].  For NaN and out-of-range inputs the
 * pre-rounding exception flags are restored (discarding any inexact
 * flag round_to_int() raised) and float_flag_invalid is set instead,
 * with the saturated value returned.
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the (now integral) fraction into integer position. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude exceeds uint64; force the saturation path below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /*
             * Compare magnitudes in uint64_t: -(uint64_t)min is well
             * defined even when min == INT64_MIN.
             */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2309
/*
 * Unpack, scale by 2**scale, round with @rmode and saturate to the
 * requested signed integer range.
 */

int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2399
/* Conversions using the rounding mode currently held in the float_status. */

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Truncating conversions: round toward zero regardless of float_status. */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2494
2495 /*
2496 * Returns the result of converting the floating-point value `a' to
2497 * the two's complement integer format.
2498 */
2499
/* bfloat16 -> signed integer, following the same pattern as above. */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2556
2557 /*
2558 * Returns the result of converting the floating-point value `a' to
2559 * the unsigned integer format. The conversion is performed according
2560 * to the IEC/IEEE Standard for Binary Floating-Point
2561 * Arithmetic---which means in particular that the conversion is
2562 * rounded according to the current rounding mode. If `a' is a NaN,
2563 * the largest unsigned integer is returned. Otherwise, if the
2564 * conversion overflows, the largest unsigned integer is returned. If
2565 * the 'a' is negative, the result is rounded and zero is returned;
2566 * values that do not round to zero will raise the inexact exception
2567 * flag.
2568 */
2569
/*
 * Round @in as round_to_int() does, then convert to an unsigned
 * integer saturated to [0, @max].  NaN, out-of-range and negative
 * inputs restore the pre-rounding exception flags (discarding any
 * inexact flag the rounding raised) and set float_flag_invalid
 * instead, returning the saturated value (0 for negatives).
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative after rounding: invalid for an unsigned result. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the (now integral) fraction into integer position. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2614
/*
 * Unpack, scale by 2**scale, round with @rmode and saturate to the
 * requested unsigned integer range.
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2704
/* Conversions using the rounding mode currently held in the float_status. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Truncating conversions: round toward zero regardless of float_status. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2799
2800 /*
2801 * Returns the result of converting the bfloat16 value `a' to
2802 * the unsigned integer format.
2803 */
2804
/* bfloat16 -> unsigned integer, following the same pattern as above. */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2861
2862 /*
2863 * Integer to float conversions
2864 *
2865 * Returns the result of converting the two's complement integer `a'
2866 * to the floating-point format. The conversion is performed according
2867 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2868 */
2869
2870 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2871 {
2872 FloatParts64 r = { .sign = false };
2873
2874 if (a == 0) {
2875 r.cls = float_class_zero;
2876 } else {
2877 uint64_t f = a;
2878 int shift;
2879
2880 r.cls = float_class_normal;
2881 if (a < 0) {
2882 f = -f;
2883 r.sign = true;
2884 }
2885 shift = clz64(f);
2886 scale = MIN(MAX(scale, -0x10000), 0x10000);
2887
2888 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2889 r.frac = f << shift;
2890 }
2891
2892 return r;
2893 }
2894
/*
 * Signed integer -> float16.  The narrower integer types widen exactly
 * into int64_t, so a single implementation serves all widths.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
2930
/* Signed integer -> float32/float64, all widths via the int64 variant. */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
2992
2993 /*
2994 * Returns the result of converting the two's complement integer `a'
2995 * to the bfloat16 format.
2996 */
2997
/* Signed integer -> bfloat16, all widths via the int64 variant. */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
3028
3029 /*
3030 * Unsigned Integer to float conversions
3031 *
3032 * Returns the result of converting the unsigned integer `a' to the
3033 * floating-point format. The conversion is performed according to the
3034 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3035 */
3036
3037 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3038 {
3039 FloatParts64 r = { .sign = false };
3040 int shift;
3041
3042 if (a == 0) {
3043 r.cls = float_class_zero;
3044 } else {
3045 scale = MIN(MAX(scale, -0x10000), 0x10000);
3046 shift = clz64(a);
3047 r.cls = float_class_normal;
3048 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3049 r.frac = a << shift;
3050 }
3051
3052 return r;
3053 }
3054
/*
 * Unsigned integer -> float16.  The narrower integer types widen
 * exactly into uint64_t, so a single implementation serves all widths.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
3090
/* Unsigned integer -> float32/float64, all widths via the uint64 variant. */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3152
3153 /*
3154 * Returns the result of converting the unsigned integer `a' to the
3155 * bfloat16 format.
3156 */
3157
/* Unsigned integer -> bfloat16, all widths via the uint64 variant. */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3188
3189 /* Float Min/Max */
3190 /* min() and max() functions. These can't be implemented as
3191 * 'compare and pick one input' because that would mishandle
3192 * NaNs and +0 vs -0.
3193 *
3194 * minnum() and maxnum() functions. These are similar to the min()
3195 * and max() functions but if one of the arguments is a QNaN and
3196 * the other is numerical then the numerical argument is returned.
3197 * SNaNs will get quietened before being returned.
3198 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3199 * and maxNum() operations. min() and max() are the typical min/max
3200 * semantics provided by many CPUs which predate that specification.
3201 *
3202 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3204 */
/*
 * Common implementation for the min/max family.  `ismin' selects min vs
 * max, `ieee' selects the IEEE-754 2008 minNum/maxNum NaN semantics,
 * and `ismag' compares magnitudes (minNumMag/maxNumMag).
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both operands are NaN, or non-ieee semantics: propagate a NaN. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /*
         * Collapse each class onto a comparable exponent key: zeroes
         * sort below every normal number, infinities above.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare ignores the signs unless |a| == |b|. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: when negative, the larger magnitude is smaller. */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs: the negative operand is the smaller one;
             * this is what makes -0 order below +0. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3275
/*
 * Instantiate the public float{16,32,64} min/max entry points.  Each
 * expansion unpacks both operands, runs the shared minmax_floats()
 * helper, and repacks the result.  The `isiee' parameter (historical
 * spelling) is forwarded as minmax_floats()'s `ieee' argument.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
3309
/*
 * Same instantiation pattern as MINMAX above, for the bfloat16 type,
 * which does not follow the float##sz naming scheme.
 */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
3328
3329 /* Floating point compare */
/*
 * Order two decomposed values.  With `is_quiet' set, only signaling
 * NaNs raise the invalid flag; otherwise any NaN operand does.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign; against a non-zero the
     * non-zero operand's sign decides the ordering. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both normal with the same sign: compare exponents, then fractions.
     * A larger magnitude means "less" when both operands are negative. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3386
/*
 * Instantiate the softfloat compare helpers for each width; these are
 * the fallback path behind the hardfloat f32/f64_compare wrappers and
 * the direct implementation for float16.
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3402
3403 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3404 {
3405 return soft_f16_compare(a, b, false, s);
3406 }
3407
3408 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3409 {
3410 return soft_f16_compare(a, b, true, s);
3411 }
3412
/*
 * float32 compare with a hardfloat fast path.  The C99 quiet relational
 * macros (isgreater/isless/...) do not raise for quiet NaNs, so ordered
 * results can be returned directly; the unordered case falls back to
 * the soft implementation so guest flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Flush denormal inputs first so hard and soft paths agree. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3441
3442 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3443 {
3444 return f32_compare(a, b, false, s);
3445 }
3446
3447 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3448 {
3449 return f32_compare(a, b, true, s);
3450 }
3451
/*
 * float64 compare with a hardfloat fast path; mirrors f32_compare.
 * Ordered results use the C99 quiet relational macros; the unordered
 * case falls back to the soft path so guest flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Flush denormal inputs first so hard and soft paths agree. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3480
3481 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3482 {
3483 return f64_compare(a, b, false, s);
3484 }
3485
3486 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3487 {
3488 return f64_compare(a, b, true, s);
3489 }
3490
3491 static FloatRelation QEMU_FLATTEN
3492 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3493 {
3494 FloatParts64 pa, pb;
3495
3496 bfloat16_unpack_canonical(&pa, a, s);
3497 bfloat16_unpack_canonical(&pb, b, s);
3498 return compare_floats(pa, pb, is_quiet, s);
3499 }
3500
3501 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3502 {
3503 return soft_bf16_compare(a, b, false, s);
3504 }
3505
3506 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3507 {
3508 return soft_bf16_compare(a, b, true, s);
3509 }
3510
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        /* NaN input: produce the appropriate NaN in place; the
         * float_class_normal test below then fails, so the NaN is
         * returned unscaled. */
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
3528
3529 float16 float16_scalbn(float16 a, int n, float_status *status)
3530 {
3531 FloatParts64 pa, pr;
3532
3533 float16_unpack_canonical(&pa, a, status);
3534 pr = scalbn_decomposed(pa, n, status);
3535 return float16_round_pack_canonical(&pr, status);
3536 }
3537
3538 float32 float32_scalbn(float32 a, int n, float_status *status)
3539 {
3540 FloatParts64 pa, pr;
3541
3542 float32_unpack_canonical(&pa, a, status);
3543 pr = scalbn_decomposed(pa, n, status);
3544 return float32_round_pack_canonical(&pr, status);
3545 }
3546
3547 float64 float64_scalbn(float64 a, int n, float_status *status)
3548 {
3549 FloatParts64 pa, pr;
3550
3551 float64_unpack_canonical(&pa, a, status);
3552 pr = scalbn_decomposed(pa, n, status);
3553 return float64_round_pack_canonical(&pr, status);
3554 }
3555
3556 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3557 {
3558 FloatParts64 pa, pr;
3559
3560 bfloat16_unpack_canonical(&pa, a, status);
3561 pr = scalbn_decomposed(pa, n, status);
3562 return bfloat16_round_pack_canonical(&pr, status);
3563 }
3564
3565 /*
3566 * Square Root
3567 *
3568 * The old softfloat code did an approximation step before zeroing in
3569 * on the final result. However for simpleness we just compute the
3570 * square root by iterating down from the implicit bit to enough extra
3571 * bits to ensure we get a correctly rounded result.
3572 *
3573 * This does mean however the calculation is slower than before,
3574 * especially for 64 bit floats.
3575 */
3576
/*
 * Square root of a decomposed value.  Special cases (NaN, +-0,
 * negative, +inf) are resolved up front; normals use a restoring
 * bit-by-bit square root over the 64-bit fraction.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a; /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* Negative non-zero, non-NaN input: invalid; return default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a; /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            /* This bit belongs in the result; fold it into the partial
             * root and subtract its contribution from the remainder. */
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3636
3637 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3638 {
3639 FloatParts64 pa, pr;
3640
3641 float16_unpack_canonical(&pa, a, status);
3642 pr = sqrt_float(pa, status, &float16_params);
3643 return float16_round_pack_canonical(&pr, status);
3644 }
3645
3646 static float32 QEMU_SOFTFLOAT_ATTR
3647 soft_f32_sqrt(float32 a, float_status *status)
3648 {
3649 FloatParts64 pa, pr;
3650
3651 float32_unpack_canonical(&pa, a, status);
3652 pr = sqrt_float(pa, status, &float32_params);
3653 return float32_round_pack_canonical(&pr, status);
3654 }
3655
3656 static float64 QEMU_SOFTFLOAT_ATTR
3657 soft_f64_sqrt(float64 a, float_status *status)
3658 {
3659 FloatParts64 pa, pr;
3660
3661 float64_unpack_canonical(&pa, a, status);
3662 pr = sqrt_float(pa, status, &float64_params);
3663 return float64_round_pack_canonical(&pr, status);
3664 }
3665
/*
 * float32 sqrt with a hardfloat fast path: only non-negative zero or
 * normal inputs use the host sqrtf(); everything else (NaN, inf,
 * denormal, negative) takes the soft path for correct flag behaviour.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush a denormal input before classifying it. */
    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3692
/*
 * float64 sqrt with a hardfloat fast path; mirrors float32_sqrt.
 * Only non-negative zero or normal inputs use the host sqrt().
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush a denormal input before classifying it. */
    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3719
3720 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3721 {
3722 FloatParts64 pa, pr;
3723
3724 bfloat16_unpack_canonical(&pa, a, status);
3725 pr = sqrt_float(pa, status, &bfloat16_params);
3726 return bfloat16_round_pack_canonical(&pr, status);
3727 }
3728
3729 /*----------------------------------------------------------------------------
3730 | The pattern for a default generated NaN.
3731 *----------------------------------------------------------------------------*/
3732
3733 float16 float16_default_nan(float_status *status)
3734 {
3735 FloatParts64 p;
3736
3737 parts_default_nan(&p, status);
3738 p.frac >>= float16_params.frac_shift;
3739 return float16_pack_raw(&p);
3740 }
3741
3742 float32 float32_default_nan(float_status *status)
3743 {
3744 FloatParts64 p;
3745
3746 parts_default_nan(&p, status);
3747 p.frac >>= float32_params.frac_shift;
3748 return float32_pack_raw(&p);
3749 }
3750
3751 float64 float64_default_nan(float_status *status)
3752 {
3753 FloatParts64 p;
3754
3755 parts_default_nan(&p, status);
3756 p.frac >>= float64_params.frac_shift;
3757 return float64_pack_raw(&p);
3758 }
3759
3760 float128 float128_default_nan(float_status *status)
3761 {
3762 FloatParts128 p;
3763
3764 parts_default_nan(&p, status);
3765 frac_shr(&p, float128_params.frac_shift);
3766 return float128_pack_raw(&p);
3767 }
3768
3769 bfloat16 bfloat16_default_nan(float_status *status)
3770 {
3771 FloatParts64 p;
3772
3773 parts_default_nan(&p, status);
3774 p.frac >>= bfloat16_params.frac_shift;
3775 return bfloat16_pack_raw(&p);
3776 }
3777
3778 /*----------------------------------------------------------------------------
3779 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3780 *----------------------------------------------------------------------------*/
3781
3782 float16 float16_silence_nan(float16 a, float_status *status)
3783 {
3784 FloatParts64 p;
3785
3786 float16_unpack_raw(&p, a);
3787 p.frac <<= float16_params.frac_shift;
3788 parts_silence_nan(&p, status);
3789 p.frac >>= float16_params.frac_shift;
3790 return float16_pack_raw(&p);
3791 }
3792
3793 float32 float32_silence_nan(float32 a, float_status *status)
3794 {
3795 FloatParts64 p;
3796
3797 float32_unpack_raw(&p, a);
3798 p.frac <<= float32_params.frac_shift;
3799 parts_silence_nan(&p, status);
3800 p.frac >>= float32_params.frac_shift;
3801 return float32_pack_raw(&p);
3802 }
3803
3804 float64 float64_silence_nan(float64 a, float_status *status)
3805 {
3806 FloatParts64 p;
3807
3808 float64_unpack_raw(&p, a);
3809 p.frac <<= float64_params.frac_shift;
3810 parts_silence_nan(&p, status);
3811 p.frac >>= float64_params.frac_shift;
3812 return float64_pack_raw(&p);
3813 }
3814
3815 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3816 {
3817 FloatParts64 p;
3818
3819 bfloat16_unpack_raw(&p, a);
3820 p.frac <<= bfloat16_params.frac_shift;
3821 parts_silence_nan(&p, status);
3822 p.frac >>= bfloat16_params.frac_shift;
3823 return bfloat16_pack_raw(&p);
3824 }
3825
3826 float128 float128_silence_nan(float128 a, float_status *status)
3827 {
3828 FloatParts128 p;
3829
3830 float128_unpack_raw(&p, a);
3831 frac_shl(&p, float128_params.frac_shift);
3832 parts_silence_nan(&p, status);
3833 frac_shr(&p, float128_params.frac_shift);
3834 return float128_pack_raw(&p);
3835 }
3836
3837 /*----------------------------------------------------------------------------
3838 | If `a' is denormal and we are in flush-to-zero mode then set the
3839 | input-denormal exception and return zero. Otherwise just return the value.
3840 *----------------------------------------------------------------------------*/
3841
3842 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3843 {
3844 if (p.exp == 0 && p.frac != 0) {
3845 float_raise(float_flag_input_denormal, status);
3846 return true;
3847 }
3848
3849 return false;
3850 }
3851
3852 float16 float16_squash_input_denormal(float16 a, float_status *status)
3853 {
3854 if (status->flush_inputs_to_zero) {
3855 FloatParts64 p;
3856
3857 float16_unpack_raw(&p, a);
3858 if (parts_squash_denormal(p, status)) {
3859 return float16_set_sign(float16_zero, p.sign);
3860 }
3861 }
3862 return a;
3863 }
3864
3865 float32 float32_squash_input_denormal(float32 a, float_status *status)
3866 {
3867 if (status->flush_inputs_to_zero) {
3868 FloatParts64 p;
3869
3870 float32_unpack_raw(&p, a);
3871 if (parts_squash_denormal(p, status)) {
3872 return float32_set_sign(float32_zero, p.sign);
3873 }
3874 }
3875 return a;
3876 }
3877
3878 float64 float64_squash_input_denormal(float64 a, float_status *status)
3879 {
3880 if (status->flush_inputs_to_zero) {
3881 FloatParts64 p;
3882
3883 float64_unpack_raw(&p, a);
3884 if (parts_squash_denormal(p, status)) {
3885 return float64_set_sign(float64_zero, p.sign);
3886 }
3887 }
3888 return a;
3889 }
3890
3891 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3892 {
3893 if (status->flush_inputs_to_zero) {
3894 FloatParts64 p;
3895
3896 bfloat16_unpack_raw(&p, a);
3897 if (parts_squash_denormal(p, status)) {
3898 return bfloat16_set_sign(bfloat16_zero, p.sign);
3899 }
3900 }
3901 return a;
3902 }
3903
3904 /*----------------------------------------------------------------------------
3905 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3906 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3907 | input. If `zSign' is 1, the input is negated before being converted to an
3908 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3909 | is simply rounded to an integer, with the inexact exception raised if the
3910 | input cannot be represented exactly as an integer. However, if the fixed-
3911 | point input is too large, the invalid exception is raised and the largest
3912 | positive or negative integer is returned.
3913 *----------------------------------------------------------------------------*/
3914
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 7 fraction bits below the binary point: 0x40 is the half-way
     * point, 0x7f rounds away from zero. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result would otherwise be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exactly half-way in nearest-even mode: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range (magnitude too large, or negation changed the
     * sign unexpectedly): invalid, return the saturated value. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3962
3963 /*----------------------------------------------------------------------------
3964 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3965 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3966 | and returns the properly rounded 64-bit integer corresponding to the input.
3967 | If `zSign' is 1, the input is negated before being converted to an integer.
3968 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3969 | the inexact exception raised if the input cannot be represented exactly as
3970 | an integer. However, if the fixed-point input is too large, the invalid
3971 | exception is raised and the largest positive or negative integer is
3972 | returned.
3973 *----------------------------------------------------------------------------*/
3974
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the fraction bits below the binary point; `increment'
     * decides whether absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of absZ1 set means the fraction is >= 1/2. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* Exactly half-way in nearest-even mode: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Magnitude exceeded int64 range: invalid, saturate. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4024
4025 /*----------------------------------------------------------------------------
4026 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4027 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4028 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4029 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4030 | with the inexact exception raised if the input cannot be represented exactly
4031 | as an integer. However, if the fixed-point input is too large, the invalid
4032 | exception is raised and the largest unsigned integer is returned.
4033 *----------------------------------------------------------------------------*/
4034
/* NOTE(review): declared int64_t but callers treat the value as a
 * uint64_t bit pattern (UINT64_MAX is returned on overflow). */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the fraction bits below the binary point; `increment'
     * decides whether absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of absZ1 set means the fraction is >= 1/2. */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrapped past UINT64_MAX: invalid, saturate. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exactly half-way in nearest-even mode: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative value that does not round to zero cannot be represented
     * as unsigned: invalid, return 0. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4084
4085 /*----------------------------------------------------------------------------
4086 | Normalizes the subnormal single-precision floating-point value represented
4087 | by the denormalized significand `aSig'. The normalized exponent and
4088 | significand are stored at the locations pointed to by `zExpPtr' and
4089 | `zSigPtr', respectively.
4090 *----------------------------------------------------------------------------*/
4091
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift the leading one of the denormal significand up to bit 23
     * and compensate in the returned exponent. */
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4102
4103 /*----------------------------------------------------------------------------
4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4105 | and significand `zSig', and returns the proper single-precision floating-
4106 | point value corresponding to the abstract input. Ordinarily, the abstract
4107 | value is simply rounded and packed into the single-precision format, with
4108 | the inexact exception raised if the abstract input cannot be represented
4109 | exactly. However, if the abstract value is too large, the overflow and
4110 | inexact exceptions are raised and an infinity or maximal finite value is
4111 | returned. If the abstract value is too small, the input value is rounded to
4112 | a subnormal number, and the underflow and inexact exceptions are raised if
4113 | the abstract input cannot be represented exactly as a subnormal single-
4114 | precision floating-point number.
4115 | The input significand `zSig' has its binary point between bits 30
4116 | and 29, which is 7 bits to the left of the usual location. This shifted
4117 | significand must be normalized or smaller. If `zSig' is not normalized,
4118 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4119 | and it must not require rounding. In the usual case that `zSig' is
4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4122 | Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 7 bits below the float32 fraction: 0x40 is half-way, 0x7f rounds
     * away from zero. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Unsigned compare catches both overflow (zExp >= 0xFD) and
     * underflow (zExp < 0 wraps to a large uint16_t). */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -!overflow_to_inf is 0 for infinity, or all-ones which
             * (via packFloat32's additive packing) yields the largest
             * finite float32. */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exactly half-way in nearest-even mode: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4201
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and significand `zSig', and returns the proper single-precision floating-
4205 | point value corresponding to the abstract input. This routine is just like
4206 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4207 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4208 | floating-point exponent.
4209 *----------------------------------------------------------------------------*/
4210
4211 static float32
4212 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4213 float_status *status)
4214 {
4215 int8_t shiftCount;
4216
4217 shiftCount = clz32(zSig) - 1;
4218 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4219 status);
4220
4221 }
4222
4223 /*----------------------------------------------------------------------------
4224 | Normalizes the subnormal double-precision floating-point value represented
4225 | by the denormalized significand `aSig'. The normalized exponent and
4226 | significand are stored at the locations pointed to by `zExpPtr' and
4227 | `zSigPtr', respectively.
4228 *----------------------------------------------------------------------------*/
4229
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left-justify the fraction into the 53-bit significand position
     * (11 = 64 - 53 bits are left for the sign/exponent fields).  */
    int8_t dist = clz64(aSig) - 11;

    *zExpPtr = 1 - dist;
    *zSigPtr = aSig << dist;
}
4240
4241 /*----------------------------------------------------------------------------
4242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4243 | double-precision floating-point value, returning the result. After being
4244 | shifted into the proper positions, the three fields are simply added
4245 | together to form the result. This means that any integer portion of `zSig'
4246 | will be added into the exponent. Since a properly normalized significand
4247 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4248 | than the desired result exponent whenever `zSig' is a complete, normalized
4249 | significand.
4250 *----------------------------------------------------------------------------*/
4251
4252 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4253 {
4254
4255 return make_float64(
4256 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4257
4258 }
4259
4260 /*----------------------------------------------------------------------------
4261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4262 | and significand `zSig', and returns the proper double-precision floating-
4263 | point value corresponding to the abstract input. Ordinarily, the abstract
4264 | value is simply rounded and packed into the double-precision format, with
4265 | the inexact exception raised if the abstract input cannot be represented
4266 | exactly. However, if the abstract value is too large, the overflow and
4267 | inexact exceptions are raised and an infinity or maximal finite value is
4268 | returned. If the abstract value is too small, the input value is rounded to
4269 | a subnormal number, and the underflow and inexact exceptions are raised if
4270 | the abstract input cannot be represented exactly as a subnormal double-
4271 | precision floating-point number.
4272 | The input significand `zSig' has its binary point between bits 62
4273 | and 61, which is 10 bits to the left of the usual location. This shifted
4274 | significand must be normalized or smaller. If `zSig' is not normalized,
4275 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4276 | and it must not require rounding. In the usual case that `zSig' is
4277 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4278 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4279 | Binary Floating-Point Arithmetic.
4280 *----------------------------------------------------------------------------*/
4281
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the 10 bits below the rounding point:
     * 0x200 (half-ULP) for the nearest modes, 0x3ff (just under one
     * ULP) when rounding away from zero in a directed mode, 0 when
     * truncating.  Round-to-odd bumps only if bit 10 (the result LSB)
     * is currently clear.  */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The (uint16_t) cast maps negative exponents to large values, so a
     * single compare detects both the overflow and underflow ranges.  */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: produce infinity, or the largest finite value
             * when the rounding mode forbids crossing to infinity
             * (round-to-odd, or a directed mode pointing inward).  */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: before-rounding targets always count; otherwise
             * tiny only if the value cannot round up into the normal
             * range (top bit would not be produced by the increment).  */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Add the increment, drop the 10 guard bits; on an exact tie under
     * nearest-even, clear the LSB so the result lands on even.  */
    zSig = ( zSig + roundIncrement )>>10;
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4357
4358 /*----------------------------------------------------------------------------
4359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4360 | and significand `zSig', and returns the proper double-precision floating-
4361 | point value corresponding to the abstract input. This routine is just like
4362 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4364 | floating-point exponent.
4365 *----------------------------------------------------------------------------*/
4366
4367 static float64
4368 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4369 float_status *status)
4370 {
4371 int8_t shiftCount;
4372
4373 shiftCount = clz64(zSig) - 1;
4374 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4375 status);
4376
4377 }
4378
4379 /*----------------------------------------------------------------------------
4380 | Normalizes the subnormal extended double-precision floating-point value
4381 | represented by the denormalized significand `aSig'. The normalized exponent
4382 | and significand are stored at the locations pointed to by `zExpPtr' and
4383 | `zSigPtr', respectively.
4384 *----------------------------------------------------------------------------*/
4385
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* The x80 format keeps an explicit integer bit in bit 63, so the
     * significand is left-justified all the way.  */
    int8_t dist = clz64(aSig);

    *zExpPtr = 1 - dist;
    *zSigPtr = aSig << dist;
}
4395
4396 /*----------------------------------------------------------------------------
4397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4398 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4399 | and returns the proper extended double-precision floating-point value
4400 | corresponding to the abstract input. Ordinarily, the abstract value is
4401 | rounded and packed into the extended double-precision format, with the
4402 | inexact exception raised if the abstract input cannot be represented
4403 | exactly. However, if the abstract value is too large, the overflow and
4404 | inexact exceptions are raised and an infinity or maximal finite value is
4405 | returned. If the abstract value is too small, the input value is rounded to
4406 | a subnormal number, and the underflow and inexact exceptions are raised if
4407 | the abstract input cannot be represented exactly as a subnormal extended
4408 | double-precision floating-point number.
4409 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4410 | number of bits as single or double precision, respectively. Otherwise, the
4411 | result is rounded to the full precision of the extended double-precision
4412 | format.
4413 | The input significand must be normalized or smaller. If the input
4414 | significand is not normalized, `zExp' must be 0; in that case, the result
4415 | returned is a subnormal number, and it must not require rounding. The
4416 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4417 | Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
4419
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision (x87 PC field): round within zSig0 at the
     * single- or double-precision bit position.  */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into zSig0's sticky (lowest) bit.  */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare of zExp-1 catches zExp <= 0 as well as the
     * overflow range in one test.  */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny if detected before rounding, or if rounding cannot
             * carry the value up into the normal range.  */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into the integer bit makes the result normal
             * again with exponent 1.  */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the result LSB.  */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1).  */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit significand precision: zSig1 holds all bits below the
     * rounding point (top bit = round bit, rest = sticky).  */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that cannot round up to infinity return the largest
             * finite value representable at this precision.  */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* The shift changed zSig1, so the increment decision must
             * be recomputed.  */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear the LSB.  */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* A carry into the integer bit makes the value normal
                 * again with exponent 1.  */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize to 1.0.  */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4607
4608 /*----------------------------------------------------------------------------
4609 | Takes an abstract floating-point value having sign `zSign', exponent
4610 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4611 | and returns the proper extended double-precision floating-point value
4612 | corresponding to the abstract input. This routine is just like
4613 | `roundAndPackFloatx80' except that the input significand does not have to be
4614 | normalized.
4615 *----------------------------------------------------------------------------*/
4616
4617 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4618 bool zSign, int32_t zExp,
4619 uint64_t zSig0, uint64_t zSig1,
4620 float_status *status)
4621 {
4622 int8_t shiftCount;
4623
4624 if ( zSig0 == 0 ) {
4625 zSig0 = zSig1;
4626 zSig1 = 0;
4627 zExp -= 64;
4628 }
4629 shiftCount = clz64(zSig0);
4630 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4631 zExp -= shiftCount;
4632 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4633 zSig0, zSig1, status);
4634
4635 }
4636
4637 /*----------------------------------------------------------------------------
4638 | Returns the least-significant 64 fraction bits of the quadruple-precision
4639 | floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
4641
4642 static inline uint64_t extractFloat128Frac1( float128 a )
4643 {
4644
4645 return a.low;
4646
4647 }
4648
4649 /*----------------------------------------------------------------------------
4650 | Returns the most-significant 48 fraction bits of the quadruple-precision
4651 | floating-point value `a'.
4652 *----------------------------------------------------------------------------*/
4653
4654 static inline uint64_t extractFloat128Frac0( float128 a )
4655 {
4656
4657 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4658
4659 }
4660
4661 /*----------------------------------------------------------------------------
4662 | Returns the exponent bits of the quadruple-precision floating-point value
4663 | `a'.
4664 *----------------------------------------------------------------------------*/
4665
4666 static inline int32_t extractFloat128Exp( float128 a )
4667 {
4668
4669 return ( a.high>>48 ) & 0x7FFF;
4670
4671 }
4672
4673 /*----------------------------------------------------------------------------
4674 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4675 *----------------------------------------------------------------------------*/
4676
4677 static inline bool extractFloat128Sign(float128 a)
4678 {
4679 return a.high >> 63;
4680 }
4681
4682 /*----------------------------------------------------------------------------
4683 | Normalizes the subnormal quadruple-precision floating-point value
4684 | represented by the denormalized significand formed by the concatenation of
4685 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4686 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4687 | significand are stored at the location pointed to by `zSig0Ptr', and the
4688 | least significant 64 bits of the normalized significand are stored at the
4689 | location pointed to by `zSig1Ptr'.
4690 *----------------------------------------------------------------------------*/
4691
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t dist;

    if (aSig0 != 0) {
        /* High word non-empty: left-justify into the 49-bit field
         * (the leading 1 ends up in bit 48 of the high word).  */
        dist = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, dist, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - dist;
        return;
    }
    /* High word empty: the whole significand lives in aSig1; shifts of
     * 64 or more are expressed through the exponent instead.  */
    dist = clz64(aSig1) - 15;
    if (dist >= 0) {
        *zSig0Ptr = aSig1 << dist;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-dist);
        *zSig1Ptr = aSig1 << (dist & 63);
    }
    *zExpPtr = -dist - 63;
}
4722
4723 /*----------------------------------------------------------------------------
4724 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4725 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4726 | floating-point value, returning the result. After being shifted into the
4727 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4728 | added together to form the most significant 32 bits of the result. This
4729 | means that any integer portion of `zSig0' will be added into the exponent.
4730 | Since a properly normalized significand will have an integer portion equal
4731 | to 1, the `zExp' input should be 1 less than the desired result exponent
4732 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4733 | significand.
4734 *----------------------------------------------------------------------------*/
4735
4736 static inline float128
4737 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4738 {
4739 float128 z;
4740
4741 z.low = zSig1;
4742 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4743 return z;
4744 }
4745
4746 /*----------------------------------------------------------------------------
4747 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4748 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4749 | and `zSig2', and returns the proper quadruple-precision floating-point value
4750 | corresponding to the abstract input. Ordinarily, the abstract value is
4751 | simply rounded and packed into the quadruple-precision format, with the
4752 | inexact exception raised if the abstract input cannot be represented
4753 | exactly. However, if the abstract value is too large, the overflow and
4754 | inexact exceptions are raised and an infinity or maximal finite value is
4755 | returned. If the abstract value is too small, the input value is rounded to
4756 | a subnormal number, and the underflow and inexact exceptions are raised if
4757 | the abstract input cannot be represented exactly as a subnormal quadruple-
4758 | precision floating-point number.
4759 | The input significand must be normalized or smaller. If the input
4760 | significand is not normalized, `zExp' must be 0; in that case, the result
4761 | returned is a subnormal number, and it must not require rounding. In the
4762 | usual case that the input significand is normalized, `zExp' must be 1 less
4763 | than the ``true'' floating-point exponent. The handling of underflow and
4764 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4766
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether to add one ULP; zSig2 holds every bit below the
     * rounding point (its top bit is the round bit, the rest sticky).  */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round-to-odd: bump only when the current LSB is even.  */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned cast maps negative exponents to huge values, so one
     * compare covers both overflow and underflow handling.  */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that cannot cross to infinity return the largest
             * finite magnitude instead.  */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny if detected before rounding, or if the value cannot
             * round up to the smallest normal.  */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig2 (and, for round-to-odd, the LSB
             * in zSig1), so recompute the increment decision.  */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie under nearest-even: force the LSB back to zero.  */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4878
4879 /*----------------------------------------------------------------------------
4880 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4881 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4882 | returns the proper quadruple-precision floating-point value corresponding
4883 | to the abstract input. This routine is just like `roundAndPackFloat128'
4884 | except that the input significand has fewer bits and does not have to be
4885 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4886 | point exponent.
4887 *----------------------------------------------------------------------------*/
4888
4889 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4890 uint64_t zSig0, uint64_t zSig1,
4891 float_status *status)
4892 {
4893 int8_t shiftCount;
4894 uint64_t zSig2;
4895
4896 if ( zSig0 == 0 ) {
4897 zSig0 = zSig1;
4898 zSig1 = 0;
4899 zExp -= 64;
4900 }
4901 shiftCount = clz64(zSig0) - 15;
4902 if ( 0 <= shiftCount ) {
4903 zSig2 = 0;
4904 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4905 }
4906 else {
4907 shift128ExtraRightJamming(
4908 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4909 }
4910 zExp -= shiftCount;
4911 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4912
4913 }
4914
4915
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the 32-bit two's complement integer `a'
4918 | to the extended double-precision floating-point format. The conversion
4919 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4920 | Arithmetic.
4921 *----------------------------------------------------------------------------*/
4922
4923 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4924 {
4925 bool zSign;
4926 uint32_t absA;
4927 int8_t shiftCount;
4928 uint64_t zSig;
4929
4930 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4931 zSign = ( a < 0 );
4932 absA = zSign ? - a : a;
4933 shiftCount = clz32(absA) + 32;
4934 zSig = absA;
4935 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4936
4937 }
4938
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of converting the 32-bit two's complement integer `a' to
4941 | the quadruple-precision floating-point format. The conversion is performed
4942 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4944
4945 float128 int32_to_float128(int32_t a, float_status *status)
4946 {
4947 bool zSign;
4948 uint32_t absA;
4949 int8_t shiftCount;
4950 uint64_t zSig0;
4951
4952 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4953 zSign = ( a < 0 );
4954 absA = zSign ? - a : a;
4955 shiftCount = clz32(absA) + 17;
4956 zSig0 = absA;
4957 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4958
4959 }
4960
4961 /*----------------------------------------------------------------------------
4962 | Returns the result of converting the 64-bit two's complement integer `a'
4963 | to the extended double-precision floating-point format. The conversion
4964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4965 | Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967
4968 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4969 {
4970 bool zSign;
4971 uint64_t absA;
4972 int8_t shiftCount;
4973
4974 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4975 zSign = ( a < 0 );
4976 absA = zSign ? - a : a;
4977 shiftCount = clz64(absA);
4978 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4979
4980 }
4981
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of converting the 64-bit two's complement integer `a' to
4984 | the quadruple-precision floating-point format. The conversion is performed
4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 float128 int64_to_float128(int64_t a, float_status *status)
4989 {
4990 bool zSign;
4991 uint64_t absA;
4992 int8_t shiftCount;
4993 int32_t zExp;
4994 uint64_t zSig0, zSig1;
4995
4996 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4997 zSign = ( a < 0 );
4998 absA = zSign ? - a : a;
4999 shiftCount = clz64(absA) + 49;
5000 zExp = 0x406E - shiftCount;
5001 if ( 64 <= shiftCount ) {
5002 zSig1 = 0;
5003 zSig0 = absA;
5004 shiftCount -= 64;
5005 }
5006 else {
5007 zSig1 = absA;
5008 zSig0 = 0;
5009 }
5010 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5011 return packFloat128( zSign, zExp, zSig0, zSig1 );
5012
5013 }
5014
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of converting the 64-bit unsigned integer `a'
5017 | to the quadruple-precision floating-point format. The conversion is performed
5018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5020
5021 float128 uint64_to_float128(uint64_t a, float_status *status)
5022 {
5023 if (a == 0) {
5024 return float128_zero;
5025 }
5026 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5027 }
5028
5029 /*----------------------------------------------------------------------------
5030 | Returns the result of converting the single-precision floating-point value
5031 | `a' to the extended double-precision floating-point format. The conversion
5032 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5033 | Arithmetic.
5034 *----------------------------------------------------------------------------*/
5035
floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert the payload through the common-NaN form and
             * quieten the result for the x80 target.  */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity keeps its sign.  */
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit (x80 stores it in the significand),
     * rebias the exponent (0x3FFF - 0x7F = 0x3F80), and left-align the
     * 24-bit significand into 64 bits; always exact.  */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}
5064
5065 /*----------------------------------------------------------------------------
5066 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
5068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5069 | Arithmetic.
5070 *----------------------------------------------------------------------------*/
5071
5072 float128 float32_to_float128(float32 a, float_status *status)
5073 {
5074 bool aSign;
5075 int aExp;
5076 uint32_t aSig;
5077
5078 a = float32_squash_input_denormal(a, status);
5079 aSig = extractFloat32Frac( a );
5080 aExp = extractFloat32Exp( a );
5081 aSign = extractFloat32Sign( a );
5082 if ( aExp == 0xFF ) {
5083 if (aSig) {
5084 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5085 }
5086 return packFloat128( aSign, 0x7FFF, 0, 0 );
5087 }
5088 if ( aExp == 0 ) {
5089 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5090 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5091 --aExp;
5092 }
5093 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5094
5095 }
5096
5097 /*----------------------------------------------------------------------------
5098 | Returns the remainder of the single-precision floating-point value `a'
5099 | with respect to the corresponding value `b'. The operation is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5102
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;                     /* low bits of the integer quotient a/b */
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* a is NaN or infinity: NaNs propagate; rem(inf, x) is invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* b is NaN or infinity: rem(x, inf) is x. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(x, 0) is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: 32-bit arithmetic suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            /* Produce the remaining expDiff quotient bits and the
             * corresponding partial remainder. */
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: retire up to 62 quotient bits per
         * iteration using the 128/64 division estimate. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;  /* never overestimate q */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial quotient for the leftover bit count. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step the quotient past the true remainder while remembering the
     * previous candidate, then keep whichever candidate is closer to
     * zero; on a tie pick the one that makes the quotient even
     * (round-to-nearest-even remainder semantics). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5198
5199
5200
5201 /*----------------------------------------------------------------------------
5202 | Returns the binary exponential of the single-precision floating-point value
5203 | `a'. The operation is performed according to the IEC/IEEE Standard for
5204 | Binary Floating-Point Arithmetic.
5205 |
5206 | Uses the following identities:
5207 |
5208 | 1. -------------------------------------------------------------------------
5209 | x x*ln(2)
5210 | 2 = e
5211 |
5212 | 2. -------------------------------------------------------------------------
5213 | 2 3 4 5 n
5214 | x x x x x x x
5215 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5216 | 1! 2! 3! 4! 5! n!
5217 *----------------------------------------------------------------------------*/
5218
/* Taylor-series coefficients for e^x used by float32_exp2 below:
 * entry n (1-based, per the trailing comments) holds 1/n! as a float64. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1/1! */
    const_float64( 0x3fe0000000000000ll ), /* 1/2! */
    const_float64( 0x3fc5555555555555ll ), /* 1/3! */
    const_float64( 0x3fa5555555555555ll ), /* 1/4! */
    const_float64( 0x3f81111111111111ll ), /* 1/5! */
    const_float64( 0x3f56c16c16c16c17ll ), /* 1/6! */
    const_float64( 0x3f2a01a01a01a01all ), /* 1/7! */
    const_float64( 0x3efa01a01a01a01all ), /* 1/8! */
    const_float64( 0x3ec71de3a556c734ll ), /* 1/9! */
    const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
    const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
};
5237
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    /* NaN propagates; 2^-inf = 0 and 2^+inf = +inf. */
    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return (aSign) ? float32_zero : a;
    }
    /* 2^+-0 = 1 exactly. */
    if (aExp == 0) {
        if (aSig == 0) return float32_one;
    }

    /* All remaining cases are approximated, hence inexact. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2^a = e^(a*ln2); evaluate the e^x Taylor series in double
     * precision for extra headroom. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* r = 1 + sum_{n=1..15} x^n/n!, with xn tracking x^n term by term. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5282
5283 /*----------------------------------------------------------------------------
5284 | Returns the binary log of the single-precision floating-point value `a'.
5285 | The operation is performed according to the IEC/IEEE Standard for Binary
5286 | Floating-Point Arithmetic.
5287 *----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+-0) = -infinity. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* log2 of a negative value is invalid. */
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* NaN propagates; log2(+inf) = +inf. */
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are generated one per iteration by repeatedly squaring the
     * significand (digit-by-digit logarithm). */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            /* Square reached 2: emit a 1 bit and renormalize. */
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
5332
5333 /*----------------------------------------------------------------------------
5334 | Returns the result of converting the double-precision floating-point value
5335 | `a' to the extended double-precision floating-point format. The conversion
5336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5337 | Arithmetic.
5338 *----------------------------------------------------------------------------*/
5339
5340 floatx80 float64_to_floatx80(float64 a, float_status *status)
5341 {
5342 bool aSign;
5343 int aExp;
5344 uint64_t aSig;
5345
5346 a = float64_squash_input_denormal(a, status);
5347 aSig = extractFloat64Frac( a );
5348 aExp = extractFloat64Exp( a );
5349 aSign = extractFloat64Sign( a );
5350 if ( aExp == 0x7FF ) {
5351 if (aSig) {
5352 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5353 status);
5354 return floatx80_silence_nan(res, status);
5355 }
5356 return packFloatx80(aSign,
5357 floatx80_infinity_high,
5358 floatx80_infinity_low);
5359 }
5360 if ( aExp == 0 ) {
5361 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5362 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5363 }
5364 return
5365 packFloatx80(
5366 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5367
5368 }
5369
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of converting the double-precision floating-point value
5372 | `a' to the quadruple-precision floating-point format. The conversion is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5374 | Arithmetic.
5375 *----------------------------------------------------------------------------*/
5376
float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        /* NaN converts through the canonical common-NaN form. */
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        --aExp;  /* packing will re-add the implicit integer bit */
    }
    /* Re-bias the exponent (0x3FFF - 0x3FF = 0x3C00) and align the
     * 52-bit fraction at the top of the 112-bit float128 fraction. */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
}
5402
5403
5404 /*----------------------------------------------------------------------------
5405 | Returns the remainder of the double-precision floating-point value `a'
5406 | with respect to the corresponding value `b'. The operation is performed
5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5409
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;      /* q: low bits of the integer quotient */
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* a is NaN or infinity: NaNs propagate; rem(inf, x) is invalid. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* b is NaN or infinity: rem(x, inf) is x. */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(x, 0) is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-align. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Retire up to 62 quotient bits per iteration using the 128/64
     * division estimate, keeping a partial remainder in aSig. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;  /* never overestimate q */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    /* Final partial quotient for the leftover bit count. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step the quotient past the true remainder while remembering the
     * previous candidate, then keep whichever candidate is closer to
     * zero; on a tie pick the one that makes the quotient even
     * (round-to-nearest-even remainder semantics). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
}
5491
5492 /*----------------------------------------------------------------------------
5493 | Returns the binary log of the double-precision floating-point value `a'.
5494 | The operation is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+-0) = -infinity. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* log2 of a negative value is invalid. */
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* NaN propagates; log2(+inf) = +inf. */
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are generated one per iteration by squaring the significand
     * (digit-by-digit logarithm). */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            /* Square reached 2: emit a 1 bit and renormalize. */
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5540
5541 /*----------------------------------------------------------------------------
5542 | Returns the result of converting the extended double-precision floating-
5543 | point value `a' to the 32-bit two's complement integer format. The
5544 | conversion is performed according to the IEC/IEEE Standard for Binary
5545 | Floating-Point Arithmetic---which means in particular that the conversion
5546 | is rounded according to the current rounding mode. If `a' is a NaN, the
5547 | largest positive integer is returned. Otherwise, if the conversion
5548 | overflows, the largest integer with the same sign as `a' is returned.
5549 *----------------------------------------------------------------------------*/
5550
5551 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5552 {
5553 bool aSign;
5554 int32_t aExp, shiftCount;
5555 uint64_t aSig;
5556
5557 if (floatx80_invalid_encoding(a)) {
5558 float_raise(float_flag_invalid, status);
5559 return 1 << 31;
5560 }
5561 aSig = extractFloatx80Frac( a );
5562 aExp = extractFloatx80Exp( a );
5563 aSign = extractFloatx80Sign( a );
5564 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5565 shiftCount = 0x4037 - aExp;
5566 if ( shiftCount <= 0 ) shiftCount = 1;
5567 shift64RightJamming( aSig, shiftCount, &aSig );
5568 return roundAndPackInt32(aSign, aSig, status);
5569
5570 }
5571
5572 /*----------------------------------------------------------------------------
5573 | Returns the result of converting the extended double-precision floating-
5574 | point value `a' to the 32-bit two's complement integer format. The
5575 | conversion is performed according to the IEC/IEEE Standard for Binary
5576 | Floating-Point Arithmetic, except that the conversion is always rounded
5577 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5578 | Otherwise, if the conversion overflows, the largest integer with the same
5579 | sign as `a' is returned.
5580 *----------------------------------------------------------------------------*/
5581
5582 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5583 {
5584 bool aSign;
5585 int32_t aExp, shiftCount;
5586 uint64_t aSig, savedASig;
5587 int32_t z;
5588
5589 if (floatx80_invalid_encoding(a)) {
5590 float_raise(float_flag_invalid, status);
5591 return 1 << 31;
5592 }
5593 aSig = extractFloatx80Frac( a );
5594 aExp = extractFloatx80Exp( a );
5595 aSign = extractFloatx80Sign( a );
5596 if ( 0x401E < aExp ) {
5597 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5598 goto invalid;
5599 }
5600 else if ( aExp < 0x3FFF ) {
5601 if (aExp || aSig) {
5602 float_raise(float_flag_inexact, status);
5603 }
5604 return 0;
5605 }
5606 shiftCount = 0x403E - aExp;
5607 savedASig = aSig;
5608 aSig >>= shiftCount;
5609 z = aSig;
5610 if ( aSign ) z = - z;
5611 if ( ( z < 0 ) ^ aSign ) {
5612 invalid:
5613 float_raise(float_flag_invalid, status);
5614 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5615 }
5616 if ( ( aSig<<shiftCount ) != savedASig ) {
5617 float_raise(float_flag_inexact, status);
5618 }
5619 return z;
5620
5621 }
5622
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the extended double-precision floating-
5625 | point value `a' to the 64-bit two's complement integer format. The
5626 | conversion is performed according to the IEC/IEEE Standard for Binary
5627 | Floating-Point Arithmetic---which means in particular that the conversion
5628 | is rounded according to the current rounding mode. If `a' is a NaN,
5629 | the largest positive integer is returned. Otherwise, if the conversion
5630 | overflows, the largest integer with the same sign as `a' is returned.
5631 *----------------------------------------------------------------------------*/
5632
5633 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5634 {
5635 bool aSign;
5636 int32_t aExp, shiftCount;
5637 uint64_t aSig, aSigExtra;
5638
5639 if (floatx80_invalid_encoding(a)) {
5640 float_raise(float_flag_invalid, status);
5641 return 1ULL << 63;
5642 }
5643 aSig = extractFloatx80Frac( a );
5644 aExp = extractFloatx80Exp( a );
5645 aSign = extractFloatx80Sign( a );
5646 shiftCount = 0x403E - aExp;
5647 if ( shiftCount <= 0 ) {
5648 if ( shiftCount ) {
5649 float_raise(float_flag_invalid, status);
5650 if (!aSign || floatx80_is_any_nan(a)) {
5651 return INT64_MAX;
5652 }
5653 return INT64_MIN;
5654 }
5655 aSigExtra = 0;
5656 }
5657 else {
5658 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5659 }
5660 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5661
5662 }
5663
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of converting the extended double-precision floating-
5666 | point value `a' to the 64-bit two's complement integer format. The
5667 | conversion is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic, except that the conversion is always rounded
5669 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5670 | Otherwise, if the conversion overflows, the largest integer with the same
5671 | sign as `a' is returned.
5672 *----------------------------------------------------------------------------*/
5673
5674 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5675 {
5676 bool aSign;
5677 int32_t aExp, shiftCount;
5678 uint64_t aSig;
5679 int64_t z;
5680
5681 if (floatx80_invalid_encoding(a)) {
5682 float_raise(float_flag_invalid, status);
5683 return 1ULL << 63;
5684 }
5685 aSig = extractFloatx80Frac( a );
5686 aExp = extractFloatx80Exp( a );
5687 aSign = extractFloatx80Sign( a );
5688 shiftCount = aExp - 0x403E;
5689 if ( 0 <= shiftCount ) {
5690 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5691 if ( ( a.high != 0xC03E ) || aSig ) {
5692 float_raise(float_flag_invalid, status);
5693 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5694 return INT64_MAX;
5695 }
5696 }
5697 return INT64_MIN;
5698 }
5699 else if ( aExp < 0x3FFF ) {
5700 if (aExp | aSig) {
5701 float_raise(float_flag_inexact, status);
5702 }
5703 return 0;
5704 }
5705 z = aSig>>( - shiftCount );
5706 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5707 float_raise(float_flag_inexact, status);
5708 }
5709 if ( aSign ) z = - z;
5710 return z;
5711
5712 }
5713
5714 /*----------------------------------------------------------------------------
5715 | Returns the result of converting the extended double-precision floating-
5716 | point value `a' to the single-precision floating-point format. The
5717 | conversion is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5720
float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* The shift drops the explicit integer bit: a nonzero rest means
         * NaN, zero means infinity. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand for float32, jamming shifted-out
     * bits into the low bit so rounding stays correct. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;  /* re-bias for float32 */
    return roundAndPackFloat32(aSign, aExp, aSig, status);
}
5747
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of converting the extended double-precision floating-
5750 | point value `a' to the double-precision floating-point format. The
5751 | conversion is performed according to the IEC/IEEE Standard for Binary
5752 | Floating-Point Arithmetic.
5753 *----------------------------------------------------------------------------*/
5754
5755 float64 floatx80_to_float64(floatx80 a, float_status *status)
5756 {
5757 bool aSign;
5758 int32_t aExp;
5759 uint64_t aSig, zSig;
5760
5761 if (floatx80_invalid_encoding(a)) {
5762 float_raise(float_flag_invalid, status);
5763 return float64_default_nan(status);
5764 }
5765 aSig = extractFloatx80Frac( a );
5766 aExp = extractFloatx80Exp( a );
5767 aSign = extractFloatx80Sign( a );
5768 if ( aExp == 0x7FFF ) {
5769 if ( (uint64_t) ( aSig<<1 ) ) {
5770 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5771 status);
5772 return float64_silence_nan(res, status);
5773 }
5774 return packFloat64( aSign, 0x7FF, 0 );
5775 }
5776 shift64RightJamming( aSig, 1, &zSig );
5777 if ( aExp || aSig ) aExp -= 0x3C01;
5778 return roundAndPackFloat64(aSign, aExp, zSig, status);
5779
5780 }
5781
5782 /*----------------------------------------------------------------------------
5783 | Returns the result of converting the extended double-precision floating-
5784 | point value `a' to the quadruple-precision floating-point format. The
5785 | conversion is performed according to the IEC/IEEE Standard for Binary
5786 | Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
5788
float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Drop the explicit integer bit (aSig<<1) and right-align the 63
     * remaining fraction bits into the 112-bit float128 fraction; both
     * formats share the 0x3FFF bias, so no exponent adjustment. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );
}
5811
5812 /*----------------------------------------------------------------------------
5813 | Rounds the extended double-precision floating-point value `a'
5814 | to the precision provided by floatx80_rounding_precision and returns the
5815 | result as an extended double-precision floating-point value.
5816 | The operation is performed according to the IEC/IEEE Standard for Binary
5817 | Floating-Point Arithmetic.
5818 *----------------------------------------------------------------------------*/
5819
5820 floatx80 floatx80_round(floatx80 a, float_status *status)
5821 {
5822 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5823 extractFloatx80Sign(a),
5824 extractFloatx80Exp(a),
5825 extractFloatx80Frac(a), 0, status);
5826 }
5827
5828 /*----------------------------------------------------------------------------
5829 | Rounds the extended double-precision floating-point value `a' to an integer,
5830 | and returns the result as an extended quadruple-precision floating-point
5831 | value. The operation is performed according to the IEC/IEEE Standard for
5832 | Binary Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5834
floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* |a| >= 2^63 (or NaN/inf): the value is already integral. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: the result is 0 or +/-1 depending on rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Values strictly between 1/2 and 1 round to 1; exactly
             * 1/2 ties to the even value 0. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
                 ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* Values in [1/2, 1) round away from zero to 1. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: clear the significand bits below the units place,
     * after a mode-dependent adjustment. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* Ties round to even: clear the units bit when the discarded
         * part was exactly one half. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* The adjustment carried out of the significand: step the exponent
     * and reset the significand to 1.0. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
5930
5931 /*----------------------------------------------------------------------------
5932 | Returns the result of adding the absolute values of the extended double-
5933 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5934 | negated before being returned. `zSign' is ignored if the result is a NaN.
5935 | The addition is performed according to the IEC/IEEE Standard for Binary
5936 | Floating-Point Arithmetic.
5937 *----------------------------------------------------------------------------*/
5938
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;  /* denormal b effectively has exponent 1 */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;  /* denormal a effectively has exponent 1 */
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit set means no carry out of the 64-bit sum: already
     * normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the sum: shift right by one, restore the integer
     * bit, and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6010
6011 /*----------------------------------------------------------------------------
6012 | Returns the result of subtracting the absolute values of the extended
6013 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6014 | difference is negated before being returned. `zSign' is ignored if the
6015 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6016 | Standard for Binary Floating-Point Arithmetic.
6017 *----------------------------------------------------------------------------*/
6018
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf - inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Denormals behave as if their exponent were 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero's sign depends on the rounding mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    /* Align a's significand to b's exponent before subtracting. */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;  /* |b| > |a| flips the sign of the difference */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    /* Align b's significand to a's exponent before subtracting. */
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6079
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of adding the extended double-precision floating-point
6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6085
6086 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6087 {
6088 bool aSign, bSign;
6089
6090 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6091 float_raise(float_flag_invalid, status);
6092 return floatx80_default_nan(status);
6093 }
6094 aSign = extractFloatx80Sign( a );
6095 bSign = extractFloatx80Sign( b );
6096 if ( aSign == bSign ) {
6097 return addFloatx80Sigs(a, b, aSign, status);
6098 }
6099 else {
6100 return subFloatx80Sigs(a, b, aSign, status);
6101 }
6102
6103 }
6104
6105 /*----------------------------------------------------------------------------
6106 | Returns the result of subtracting the extended double-precision floating-
6107 | point values `a' and `b'. The operation is performed according to the
6108 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6109 *----------------------------------------------------------------------------*/
6110
6111 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6112 {
6113 bool aSign, bSign;
6114
6115 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6116 float_raise(float_flag_invalid, status);
6117 return floatx80_default_nan(status);
6118 }
6119 aSign = extractFloatx80Sign( a );
6120 bSign = extractFloatx80Sign( b );
6121 if ( aSign == bSign ) {
6122 return subFloatx80Sigs(a, b, aSign, status);
6123 }
6124 else {
6125 return addFloatx80Sigs(a, b, aSign, status);
6126 }
6127
6128 }
6129
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of multiplying the extended double-precision floating-
6132 | point values `a' and `b'. The operation is performed according to the
6133 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6134 *----------------------------------------------------------------------------*/
6135
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN; NaN operands propagate. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* 64x64 -> 128-bit significand product. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two [1,2) significands lies in [1,4); shift left once
       if the top bit is clear so the result is normalized. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6191
6192 /*----------------------------------------------------------------------------
6193 | Returns the result of dividing the extended double-precision floating-point
6194 | value `a' by the corresponding value `b'. The operation is performed
6195 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6196 *----------------------------------------------------------------------------*/
6197
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        /* Inf / finite = Inf. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; nonzero / 0 raises divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Ensure the dividend significand is less than the divisor so the
       64-bit quotient estimates below do not overflow. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then decrement until the partial
       remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; only refine when the estimate is close enough
       to a rounding boundary for the correction to matter. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Sticky bit: a nonzero remainder means the quotient is inexact. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6278
6279 /*----------------------------------------------------------------------------
6280 | Returns the remainder of the extended double-precision floating-point value
6281 | `a' with respect to the corresponding value `b'. The operation is performed
6282 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6283 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6284 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6285 | the absolute value of the integer quotient.
6286 *----------------------------------------------------------------------------*/
6287
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; Inf % anything is invalid. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite % Inf = a, unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        /* x % 0 is invalid. */
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| too small for any quotient bits: the result is a itself
           (for mod, or when |a| < |b|/2). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit; subtract the divisor once if it fits. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    /* Long division: generate 62 quotient bits per iteration, carrying
       the partial remainder in aSig0:aSig1. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* Under-estimate by 2 so the remainder never goes negative. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial quotient of expDiff bits, with exact correction
           of the under-estimate. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder rounds the quotient to nearest-even: if the
           complementary remainder is smaller (or equal with an odd
           quotient), use it and flip the result's sign. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6408
6409 /*----------------------------------------------------------------------------
6410 | Returns the remainder of the extended double-precision floating-point value
6411 | `a' with respect to the corresponding value `b'. The operation is performed
6412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6413 *----------------------------------------------------------------------------*/
6414
6415 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6416 {
6417 uint64_t quotient;
6418 return floatx80_modrem(a, b, false, &quotient, status);
6419 }
6420
6421 /*----------------------------------------------------------------------------
6422 | Returns the remainder of the extended double-precision floating-point value
6423 | `a' with respect to the corresponding value `b', with the quotient truncated
6424 | toward zero.
6425 *----------------------------------------------------------------------------*/
6426
6427 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6428 {
6429 uint64_t quotient;
6430 return floatx80_modrem(a, b, true, &quotient, status);
6431 }
6432
6433 /*----------------------------------------------------------------------------
6434 | Returns the square root of the extended double-precision floating-point
6435 | value `a'. The operation is performed according to the IEC/IEEE Standard
6436 | for Binary Floating-Point Arithmetic.
6437 *----------------------------------------------------------------------------*/
6438
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number is invalid, except -0 -> -0. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial ~32-bit root estimate, refined to 64 bits via division. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word until the remainder a - z^2 >= 0. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits; only refine when near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        /* Decrement the low word until the 192-bit remainder >= 0. */
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Sticky bit: a nonzero remainder means the root is inexact. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6503
6504 /*----------------------------------------------------------------------------
6505 | Returns the result of converting the quadruple-precision floating-point
6506 | value `a' to the 32-bit two's complement integer format. The conversion
6507 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6508 | Arithmetic---which means in particular that the conversion is rounded
6509 | according to the current rounding mode. If `a' is a NaN, the largest
6510 | positive integer is returned. Otherwise, if the conversion overflows, the
6511 | largest integer with the same sign as `a' is returned.
6512 *----------------------------------------------------------------------------*/
6513
6514 int32_t float128_to_int32(float128 a, float_status *status)
6515 {
6516 bool aSign;
6517 int32_t aExp, shiftCount;
6518 uint64_t aSig0, aSig1;
6519
6520 aSig1 = extractFloat128Frac1( a );
6521 aSig0 = extractFloat128Frac0( a );
6522 aExp = extractFloat128Exp( a );
6523 aSign = extractFloat128Sign( a );
6524 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6525 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6526 aSig0 |= ( aSig1 != 0 );
6527 shiftCount = 0x4028 - aExp;
6528 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6529 return roundAndPackInt32(aSign, aSig0, status);
6530
6531 }
6532
6533 /*----------------------------------------------------------------------------
6534 | Returns the result of converting the quadruple-precision floating-point
6535 | value `a' to the 32-bit two's complement integer format. The conversion
6536 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6537 | Arithmetic, except that the conversion is always rounded toward zero. If
6538 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6539 | conversion overflows, the largest integer with the same sign as `a' is
6540 | returned.
6541 *----------------------------------------------------------------------------*/
6542
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN converts as positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless a is exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* Make the integer bit explicit and truncate. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign flip after negation means the value did not fit in int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits lost to the shift make the truncation inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6582
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 64-bit two's complement integer format. The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic---which means in particular that the conversion is rounded
6588 | according to the current rounding mode. If `a' is a NaN, the largest
6589 | positive integer is returned. Otherwise, if the conversion overflows, the
6590 | largest integer with the same sign as `a' is returned.
6591 *----------------------------------------------------------------------------*/
6592
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        /* Magnitude >= 2^63: overflow. Saturate toward INT64_MAX unless
           the value is negative and not a NaN (NaN or exactly -2^63 with
           fraction bits is handled by the inner test). */
        if ( 0x403E < aExp ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6625
6626 /*----------------------------------------------------------------------------
6627 | Returns the result of converting the quadruple-precision floating-point
6628 | value `a' to the 64-bit two's complement integer format. The conversion
6629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6630 | Arithmetic, except that the conversion is always rounded toward zero.
6631 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6632 | the conversion overflows, the largest integer with the same sign as `a' is
6633 | returned.
6634 *----------------------------------------------------------------------------*/
6635
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Values that truncate to exactly INT64_MIN are representable;
               any low fraction bits only make the result inexact. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Otherwise the magnitude exceeds int64 range. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Any discarded bits make the truncation inexact. */
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6688
6689 /*----------------------------------------------------------------------------
6690 | Returns the result of converting the quadruple-precision floating-point value
6691 | `a' to the 64-bit unsigned integer format. The conversion is
6692 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6693 | Arithmetic---which means in particular that the conversion is rounded
6694 | according to the current rounding mode. If `a' is a NaN, the largest
6695 | positive integer is returned. If the conversion overflows, the
6696 | largest unsigned integer is returned. If 'a' is negative, the value is
6697 | rounded and zero is returned; negative values that do not round to zero
6698 | will raise the inexact exception.
6699 *----------------------------------------------------------------------------*/
6700
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values with magnitude >= 1, and negative NaNs, cannot
       convert: invalid, saturating to UINT64_MAX for NaN and 0 otherwise. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        /* Make the integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude >= 2^64: overflow. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6735
6736 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6737 {
6738 uint64_t v;
6739 signed char current_rounding_mode = status->float_rounding_mode;
6740
6741 set_float_rounding_mode(float_round_to_zero, status);
6742 v = float128_to_uint64(a, status);
6743 set_float_rounding_mode(current_rounding_mode, status);
6744
6745 return v;
6746 }
6747
6748 /*----------------------------------------------------------------------------
6749 | Returns the result of converting the quadruple-precision floating-point
6750 | value `a' to the 32-bit unsigned integer format. The conversion
6751 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6752 | Arithmetic except that the conversion is always rounded toward zero.
6753 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6754 | if the conversion overflows, the largest unsigned integer is returned.
6755 | If 'a' is negative, the value is rounded and zero is returned; negative
6756 | values that do not round to zero will raise the inexact exception.
6757 *----------------------------------------------------------------------------*/
6758
6759 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6760 {
6761 uint64_t v;
6762 uint32_t res;
6763 int old_exc_flags = get_float_exception_flags(status);
6764
6765 v = float128_to_uint64_round_to_zero(a, status);
6766 if (v > 0xffffffff) {
6767 res = 0xffffffff;
6768 } else {
6769 return v;
6770 }
6771 set_float_exception_flags(old_exc_flags, status);
6772 float_raise(float_flag_invalid, status);
6773 return res;
6774 }
6775
6776 /*----------------------------------------------------------------------------
6777 | Returns the result of converting the quadruple-precision floating-point value
6778 | `a' to the 32-bit unsigned integer format. The conversion is
6779 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6780 | Arithmetic---which means in particular that the conversion is rounded
6781 | according to the current rounding mode. If `a' is a NaN, the largest
6782 | positive integer is returned. If the conversion overflows, the
6783 | largest unsigned integer is returned. If 'a' is negative, the value is
6784 | rounded and zero is returned; negative values that do not round to zero
6785 | will raise the inexact exception.
6786 *----------------------------------------------------------------------------*/
6787
6788 uint32_t float128_to_uint32(float128 a, float_status *status)
6789 {
6790 uint64_t v;
6791 uint32_t res;
6792 int old_exc_flags = get_float_exception_flags(status);
6793
6794 v = float128_to_uint64(a, status);
6795 if (v > 0xffffffff) {
6796 res = 0xffffffff;
6797 } else {
6798 return v;
6799 }
6800 set_float_exception_flags(old_exc_flags, status);
6801 float_raise(float_flag_invalid, status);
6802 return res;
6803 }
6804
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the single-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6809 | Arithmetic.
6810 *----------------------------------------------------------------------------*/
6811
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN payload converted through the canonical common-NaN form. */
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low word into a sticky bit, then narrow the fraction,
       jamming discarded bits. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Make the integer bit explicit and rebias the exponent for
           roundAndPackFloat32. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6839
6840 /*----------------------------------------------------------------------------
6841 | Returns the result of converting the quadruple-precision floating-point
6842 | value `a' to the double-precision floating-point format. The conversion
6843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6844 | Arithmetic.
6845 *----------------------------------------------------------------------------*/
6846
6847 float64 float128_to_float64(float128 a, float_status *status)
6848 {
6849 bool aSign;
6850 int32_t aExp;
6851 uint64_t aSig0, aSig1;
6852
6853 aSig1 = extractFloat128Frac1( a );
6854 aSig0 = extractFloat128Frac0( a );
6855 aExp = extractFloat128Exp( a );
6856 aSign = extractFloat128Sign( a );
6857 if ( aExp == 0x7FFF ) {
6858 if ( aSig0 | aSig1 ) {
6859 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6860 }
6861 return packFloat64( aSign, 0x7FF, 0 );
6862 }
6863 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6864 aSig0 |= ( aSig1 != 0 );
6865 if ( aExp || aSig0 ) {
6866 aSig0 |= UINT64_C(0x4000000000000000);
6867 aExp -= 0x3C01;
6868 }
6869 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6870
6871 }
6872
6873 /*----------------------------------------------------------------------------
6874 | Returns the result of converting the quadruple-precision floating-point
6875 | value `a' to the extended double-precision floating-point format. The
6876 | conversion is performed according to the IEC/IEEE Standard for Binary
6877 | Floating-Point Arithmetic.
6878 *----------------------------------------------------------------------------*/
6879
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Convert the NaN via the common-NaN form, then quiet it. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Left-align the significand into the 64+64-bit form expected by
       roundAndPackFloatx80 at full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6910
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value. The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* |a| >= 2^48: the bit of weight 1 lies in the low word. */
        if ( 0x406F <= aExp ) {
            /* |a| >= 2^112: already an integer — or Inf/NaN. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the last integer bit of z.low; roundBitsMask
         * selects all fraction bits below it.  The two-step shift avoids an
         * undefined 64-bit shift when aExp == 0x402F. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp, then clear the last bit on exact ties. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the last integer bit is bit 0 of z.high
                 * and all of z.low is fraction. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +/-1 depending on the rounding mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round away only when |a| > 1/2 (exponent 0x3FFE with a
                 * nonzero fraction); an exact half ties to even, i.e. 0. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: the bit of weight 1 lies in a.high and all of
         * a.low is fraction. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                /* Exact tie: round to even. */
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* Fold a.low into a sticky bit so any nonzero fraction
                 * forces the increment. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change from the input means the value was not an integer. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
7069
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* Special cases: NaNs, infinities, zeros and subnormal operands. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid; nonzero / 0 signals divide-by-zero. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Left-align both significands (implicit integer bit restored). */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Keep the dividend strictly below the divisor so the quotient
         * estimate cannot overflow 64 bits. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then correct downward by adding
     * the divisor back while the partial remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Next 64 quotient bits; refine only when the estimate is close enough
     * to a rounding boundary for the error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold any sticky remainder into the last bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7156
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* Special cases: NaNs, Inf operands, zero divisor, subnormals. */
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(Inf, b) is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(a, Inf) = a. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(a, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| far below |b|: the remainder is a itself. */
    if ( expDiff < -1 ) return a;
    /* Align the significands; shift one bit less when expDiff == -1 so the
     * dividend stays comparable to the divisor. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce 61 quotient bits per iteration; the estimate is lowered by 4
     * so it can never exceed the true quotient. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial reduction covering the remaining exponent difference. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Step one subtraction past the true quotient, then pick whichever of
     * the last two partial remainders is nearest to zero (ties go to the
     * even quotient). */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* The chosen remainder may be negative; record its sign and negate. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7263
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* The result exponent is half the unbiased input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit seed for the root, refined to 64 bits via estimateDiv128To64;
     * the operand shift depends on the exponent's parity. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the 64-bit root downward until a - zSig0^2 is nonnegative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Next 64 root bits; refine only when the estimate is close enough to
     * a rounding boundary for the error to matter. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any sticky remainder into the last bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7331
/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b' and returns a
| FloatRelation.  When `is_quiet' is true, only signaling NaNs raise the
| invalid exception; otherwise any NaN operand does.
*----------------------------------------------------------------------------*/
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Invalid encodings compare unordered and always raise invalid. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Any NaN operand makes the comparison unordered. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 and -0 compare equal */
            return float_relation_equal;
        } else {
            /* Differing signs: the negative operand is the smaller one. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison. */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: magnitude order flips when both are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7379
7380 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7381 {
7382 return floatx80_compare_internal(a, b, 0, status);
7383 }
7384
7385 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7386 float_status *status)
7387 {
7388 return floatx80_compare_internal(a, b, 1, status);
7389 }
7390
7391 static inline FloatRelation
7392 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7393 float_status *status)
7394 {
7395 bool aSign, bSign;
7396
7397 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7398 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7399 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7400 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7401 if (!is_quiet ||
7402 float128_is_signaling_nan(a, status) ||
7403 float128_is_signaling_nan(b, status)) {
7404 float_raise(float_flag_invalid, status);
7405 }
7406 return float_relation_unordered;
7407 }
7408 aSign = extractFloat128Sign( a );
7409 bSign = extractFloat128Sign( b );
7410 if ( aSign != bSign ) {
7411 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7412 /* zero case */
7413 return float_relation_equal;
7414 } else {
7415 return 1 - (2 * aSign);
7416 }
7417 } else {
7418 if (a.low == b.low && a.high == b.high) {
7419 return float_relation_equal;
7420 } else {
7421 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7422 }
7423 }
7424 }
7425
7426 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7427 {
7428 return float128_compare_internal(a, b, 0, status);
7429 }
7430
7431 FloatRelation float128_compare_quiet(float128 a, float128 b,
7432 float_status *status)
7433 {
7434 return float128_compare_internal(a, b, 1, status);
7435 }
7436
7437 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7438 {
7439 bool aSign;
7440 int32_t aExp;
7441 uint64_t aSig;
7442
7443 if (floatx80_invalid_encoding(a)) {
7444 float_raise(float_flag_invalid, status);
7445 return floatx80_default_nan(status);
7446 }
7447 aSig = extractFloatx80Frac( a );
7448 aExp = extractFloatx80Exp( a );
7449 aSign = extractFloatx80Sign( a );
7450
7451 if ( aExp == 0x7FFF ) {
7452 if ( aSig<<1 ) {
7453 return propagateFloatx80NaN(a, a, status);
7454 }
7455 return a;
7456 }
7457
7458 if (aExp == 0) {
7459 if (aSig == 0) {
7460 return a;
7461 }
7462 aExp++;
7463 }
7464
7465 if (n > 0x10000) {
7466 n = 0x10000;
7467 } else if (n < -0x10000) {
7468 n = -0x10000;
7469 }
7470
7471 aExp += n;
7472 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7473 aSign, aExp, aSig, 0, status);
7474 }
7475
7476 float128 float128_scalbn(float128 a, int n, float_status *status)
7477 {
7478 bool aSign;
7479 int32_t aExp;
7480 uint64_t aSig0, aSig1;
7481
7482 aSig1 = extractFloat128Frac1( a );
7483 aSig0 = extractFloat128Frac0( a );
7484 aExp = extractFloat128Exp( a );
7485 aSign = extractFloat128Sign( a );
7486 if ( aExp == 0x7FFF ) {
7487 if ( aSig0 | aSig1 ) {
7488 return propagateFloat128NaN(a, a, status);
7489 }
7490 return a;
7491 }
7492 if (aExp != 0) {
7493 aSig0 |= UINT64_C(0x0001000000000000);
7494 } else if (aSig0 == 0 && aSig1 == 0) {
7495 return a;
7496 } else {
7497 aExp++;
7498 }
7499
7500 if (n > 0x10000) {
7501 n = 0x10000;
7502 } else if (n < -0x10000) {
7503 n = -0x10000;
7504 }
7505
7506 aExp += n - 1;
7507 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7508 , status);
7509
7510 }
7511
7512 static void __attribute__((constructor)) softfloat_init(void)
7513 {
7514 union_float64 ua, ub, uc, ur;
7515
7516 if (QEMU_NO_HARDFLOAT) {
7517 return;
7518 }
7519 /*
7520 * Test that the host's FMA is not obviously broken. For example,
7521 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7522 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7523 */
7524 ua.s = 0x0020000000000001ULL;
7525 ub.s = 0x3ca0000000000000ULL;
7526 uc.s = 0x0020000000000000ULL;
7527 ur.h = fma(ua.h, ub.h, uc.h);
7528 if (ur.s != 0x0020000000000001ULL) {
7529 force_soft_fma = true;
7530 }
7531 }