/*
 * fpu/softfloat.c — QEMU softfloat implementation
 * (mirror of qemu.git; commit subject: "softfloat: Implement
 * float128_add/sub via parts")
 */
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Generate a helper that flushes a denormal input *a to a zero of the
 * same sign and raises float_flag_input_denormal.  The "nocheck" suffix
 * means float_status::flush_inputs_to_zero is NOT consulted here; the
 * GEN_INPUT_FLUSH1/2/3 wrappers below perform that check first.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/*
 * Generate a one-operand flush helper that honours
 * float_status::flush_inputs_to_zero before delegating to the
 * corresponding *_input_flush__nocheck helper.
 */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/* As GEN_INPUT_FLUSH1, but for two operands flushed together. */
#define GEN_INPUT_FLUSH2(name, soft_t)                           \
    static inline void name(soft_t *a, soft_t *b, float_status *s) \
    {                                                            \
        if (likely(!s->flush_inputs_to_zero)) {                  \
            return;                                              \
        }                                                        \
        soft_t ## _input_flush__nocheck(a, s);                   \
        soft_t ## _input_flush__nocheck(b, s);                   \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/* As GEN_INPUT_FLUSH1, but for three operands (e.g. fused multiply-add). */
#define GEN_INPUT_FLUSH3(name, soft_t)                                     \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                      \
        if (likely(!s->flush_inputs_to_zero)) {                            \
            return;                                                        \
        }                                                                  \
        soft_t ## _input_flush__nocheck(a, s);                             \
        soft_t ## _input_flush__nocheck(b, s);                             \
        soft_t ## _input_flush__nocheck(c, s);                             \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
185 /*
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
189 */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205
206 /*
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211 */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF 0
216 #endif
217
218 /*
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
221 * already set.
222 */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226 IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/* View the same bits either as the softfloat type or the host FP type. */
typedef union {
    float32 s;  /* softfloat representation */
    float h;    /* host representation */
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicate over two operands (e.g. "both are zero or normal"). */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat and host implementations of a 2-input operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic 2-operand float32 wrapper: attempt the host-FPU ("hard") path
 * and fall back to the softfloat implementation whenever the fast path's
 * preconditions do not hold or exact flag handling is required.
 *
 * @pre:  predicate on the (flushed) inputs that must pass to use the FPU.
 * @post: consulted when the hard result is tiny; if it returns true the
 *        operation is redone in softfloat so underflow flags are correct.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush first so the soft path below sees the flushed inputs too. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Host op overflowed to infinity: only the flag is missing. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result at/below the normal threshold: let softfloat decide. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/*
 * Generic 2-operand float64 wrapper; double-precision twin of
 * float32_gen2 above — see that function for the pre/post contract.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush first so the soft path below sees the flushed inputs too. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Host op overflowed to infinity: only the flag is missing. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result at/below the normal threshold: let softfloat decide. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
458 /*
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
461 */
462
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* not yet canonicalized */
    float_class_zero,
    float_class_normal,         /* includes canonicalized subnormals */
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471
/* Convert a FloatClass into a single bit, for class-mask tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Composite masks for common multi-class tests. */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
486 /* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan/snan being the last two FloatClass enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
502 /*
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
505 *
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
509 */
510
typedef struct {
    FloatClass cls;     /* classification; unclassified until canonicalized */
    bool sign;
    int32_t exp;        /* unbiased exponent */
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;
527
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;   /* most significant fraction word */
    uint64_t frac_lo;   /* least significant fraction word */
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
539
540 /* Structure holding all of the relevant parameters for a format.
541 * exp_size: the size of the exponent field
542 * exp_bias: the offset applied to the exponent field
543 * exp_max: the maximum normalised exponent
544 * frac_size: the size of the fraction field
545 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546 * The following are computed based the size of fraction
547 * frac_lsb: least significant bit of fraction
548 * frac_lsbm1: the bit below the least significant bit (for rounding)
549 * round_mask/roundeven_mask: masks used for rounding
550 * The following optional modifiers are available:
551 * arm_althp: handle ARM Alternative Half Precision
552 */
/* See the block comment above for the meaning of each field. */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;        /* one bit below the lsb, for rounding */
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;             /* ARM Alternative Half Precision */
} FloatFmt;
565
566 /* Expand fields based on the size of exponent and fraction */
/*
 * (-F - 1) & 63 computes (63 - F) modulo 64, so these expressions also
 * remain well-defined for F > 64 (float128), where the shift amounts
 * apply to the low fraction word.
 */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size = E,                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                 \
    .exp_max = (1 << E) - 1,                         \
    .frac_size = F,                                  \
    .frac_shift = (-F - 1) & 63,                     \
    .frac_lsb = 1ull << ((-F - 1) & 63),             \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),           \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
577
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same layout as float16, with the ARM alternative-HP flag set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* frac_size > 64: the fraction spans both frac_hi and frac_lo. */
static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
602
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606 const int f_size = fmt->frac_size;
607 const int e_size = fmt->exp_size;
608
609 *r = (FloatParts64) {
610 .cls = float_class_unclassified,
611 .sign = extract64(raw, f_size + e_size, 1),
612 .exp = extract64(raw, f_size, e_size),
613 .frac = extract64(raw, 0, f_size)
614 };
615 }
616
/* Per-format wrappers around unpack_raw64. */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
636
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639 const int f_size = float128_params.frac_size - 64;
640 const int e_size = float128_params.exp_size;
641
642 *p = (FloatParts128) {
643 .cls = float_class_unclassified,
644 .sign = extract64(f.high, f_size + e_size, 1),
645 .exp = extract64(f.high, f_size, e_size),
646 .frac_hi = extract64(f.high, 0, f_size),
647 .frac_lo = f.low,
648 };
649 }
650
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654 const int f_size = fmt->frac_size;
655 const int e_size = fmt->exp_size;
656 uint64_t ret;
657
658 ret = (uint64_t)p->sign << (f_size + e_size);
659 ret = deposit64(ret, f_size, e_size, p->exp);
660 ret = deposit64(ret, 0, f_size, p->frac);
661 return ret;
662 }
663
/* Per-format wrappers around pack_raw64. */
static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
683
684 static float128 float128_pack_raw(const FloatParts128 *p)
685 {
686 const int f_size = float128_params.frac_size - 64;
687 const int e_size = float128_params.exp_size;
688 uint64_t hi;
689
690 hi = (uint64_t)p->sign << (f_size + e_size);
691 hi = deposit64(hi, f_size, e_size, p->exp);
692 hi = deposit64(hi, 0, f_size, p->frac_hi);
693 return make_float128(hi, p->frac_lo);
694 }
695
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705
/*
 * Dispatch to the 64- or 128-bit implementation of an operation based
 * on the static type of the parts pointer P (via QEMU_GENERIC).
 * Implementations live in softfloat-parts.c.inc, instantiated below.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
771
772 /*
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
774 */
775
/*
 * Dispatch to frac64_* or frac128_* based on the static type of the
 * first parts pointer, mirroring PARTS_GENERIC_64_128 above.
 */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* r = a + b; returns the carry-out. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}
783
784 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
785 {
786 bool c = 0;
787 r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
788 r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
789 return c;
790 }
791
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
793
/* r = a + c for a 64-bit immediate c; returns the carry-out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* Add into the low word, then propagate the carry into the high word. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)
806
/* Set every fraction bit. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)
818
819 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
820 {
821 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
822 }
823
824 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
825 {
826 uint64_t ta = a->frac_hi, tb = b->frac_hi;
827 if (ta == tb) {
828 ta = a->frac_lo, tb = b->frac_lo;
829 if (ta == tb) {
830 return 0;
831 }
832 }
833 return ta < tb ? -1 : 1;
834 }
835
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
837
/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)
849
850 static bool frac64_eqz(FloatParts64 *a)
851 {
852 return a->frac == 0;
853 }
854
855 static bool frac128_eqz(FloatParts128 *a)
856 {
857 return (a->frac_hi | a->frac_lo) == 0;
858 }
859
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
861
/* Two's-complement negate of the fraction. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    /* 0 - a, with borrow propagated from the low to the high word. */
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128(neg, A)(A)
875
/*
 * Left-justify the fraction and return the number of bits shifted.
 * A zero fraction returns the full width (64 / 128).
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        if (shl) {
            /* Guarded: shl may be 0, and a 64-bit shift by 64 is UB. */
            int shr = 64 - shl;
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        /* High word empty: promote the low word, shifted into place. */
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128(normalize, A)(A)
906
/*
 * Shift the fraction left by c bits.
 * NOTE(review): the 64-bit variant assumes c < 64 (a shift by >= the
 * type width is UB) — presumably guaranteed by callers; confirm.
 */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

/* Shift the fraction right by c bits, discarding shifted-out bits. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/*
 * Shift right with "jamming": any bits shifted out are OR-ed into the
 * least significant bit, preserving inexactness for later rounding.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128(shrjam, A)(A, C)
942
/* r = a - b; returns the borrow-out. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    /* Borrow propagates from the low word into the high word. */
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128(sub, R)(R, A, B)
957
/*
 * Instantiate the generic parts-based implementations twice: once for
 * the 64-bit fraction layout (N=64) and once for the 128-bit layout
 * (N=128).  partsN(foo) expands to e.g. parts64_foo and FloatPartsN
 * to FloatParts64 inside the included templates.
 */
#define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN glue(FloatParts,N)

#define N 64

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#define N 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef partsN
#undef FloatPartsN
975
/*
 * Pack/unpack routines with a specific FloatFmt.
 *
 * The *_unpack_canonical helpers decompose a packed value into
 * FloatParts form via parts_canonicalize; the *_round_pack_canonical
 * helpers round (parts_uncanon) and repack.  The float16a variants
 * take an explicit format so the ARM alternative-half-precision
 * layout can be selected by the caller.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}
1062
/*
 * Addition and subtraction
 */

/* Shared soft-float add/sub: unpack both operands, combine via
 * parts_addsub, round and repack.  'subtract' selects a - b. */
static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

/* Soft-float fallback for float32 add/sub (hard path in float32_addsub). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

/* Soft-float fallback for float64 add/sub (hard path in float64_addsub). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}
1132
/* Host-FPU fast-path primitives for float32/float64 add and sub. */

static float hard_f32_add(float a, float b)
{
    float sum = a + b;
    return sum;
}

static float hard_f32_sub(float a, float b)
{
    float diff = a - b;
    return diff;
}

static double hard_f64_add(double a, double b)
{
    double sum = a + b;
    return sum;
}

static double hard_f64_sub(double a, double b)
{
    double diff = a - b;
    return diff;
}
1152
1153 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1154 {
1155 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1156 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1157 }
1158 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1159 }
1160
1161 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1162 {
1163 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1164 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1165 } else {
1166 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1167 }
1168 }
1169
/* Dispatch between host-FPU fast path and soft-float fallback. */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/* bfloat16 and float128 have no hardfloat path; always go via parts. */
static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}
1251
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 64x64->128 product of the fractions. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            /* Renormalize so the implicit bit is at the msb. */
            hi <<= 1;
        }
        /* Fold the discarded low half into the lsb as a sticky bit. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1302
/* Soft-float multiply: unpack, mul_floats, round and repack. */
float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}
1337
/* Host-FPU fast-path primitives for float32/float64 multiply. */

static float hard_f32_mul(float a, float b)
{
    float prod = a * b;
    return prod;
}

static double hard_f64_mul(double a, double b)
{
    double prod = a * b;
    return prod;
}
1347
/* Multiply with host-FPU fast path and soft-float fallback. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1377
1378 /*
1379 * Returns the result of multiplying the floating-point values `a' and
1380 * `b' then adding 'c', with no intermediate rounding step after the
1381 * multiplication. The operation is performed according to the
1382 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1383 * The flags argument allows the caller to select negation of the
1384 * addend, the intermediate product, or the final result. (The
1385 * difference between this and having the caller do a separate
1386 * negation is that negating externally will flip the sign bit on
1387 * NaNs.)
1388 */
1389
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* Inf * 0 (with no NaN input) is invalid; return default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b without computing it. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Inf addend: Inf - Inf is invalid, otherwise the Inf wins. */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: result is the addend; 0 + 0 picks the sign per
     * the rounding mode when the signs disagree. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product so the add below loses no precision. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger,
                 * flipping the result sign when c dominates. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            /* Exact cancellation: signed zero per rounding mode. */
            if (hi == 0 && lo == 0) {
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the low word into the lsb as a sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1563
/* Soft-float fused multiply-add: a*b + c in one rounding step. */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}
1604
/* NOTE(review): appears to be a test hook set elsewhere; when true,
 * float32_muladd/float64_muladd always take the soft-float path. */
static bool force_soft_fma;
1606
/*
 * Fused multiply-add with a host-FPU fast path via fmaf(); falls back
 * to soft_f32_muladd whenever the status flags, operand classes, or
 * result range make the hard result untrustworthy.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build a correctly-signed zero product, then add the addend. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Possible underflow; redo with the original inputs so the
             * soft path sees them un-negated. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1677
/*
 * Fused multiply-add with a host-FPU fast path via fma(); falls back
 * to soft_f64_muladd whenever the status flags, operand classes, or
 * result range make the hard result untrustworthy.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build a correctly-signed zero product, then add the addend. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): FLT_MIN here (not DBL_MIN) routes any
             * result with magnitude <= FLT_MIN to the soft path.
             * That is conservative (correct but more fallbacks than
             * strictly needed for double); confirm it is intentional. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1748
/*
 * Returns the result of multiplying the bfloat16 values `a'
 * and `b' then adding 'c', with no intermediate rounding step after the
 * multiplication.
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1767
/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward. If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}
1839
/* Soft-float divide: unpack, div_floats, round and repack. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}
1874
/* Host-FPU fast-path primitives for float32/float64 divide. */

static float hard_f32_div(float a, float b)
{
    float quot = a / b;
    return quot;
}

static double hard_f64_div(double a, double b)
{
    double quot = a / b;
    return quot;
}
1884
/* Pre-check for the hardfloat divide path: dividend must be zero or
 * normal, and the divisor strictly normal. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-check: reject (return false) when the quotient is zero;
 * 'b' is unused but required by the f32_check_fn signature. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}
1918
/* Divide with host-FPU fast path and soft-float fallback. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1948
/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    /* ARM alternative half-precision has neither NaN nor Inf. */
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}
1994
/* Conversion wrappers: unpack in the source format, fix up NaN/Inf via
 * float_to_float, then round and pack in the destination format.
 * 'ieee' selects IEEE half-precision vs the ARM alternative layout. */

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormal, Inf or NaN: take the full soft path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}
2105
2106 /*
2107 * Rounds the floating-point value `a' to an integer, and returns the
2108 * result as a floating-point value. The operation is performed
2109 * according to the IEC/IEEE Standard for Binary Floating-Point
2110 * Arithmetic.
2111 */
2112
/*
 * Round the decomposed value 'a' to an integer-valued float, after
 * multiplying it by 2**scale, using rounding mode 'rmode'.  Raises
 * inexact when bits are discarded.  NaNs are propagated; zero and
 * infinity are returned unchanged.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        /* NaN input: return the appropriate (possibly quietened) NaN. */
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp scale so adding it to the exponent cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: |a| < 1, so the result is 0 or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* Round up only when strictly greater than one half. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* One half or above rounds away from zero. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                /* Toward +inf: positive fractions become 1. */
                one = !a.sign;
                break;
            case float_round_down:
                /* Toward -inf: negative fractions become -1. */
                one = a.sign;
                break;
            case float_round_to_odd:
                /* 1 is odd, 0 is even: any non-zero fraction gives 1. */
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb is the fraction bit with integer weight 1. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            /* Mask of the LSB plus everything below it. */
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            /* Mask of the discarded (sub-integer) bits only. */
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* An exact tie with an even LSB gets no increment. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Force the LSB to 1 unless it already is. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the fraction: renormalize by one bit. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Discard the now-rounded fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2214
2215 float16 float16_round_to_int(float16 a, float_status *s)
2216 {
2217 FloatParts64 pa, pr;
2218
2219 float16_unpack_canonical(&pa, a, s);
2220 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2221 return float16_round_pack_canonical(&pr, s);
2222 }
2223
2224 float32 float32_round_to_int(float32 a, float_status *s)
2225 {
2226 FloatParts64 pa, pr;
2227
2228 float32_unpack_canonical(&pa, a, s);
2229 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2230 return float32_round_pack_canonical(&pr, s);
2231 }
2232
2233 float64 float64_round_to_int(float64 a, float_status *s)
2234 {
2235 FloatParts64 pa, pr;
2236
2237 float64_unpack_canonical(&pa, a, s);
2238 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2239 return float64_round_pack_canonical(&pr, s);
2240 }
2241
2242 /*
2243 * Rounds the bfloat16 value `a' to an integer, and returns the
2244 * result as a bfloat16 value.
2245 */
2246
2247 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2248 {
2249 FloatParts64 pa, pr;
2250
2251 bfloat16_unpack_canonical(&pa, a, s);
2252 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2253 return bfloat16_round_pack_canonical(&pr, s);
2254 }
2255
2256 /*
2257 * Returns the result of converting the floating-point value `a' to
2258 * the two's complement integer format. The conversion is performed
2259 * according to the IEC/IEEE Standard for Binary Floating-Point
2260 * Arithmetic---which means in particular that the conversion is
2261 * rounded according to the current rounding mode. If `a' is a NaN,
2262 * the largest positive integer is returned. Otherwise, if the
2263 * conversion overflows, the largest integer with the same sign as `a'
2264 * is returned.
2265 */
2266
/*
 * Round 'in' (scaled by 2**scale) with mode 'rmode' and saturate into
 * [min, max].  NaN and out-of-range inputs raise invalid and return the
 * saturated bound; any flags raised during rounding are discarded in
 * that case (orig_flags is restored with invalid OR-ed in).
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Snapshot flags so an invalid result can suppress e.g. inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the fraction down to its integer value. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Too large for 64 bits; force the overflow path below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* Compare magnitudes in unsigned space: -min cannot
             * overflow there even when min == INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2310
/*
 * float16 -> signed integer conversions with scalbn semantics: the
 * input is multiplied by 2**scale before rounding.  Saturation and
 * exception flags are handled by round_to_int_and_pack().
 */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2346
/* float32 -> signed integer scalbn conversions; see the float16 set. */
int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2373
/* float64 -> signed integer scalbn conversions; see the float16 set. */
int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2400
/*
 * Convenience wrappers: convert using the rounding mode currently
 * selected in float_status, with no pre-scaling.
 */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
2450
/*
 * Convenience wrappers that always truncate (round toward zero),
 * regardless of the rounding mode in float_status.
 */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2495
2496 /*
2497 * Returns the result of converting the floating-point value `a' to
2498 * the two's complement integer format.
2499 */
2500
/* bfloat16 -> signed integer scalbn conversions; see the float16 set. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
2527
/* bfloat16 wrappers: current rounding mode, and forced truncation. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2557
2558 /*
2559 * Returns the result of converting the floating-point value `a' to
2560 * the unsigned integer format. The conversion is performed according
2561 * to the IEC/IEEE Standard for Binary Floating-Point
2562 * Arithmetic---which means in particular that the conversion is
2563 * rounded according to the current rounding mode. If `a' is a NaN,
2564 * the largest unsigned integer is returned. Otherwise, if the
2565 * conversion overflows, the largest unsigned integer is returned. If
2566 * the 'a' is negative, the result is rounded and zero is returned;
2567 * values that do not round to zero will raise the inexact exception
2568 * flag.
2569 */
2570
/*
 * Round 'in' (scaled by 2**scale) with mode 'rmode' and saturate into
 * [0, max].  NaN, negative and overflowing inputs raise invalid and
 * return the saturated bound; flags raised while rounding an invalid
 * input are discarded (orig_flags restored with invalid OR-ed in).
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Snapshot flags so an invalid result can suppress e.g. inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative (non-zero after rounding) is invalid; return 0. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the fraction down to its integer value. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2615
/*
 * float -> unsigned integer conversions with scalbn semantics: the
 * input is multiplied by 2**scale before rounding.  Saturation and
 * exception flags are handled by round_to_uint_and_pack().
 */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2705
/*
 * Convenience wrappers: convert using the rounding mode currently
 * selected in float_status, with no pre-scaling.
 */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
2755
/*
 * Convenience wrappers that always truncate (round toward zero),
 * regardless of the rounding mode in float_status.
 */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2800
2801 /*
2802 * Returns the result of converting the bfloat16 value `a' to
2803 * the unsigned integer format.
2804 */
2805
/* bfloat16 -> unsigned integer scalbn conversions; see the float16 set. */
uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2832
/* bfloat16 wrappers: current rounding mode, and forced truncation. */
uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2862
2863 /*
2864 * Integer to float conversions
2865 *
2866 * Returns the result of converting the two's complement integer `a'
2867 * to the floating-point format. The conversion is performed according
2868 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2869 */
2870
2871 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2872 {
2873 FloatParts64 r = { .sign = false };
2874
2875 if (a == 0) {
2876 r.cls = float_class_zero;
2877 } else {
2878 uint64_t f = a;
2879 int shift;
2880
2881 r.cls = float_class_normal;
2882 if (a < 0) {
2883 f = -f;
2884 r.sign = true;
2885 }
2886 shift = clz64(f);
2887 scale = MIN(MAX(scale, -0x10000), 0x10000);
2888
2889 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2890 r.frac = f << shift;
2891 }
2892
2893 return r;
2894 }
2895
/*
 * Signed integer -> float16.  The narrower widths simply sign-extend
 * into the int64 path; scalbn variants multiply by 2**scale first.
 */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
2931
/* Signed integer -> float32; narrower widths funnel into int64. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
2962
/* Signed integer -> float64; narrower widths funnel into int64. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
2993
2994 /*
2995 * Returns the result of converting the two's complement integer `a'
2996 * to the bfloat16 format.
2997 */
2998
/* Signed integer -> bfloat16; narrower widths funnel into int64. */
bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
3029
3030 /*
3031 * Unsigned Integer to float conversions
3032 *
3033 * Returns the result of converting the unsigned integer `a' to the
3034 * floating-point format. The conversion is performed according to the
3035 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3036 */
3037
3038 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3039 {
3040 FloatParts64 r = { .sign = false };
3041 int shift;
3042
3043 if (a == 0) {
3044 r.cls = float_class_zero;
3045 } else {
3046 scale = MIN(MAX(scale, -0x10000), 0x10000);
3047 shift = clz64(a);
3048 r.cls = float_class_normal;
3049 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3050 r.frac = a << shift;
3051 }
3052
3053 return r;
3054 }
3055
/*
 * Unsigned integer -> float16.  The narrower widths zero-extend into
 * the uint64 path; scalbn variants multiply by 2**scale first.
 */
float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
3091
/* Unsigned integer -> float32; narrower widths funnel into uint64. */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
3122
/* Unsigned integer -> float64; narrower widths funnel into uint64. */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3153
3154 /*
3155 * Returns the result of converting the unsigned integer `a' to the
3156 * bfloat16 format.
3157 */
3158
/* Unsigned integer -> bfloat16; narrower widths funnel into uint64. */
bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3189
3190 /* Float Min/Max */
3191 /* min() and max() functions. These can't be implemented as
3192 * 'compare and pick one input' because that would mishandle
3193 * NaNs and +0 vs -0.
3194 *
3195 * minnum() and maxnum() functions. These are similar to the min()
3196 * and max() functions but if one of the arguments is a QNaN and
3197 * the other is numerical then the numerical argument is returned.
3198 * SNaNs will get quietened before being returned.
3199 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3200 * and maxNum() operations. min() and max() are the typical min/max
3201 * semantics provided by many CPUs which predate that specification.
3202 *
3203 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3205 */
/*
 * Common implementation behind the min/max/minnum/maxnum/minnummag/
 * maxnummag entry points (see the MINMAX/BF16_MINMAX expansions below).
 *   ismin: pick the smaller operand rather than the larger.
 *   ieee:  IEEE 754-2008 minNum/maxNum NaN handling -- a quiet NaN paired
 *          with a numerical operand yields the numerical operand.
 *   ismag: compare magnitudes first; fall back to the ordinary comparison
 *          when the magnitudes are equal.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both NaN, or legacy (non-ieee) semantics: pick a NaN result. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /*
         * Map each class onto a comparable exponent: infinities order
         * above every finite value (INT_MAX) and zeroes below (INT_MIN),
         * so the exponent/fraction compare below handles all classes.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare ignores the signs unless |a| == |b|. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* Both negative: the magnitude ordering is reversed. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the negative one is smaller (covers -0 vs +0). */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3276
3277 #define MINMAX(sz, name, ismin, isiee, ismag) \
3278 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3279 float_status *s) \
3280 { \
3281 FloatParts64 pa, pb, pr; \
3282 float ## sz ## _unpack_canonical(&pa, a, s); \
3283 float ## sz ## _unpack_canonical(&pb, b, s); \
3284 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3285 return float ## sz ## _round_pack_canonical(&pr, s); \
3286 }
3287
3288 MINMAX(16, min, true, false, false)
3289 MINMAX(16, minnum, true, true, false)
3290 MINMAX(16, minnummag, true, true, true)
3291 MINMAX(16, max, false, false, false)
3292 MINMAX(16, maxnum, false, true, false)
3293 MINMAX(16, maxnummag, false, true, true)
3294
3295 MINMAX(32, min, true, false, false)
3296 MINMAX(32, minnum, true, true, false)
3297 MINMAX(32, minnummag, true, true, true)
3298 MINMAX(32, max, false, false, false)
3299 MINMAX(32, maxnum, false, true, false)
3300 MINMAX(32, maxnummag, false, true, true)
3301
3302 MINMAX(64, min, true, false, false)
3303 MINMAX(64, minnum, true, true, false)
3304 MINMAX(64, minnummag, true, true, true)
3305 MINMAX(64, max, false, false, false)
3306 MINMAX(64, maxnum, false, true, false)
3307 MINMAX(64, maxnummag, false, true, true)
3308
3309 #undef MINMAX
3310
3311 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3312 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3313 { \
3314 FloatParts64 pa, pb, pr; \
3315 bfloat16_unpack_canonical(&pa, a, s); \
3316 bfloat16_unpack_canonical(&pb, b, s); \
3317 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3318 return bfloat16_round_pack_canonical(&pr, s); \
3319 }
3320
3321 BF16_MINMAX(min, true, false, false)
3322 BF16_MINMAX(minnum, true, true, false)
3323 BF16_MINMAX(minnummag, true, true, true)
3324 BF16_MINMAX(max, false, false, false)
3325 BF16_MINMAX(maxnum, false, true, false)
3326 BF16_MINMAX(maxnummag, false, true, true)
3327
3328 #undef BF16_MINMAX
3329
3330 /* Floating point compare */
/*
 * Compare canonicalized operands a and b.
 * Any NaN operand makes the result unordered; the invalid flag is raised
 * for a signalling NaN always, and for a quiet NaN only when is_quiet is
 * false (i.e. for the signalling-compare entry points).
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign; otherwise only the other
     * operand's sign matters. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Finite, non-zero, differing signs: the negative operand is smaller. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponent then fraction, flipping the result
     * for negative values. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3387
/*
 * Instantiate the softfloat compare helpers.  `attr' is the function
 * attribute applied to the generated helper (QEMU_FLATTEN for float16,
 * QEMU_SOFTFLOAT_ATTR for float32/64, whose hot paths are the hardfloat
 * wrappers below).
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3403
3404 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3405 {
3406 return soft_f16_compare(a, b, false, s);
3407 }
3408
3409 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3410 {
3411 return soft_f16_compare(a, b, true, s);
3412 }
3413
/*
 * Hardfloat fast path for float32 compare.  The host's IEEE comparison
 * macros classify the ordering directly; only the unordered case (a NaN
 * is involved) falls through to the soft implementation, which also
 * raises the invalid flag as appropriate.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3442
3443 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3444 {
3445 return f32_compare(a, b, false, s);
3446 }
3447
3448 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3449 {
3450 return f32_compare(a, b, true, s);
3451 }
3452
/*
 * Hardfloat fast path for float64 compare.  The host's IEEE comparison
 * macros classify the ordering directly; only the unordered case (a NaN
 * is involved) falls through to the soft implementation, which also
 * raises the invalid flag as appropriate.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3481
3482 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3483 {
3484 return f64_compare(a, b, false, s);
3485 }
3486
3487 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3488 {
3489 return f64_compare(a, b, true, s);
3490 }
3491
3492 static FloatRelation QEMU_FLATTEN
3493 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3494 {
3495 FloatParts64 pa, pb;
3496
3497 bfloat16_unpack_canonical(&pa, a, s);
3498 bfloat16_unpack_canonical(&pb, b, s);
3499 return compare_floats(pa, pb, is_quiet, s);
3500 }
3501
3502 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3503 {
3504 return soft_bf16_compare(a, b, false, s);
3505 }
3506
3507 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3508 {
3509 return soft_bf16_compare(a, b, true, s);
3510 }
3511
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    /* NaN in, NaN out; parts_return_nan deals with any SNaN handling. */
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    /* Zero and infinity are unchanged; only normals get the exponent bump. */
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
3529
3530 float16 float16_scalbn(float16 a, int n, float_status *status)
3531 {
3532 FloatParts64 pa, pr;
3533
3534 float16_unpack_canonical(&pa, a, status);
3535 pr = scalbn_decomposed(pa, n, status);
3536 return float16_round_pack_canonical(&pr, status);
3537 }
3538
3539 float32 float32_scalbn(float32 a, int n, float_status *status)
3540 {
3541 FloatParts64 pa, pr;
3542
3543 float32_unpack_canonical(&pa, a, status);
3544 pr = scalbn_decomposed(pa, n, status);
3545 return float32_round_pack_canonical(&pr, status);
3546 }
3547
3548 float64 float64_scalbn(float64 a, int n, float_status *status)
3549 {
3550 FloatParts64 pa, pr;
3551
3552 float64_unpack_canonical(&pa, a, status);
3553 pr = scalbn_decomposed(pa, n, status);
3554 return float64_round_pack_canonical(&pr, status);
3555 }
3556
3557 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3558 {
3559 FloatParts64 pa, pr;
3560
3561 bfloat16_unpack_canonical(&pa, a, status);
3562 pr = scalbn_decomposed(pa, n, status);
3563 return bfloat16_round_pack_canonical(&pr, status);
3564 }
3565
3566 /*
3567 * Square Root
3568 *
3569 * The old softfloat code did an approximation step before zeroing in
3570 * on the final result. However for simpleness we just compute the
3571 * square root by iterating down from the implicit bit to enough extra
3572 * bits to ensure we get a correctly rounded result.
3573 *
3574 * This does mean however the calculation is slower than before,
3575 * especially for 64 bit floats.
3576 */
3577
/*
 * Core square-root on decomposed parts.  `p' supplies the format's
 * frac_shift so we know how many result bits must be exact before
 * rounding.  Special cases: NaN propagates, sqrt(+-0) = +-0,
 * sqrt(negative) raises invalid and returns the default NaN,
 * sqrt(+inf) = +inf.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        /* NaN in, NaN out; parts_return_nan deals with any SNaN handling. */
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* Negative non-zero input: invalid operation, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        /* Classic restoring square root: try setting bit `bit' of the
         * root and keep it if the trial subtraction does not go negative.
         */
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3637
3638 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3639 {
3640 FloatParts64 pa, pr;
3641
3642 float16_unpack_canonical(&pa, a, status);
3643 pr = sqrt_float(pa, status, &float16_params);
3644 return float16_round_pack_canonical(&pr, status);
3645 }
3646
3647 static float32 QEMU_SOFTFLOAT_ATTR
3648 soft_f32_sqrt(float32 a, float_status *status)
3649 {
3650 FloatParts64 pa, pr;
3651
3652 float32_unpack_canonical(&pa, a, status);
3653 pr = sqrt_float(pa, status, &float32_params);
3654 return float32_round_pack_canonical(&pr, status);
3655 }
3656
3657 static float64 QEMU_SOFTFLOAT_ATTR
3658 soft_f64_sqrt(float64 a, float_status *status)
3659 {
3660 FloatParts64 pa, pr;
3661
3662 float64_unpack_canonical(&pa, a, status);
3663 pr = sqrt_float(pa, status, &float64_params);
3664 return float64_round_pack_canonical(&pr, status);
3665 }
3666
/*
 * float32 square root, hardfloat fast path.  The host sqrtf() is used
 * only when the status flags allow it and the input is a non-negative
 * zero or normal; negatives, NaNs, infinities and denormals go to the
 * soft implementation, which raises the appropriate flags.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3693
/*
 * float64 square root, hardfloat fast path.  The host sqrt() is used
 * only when the status flags allow it and the input is a non-negative
 * zero or normal; negatives, NaNs, infinities and denormals go to the
 * soft implementation, which raises the appropriate flags.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3720
3721 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3722 {
3723 FloatParts64 pa, pr;
3724
3725 bfloat16_unpack_canonical(&pa, a, status);
3726 pr = sqrt_float(pa, status, &bfloat16_params);
3727 return bfloat16_round_pack_canonical(&pr, status);
3728 }
3729
3730 /*----------------------------------------------------------------------------
3731 | The pattern for a default generated NaN.
3732 *----------------------------------------------------------------------------*/
3733
3734 float16 float16_default_nan(float_status *status)
3735 {
3736 FloatParts64 p;
3737
3738 parts_default_nan(&p, status);
3739 p.frac >>= float16_params.frac_shift;
3740 return float16_pack_raw(&p);
3741 }
3742
3743 float32 float32_default_nan(float_status *status)
3744 {
3745 FloatParts64 p;
3746
3747 parts_default_nan(&p, status);
3748 p.frac >>= float32_params.frac_shift;
3749 return float32_pack_raw(&p);
3750 }
3751
3752 float64 float64_default_nan(float_status *status)
3753 {
3754 FloatParts64 p;
3755
3756 parts_default_nan(&p, status);
3757 p.frac >>= float64_params.frac_shift;
3758 return float64_pack_raw(&p);
3759 }
3760
3761 float128 float128_default_nan(float_status *status)
3762 {
3763 FloatParts128 p;
3764
3765 parts_default_nan(&p, status);
3766 frac_shr(&p, float128_params.frac_shift);
3767 return float128_pack_raw(&p);
3768 }
3769
3770 bfloat16 bfloat16_default_nan(float_status *status)
3771 {
3772 FloatParts64 p;
3773
3774 parts_default_nan(&p, status);
3775 p.frac >>= bfloat16_params.frac_shift;
3776 return bfloat16_pack_raw(&p);
3777 }
3778
3779 /*----------------------------------------------------------------------------
3780 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3781 *----------------------------------------------------------------------------*/
3782
3783 float16 float16_silence_nan(float16 a, float_status *status)
3784 {
3785 FloatParts64 p;
3786
3787 float16_unpack_raw(&p, a);
3788 p.frac <<= float16_params.frac_shift;
3789 parts_silence_nan(&p, status);
3790 p.frac >>= float16_params.frac_shift;
3791 return float16_pack_raw(&p);
3792 }
3793
3794 float32 float32_silence_nan(float32 a, float_status *status)
3795 {
3796 FloatParts64 p;
3797
3798 float32_unpack_raw(&p, a);
3799 p.frac <<= float32_params.frac_shift;
3800 parts_silence_nan(&p, status);
3801 p.frac >>= float32_params.frac_shift;
3802 return float32_pack_raw(&p);
3803 }
3804
3805 float64 float64_silence_nan(float64 a, float_status *status)
3806 {
3807 FloatParts64 p;
3808
3809 float64_unpack_raw(&p, a);
3810 p.frac <<= float64_params.frac_shift;
3811 parts_silence_nan(&p, status);
3812 p.frac >>= float64_params.frac_shift;
3813 return float64_pack_raw(&p);
3814 }
3815
3816 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3817 {
3818 FloatParts64 p;
3819
3820 bfloat16_unpack_raw(&p, a);
3821 p.frac <<= bfloat16_params.frac_shift;
3822 parts_silence_nan(&p, status);
3823 p.frac >>= bfloat16_params.frac_shift;
3824 return bfloat16_pack_raw(&p);
3825 }
3826
3827 float128 float128_silence_nan(float128 a, float_status *status)
3828 {
3829 FloatParts128 p;
3830
3831 float128_unpack_raw(&p, a);
3832 frac_shl(&p, float128_params.frac_shift);
3833 parts_silence_nan(&p, status);
3834 frac_shr(&p, float128_params.frac_shift);
3835 return float128_pack_raw(&p);
3836 }
3837
3838 /*----------------------------------------------------------------------------
3839 | If `a' is denormal and we are in flush-to-zero mode then set the
3840 | input-denormal exception and return zero. Otherwise just return the value.
3841 *----------------------------------------------------------------------------*/
3842
3843 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3844 {
3845 if (p.exp == 0 && p.frac != 0) {
3846 float_raise(float_flag_input_denormal, status);
3847 return true;
3848 }
3849
3850 return false;
3851 }
3852
3853 float16 float16_squash_input_denormal(float16 a, float_status *status)
3854 {
3855 if (status->flush_inputs_to_zero) {
3856 FloatParts64 p;
3857
3858 float16_unpack_raw(&p, a);
3859 if (parts_squash_denormal(p, status)) {
3860 return float16_set_sign(float16_zero, p.sign);
3861 }
3862 }
3863 return a;
3864 }
3865
3866 float32 float32_squash_input_denormal(float32 a, float_status *status)
3867 {
3868 if (status->flush_inputs_to_zero) {
3869 FloatParts64 p;
3870
3871 float32_unpack_raw(&p, a);
3872 if (parts_squash_denormal(p, status)) {
3873 return float32_set_sign(float32_zero, p.sign);
3874 }
3875 }
3876 return a;
3877 }
3878
3879 float64 float64_squash_input_denormal(float64 a, float_status *status)
3880 {
3881 if (status->flush_inputs_to_zero) {
3882 FloatParts64 p;
3883
3884 float64_unpack_raw(&p, a);
3885 if (parts_squash_denormal(p, status)) {
3886 return float64_set_sign(float64_zero, p.sign);
3887 }
3888 }
3889 return a;
3890 }
3891
3892 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3893 {
3894 if (status->flush_inputs_to_zero) {
3895 FloatParts64 p;
3896
3897 bfloat16_unpack_raw(&p, a);
3898 if (parts_squash_denormal(p, status)) {
3899 return bfloat16_set_sign(bfloat16_zero, p.sign);
3900 }
3901 }
3902 return a;
3903 }
3904
3905 /*----------------------------------------------------------------------------
3906 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3907 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3908 | input. If `zSign' is 1, the input is negated before being converted to an
3909 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3910 | is simply rounded to an integer, with the inexact exception raised if the
3911 | input cannot be represented exactly as an integer. However, if the fixed-
3912 | point input is too large, the invalid exception is raised and the largest
3913 | positive or negative integer is returned.
3914 *----------------------------------------------------------------------------*/
3915
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Bits 0-6 of absZ are the fraction; choose the pre-shift increment
     * that implements the selected rounding mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only if the result's LSB (bit 7) would be clear. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact halfway case under nearest-even: clear the LSB. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range (high bits set, or sign of z disagrees with
     * the requested sign): invalid, return the saturated value. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3963
3964 /*----------------------------------------------------------------------------
3965 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3966 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3967 | and returns the properly rounded 64-bit integer corresponding to the input.
3968 | If `zSign' is 1, the input is negated before being converted to an integer.
3969 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3970 | the inexact exception raised if the input cannot be represented exactly as
3971 | an integer. However, if the fixed-point input is too large, the invalid
3972 | exception is raised and the largest positive or negative integer is
3973 | returned.
3974 *----------------------------------------------------------------------------*/
3975
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the fraction below the binary point; decide whether
     * the integer part absZ0 must be incremented. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Increment when the fraction is >= 1/2 (top bit of absZ1 set). */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Wrap-around means the value exceeds 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact halfway case under nearest-even: clear the LSB. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign of z disagreeing with the requested sign means overflow. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4025
4026 /*----------------------------------------------------------------------------
4027 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4028 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4029 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4030 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4031 | with the inexact exception raised if the input cannot be represented exactly
4032 | as an integer. However, if the fixed-point input is too large, the invalid
4033 | exception is raised and the largest unsigned integer is returned.
4034 *----------------------------------------------------------------------------*/
4035
/* NOTE(review): the return type is int64_t although the value produced
 * (and documented above) is an unsigned 64-bit integer; callers appear to
 * rely on the bit pattern only -- confirm before changing the signature. */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the fraction below the binary point; decide whether
     * the integer part absZ0 must be incremented. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Increment when the fraction is >= 1/2 (top bit of absZ1 set). */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrap-around means the value exceeds 64 bits: saturate. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact halfway case under nearest-even: clear the LSB. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4085
4086 /*----------------------------------------------------------------------------
4087 | Normalizes the subnormal single-precision floating-point value represented
4088 | by the denormalized significand `aSig'. The normalized exponent and
4089 | significand are stored at the locations pointed to by `zExpPtr' and
4090 | `zSigPtr', respectively.
4091 *----------------------------------------------------------------------------*/
4092
/*
 * Normalize a non-zero subnormal float32 significand `aSig', storing
 * the normalized significand and the matching exponent through the
 * output pointers.
 */
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4103
4104 /*----------------------------------------------------------------------------
4105 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4106 | and significand `zSig', and returns the proper single-precision floating-
4107 | point value corresponding to the abstract input. Ordinarily, the abstract
4108 | value is simply rounded and packed into the single-precision format, with
4109 | the inexact exception raised if the abstract input cannot be represented
4110 | exactly. However, if the abstract value is too large, the overflow and
4111 | inexact exceptions are raised and an infinity or maximal finite value is
4112 | returned. If the abstract value is too small, the input value is rounded to
4113 | a subnormal number, and the underflow and inexact exceptions are raised if
4114 | the abstract input cannot be represented exactly as a subnormal single-
4115 | precision floating-point number.
4116 | The input significand `zSig' has its binary point between bits 30
4117 | and 29, which is 7 bits to the left of the usual location. This shifted
4118 | significand must be normalized or smaller. If `zSig' is not normalized,
4119 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4120 | and it must not require rounding. In the usual case that `zSig' is
4121 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4122 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4123 | Binary Floating-Point Arithmetic.
4124 *----------------------------------------------------------------------------*/
4125
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Bits 0-6 of zSig are below the rounding point; choose the
     * increment implementing the selected rounding mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned compare catches both overflow (zExp too large) and,
     * via wrap-around, a negative zExp (underflow candidates). */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and the truncating modes yield the
             * maximal finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess may be detected before or after rounding,
             * depending on the status setting. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact halfway case under nearest-even: clear the LSB. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4202
4203 /*----------------------------------------------------------------------------
4204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4205 | and significand `zSig', and returns the proper single-precision floating-
4206 | point value corresponding to the abstract input. This routine is just like
4207 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4208 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4209 | floating-point exponent.
4210 *----------------------------------------------------------------------------*/
4211
4212 static float32
4213 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4214 float_status *status)
4215 {
4216 int8_t shiftCount;
4217
4218 shiftCount = clz32(zSig) - 1;
4219 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4220 status);
4221
4222 }
4223
4224 /*----------------------------------------------------------------------------
4225 | Normalizes the subnormal double-precision floating-point value represented
4226 | by the denormalized significand `aSig'. The normalized exponent and
4227 | significand are stored at the locations pointed to by `zExpPtr' and
4228 | `zSigPtr', respectively.
4229 *----------------------------------------------------------------------------*/
4230
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift the denormal significand up until the leading one sits in
     * bit 52 (the double-precision integer-bit position: 64 - 11 - 1),
     * and report the exponent that compensates for the shift. */
    int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4241
4242 /*----------------------------------------------------------------------------
4243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4244 | double-precision floating-point value, returning the result. After being
4245 | shifted into the proper positions, the three fields are simply added
4246 | together to form the result. This means that any integer portion of `zSig'
4247 | will be added into the exponent. Since a properly normalized significand
4248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4249 | than the desired result exponent whenever `zSig' is a complete, normalized
4250 | significand.
4251 *----------------------------------------------------------------------------*/
4252
4253 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4254 {
4255
4256 return make_float64(
4257 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4258
4259 }
4260
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and significand `zSig', and returns the proper double-precision floating-
4264 | point value corresponding to the abstract input. Ordinarily, the abstract
4265 | value is simply rounded and packed into the double-precision format, with
4266 | the inexact exception raised if the abstract input cannot be represented
4267 | exactly. However, if the abstract value is too large, the overflow and
4268 | inexact exceptions are raised and an infinity or maximal finite value is
4269 | returned. If the abstract value is too small, the input value is rounded to
4270 | a subnormal number, and the underflow and inexact exceptions are raised if
4271 | the abstract input cannot be represented exactly as a subnormal double-
4272 | precision floating-point number.
4273 | The input significand `zSig' has its binary point between bits 62
4274 | and 61, which is 10 bits to the left of the usual location. This shifted
4275 | significand must be normalized or smaller. If `zSig' is not normalized,
4276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4277 | and it must not require rounding. In the usual case that `zSig' is
4278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4279 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4280 | Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4282
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits; /* increment and guard bits below bit 10 */
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the 10 guard bits: 0x200 is half an ulp
     * (ties modes), 0x3ff rounds away from zero for the appropriate
     * directed mode, 0 truncates.  Round-to-odd increments only when
     * the result's LSB (bit 10) is currently even. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned cast folds negative zExp into the large range, so a
     * single compare catches both overflow and underflow candidates. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow.  Round-to-odd and truncating increments pin the
             * result at the largest finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Significand -1 (all ones) added under exponent 0x7FF
             * wraps to the maximal finite encoding; 0 gives infinity. */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: either detected before rounding (per status), or
             * the value cannot round up into the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Apply the increment and drop the guard bits; a carry out of the
     * significand propagates into the exponent inside packFloat64. */
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact halfway case under nearest-even: force the LSB to zero. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4358
4359 /*----------------------------------------------------------------------------
4360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4361 | and significand `zSig', and returns the proper double-precision floating-
4362 | point value corresponding to the abstract input. This routine is just like
4363 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4364 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4365 | floating-point exponent.
4366 *----------------------------------------------------------------------------*/
4367
4368 static float64
4369 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4370 float_status *status)
4371 {
4372 int8_t shiftCount;
4373
4374 shiftCount = clz64(zSig) - 1;
4375 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4376 status);
4377
4378 }
4379
4380 /*----------------------------------------------------------------------------
4381 | Normalizes the subnormal extended double-precision floating-point value
4382 | represented by the denormalized significand `aSig'. The normalized exponent
4383 | and significand are stored at the locations pointed to by `zExpPtr' and
4384 | `zSigPtr', respectively.
4385 *----------------------------------------------------------------------------*/
4386
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* The x80 format keeps an explicit integer bit, so normalization
     * places the leading one in bit 63 exactly; the exponent is reduced
     * by the amount shifted. */
    int8_t shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4396
4397 /*----------------------------------------------------------------------------
4398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4399 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4400 | and returns the proper extended double-precision floating-point value
4401 | corresponding to the abstract input. Ordinarily, the abstract value is
4402 | rounded and packed into the extended double-precision format, with the
4403 | inexact exception raised if the abstract input cannot be represented
4404 | exactly. However, if the abstract value is too large, the overflow and
4405 | inexact exceptions are raised and an infinity or maximal finite value is
4406 | returned. If the abstract value is too small, the input value is rounded to
4407 | a subnormal number, and the underflow and inexact exceptions are raised if
4408 | the abstract input cannot be represented exactly as a subnormal extended
4409 | double-precision floating-point number.
4410 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4411 | number of bits as single or double precision, respectively. Otherwise, the
4412 | result is rounded to the full precision of the extended double-precision
4413 | format.
4414 | The input significand must be normalized or smaller. If the input
4415 | significand is not normalized, `zExp' must be 0; in that case, the result
4416 | returned is a subnormal number, and it must not require rounding. The
4417 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4418 | Floating-Point Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Reduced-precision modes (x87 PC field): round within the 64-bit
     * significand to 53 or 24 significant bits.  Any other precision
     * value takes the full 80-bit path. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into the sticky bit of the high word. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare catches zExp <= 0 as well as zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess: before rounding (per status), or the increment
             * cannot carry the value up into the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 makes the result normal again. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Nearest-even tie: widen the mask to clear the LSB too. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of the significand: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the round/sticky bits below the
     * 64-bit result significand. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* roundMask = 0 so the reduced-precision entry into this
             * label (via goto overflow) still yields all-ones. */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes clamp to the largest finite value. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* The shift above changed zSig1, so the increment decision
             * must be recomputed for the denormalized value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear the LSB. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Carry into bit 63 promotes the result to normal. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize to 1.0 * 2^(zExp+1). */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4608
4609 /*----------------------------------------------------------------------------
4610 | Takes an abstract floating-point value having sign `zSign', exponent
4611 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4612 | and returns the proper extended double-precision floating-point value
4613 | corresponding to the abstract input. This routine is just like
4614 | `roundAndPackFloatx80' except that the input significand does not have to be
4615 | normalized.
4616 *----------------------------------------------------------------------------*/
4617
4618 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4619 bool zSign, int32_t zExp,
4620 uint64_t zSig0, uint64_t zSig1,
4621 float_status *status)
4622 {
4623 int8_t shiftCount;
4624
4625 if ( zSig0 == 0 ) {
4626 zSig0 = zSig1;
4627 zSig1 = 0;
4628 zExp -= 64;
4629 }
4630 shiftCount = clz64(zSig0);
4631 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4632 zExp -= shiftCount;
4633 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4634 zSig0, zSig1, status);
4635
4636 }
4637
4638 /*----------------------------------------------------------------------------
4639 | Returns the least-significant 64 fraction bits of the quadruple-precision
4640 | floating-point value `a'.
4641 *----------------------------------------------------------------------------*/
4642
4643 static inline uint64_t extractFloat128Frac1( float128 a )
4644 {
4645
4646 return a.low;
4647
4648 }
4649
4650 /*----------------------------------------------------------------------------
4651 | Returns the most-significant 48 fraction bits of the quadruple-precision
4652 | floating-point value `a'.
4653 *----------------------------------------------------------------------------*/
4654
4655 static inline uint64_t extractFloat128Frac0( float128 a )
4656 {
4657
4658 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4659
4660 }
4661
4662 /*----------------------------------------------------------------------------
4663 | Returns the exponent bits of the quadruple-precision floating-point value
4664 | `a'.
4665 *----------------------------------------------------------------------------*/
4666
4667 static inline int32_t extractFloat128Exp( float128 a )
4668 {
4669
4670 return ( a.high>>48 ) & 0x7FFF;
4671
4672 }
4673
4674 /*----------------------------------------------------------------------------
4675 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4676 *----------------------------------------------------------------------------*/
4677
4678 static inline bool extractFloat128Sign(float128 a)
4679 {
4680 return a.high >> 63;
4681 }
4682
4683 /*----------------------------------------------------------------------------
4684 | Normalizes the subnormal quadruple-precision floating-point value
4685 | represented by the denormalized significand formed by the concatenation of
4686 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4687 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4688 | significand are stored at the location pointed to by `zSig0Ptr', and the
4689 | least significant 64 bits of the normalized significand are stored at the
4690 | location pointed to by `zSig1Ptr'.
4691 *----------------------------------------------------------------------------*/
4692
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr, uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is already in the high word: shift the pair left
         * so it lands in bit 48 (the float128 integer-bit slot). */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }
    /* High word is empty: normalize from the low word alone. */
    shift = clz64(aSig1) - 15;
    if (shift < 0) {
        /* Leading one is above bit 48 of the low word, so the value
         * straddles both output words; (shift & 63) == 64 + shift
         * gives the complementary left-shift for the spill. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
4723
4724 /*----------------------------------------------------------------------------
4725 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4726 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4727 | floating-point value, returning the result. After being shifted into the
4728 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4729 | added together to form the most significant 32 bits of the result. This
4730 | means that any integer portion of `zSig0' will be added into the exponent.
4731 | Since a properly normalized significand will have an integer portion equal
4732 | to 1, the `zExp' input should be 1 less than the desired result exponent
4733 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4734 | significand.
4735 *----------------------------------------------------------------------------*/
4736
4737 static inline float128
4738 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4739 {
4740 float128 z;
4741
4742 z.low = zSig1;
4743 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4744 return z;
4745 }
4746
4747 /*----------------------------------------------------------------------------
4748 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4749 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4750 | and `zSig2', and returns the proper quadruple-precision floating-point value
4751 | corresponding to the abstract input. Ordinarily, the abstract value is
4752 | simply rounded and packed into the quadruple-precision format, with the
4753 | inexact exception raised if the abstract input cannot be represented
4754 | exactly. However, if the abstract value is too large, the overflow and
4755 | inexact exceptions are raised and an infinity or maximal finite value is
4756 | returned. If the abstract value is too small, the input value is rounded to
4757 | a subnormal number, and the underflow and inexact exceptions are raised if
4758 | the abstract input cannot be represented exactly as a subnormal quadruple-
4759 | precision floating-point number.
4760 | The input significand must be normalized or smaller. If the input
4761 | significand is not normalized, `zExp' must be 0; in that case, the result
4762 | returned is a subnormal number, and it must not require rounding. In the
4763 | usual case that the input significand is normalized, `zExp' must be 1 less
4764 | than the ``true'' floating-point exponent. The handling of underflow and
4765 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4766 *----------------------------------------------------------------------------*/
4767
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits below the 113-bit result
     * significand; decide whether they cause an increment of one ulp.
     * Round-to-odd increments only when the current LSB is even. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned cast folds negative zExp into the large range, catching
     * both overflow and underflow candidates in one compare. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /* Overflow.  Truncating modes (and round-to-odd) clamp to
             * the largest finite value; otherwise return infinity. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess: before rounding (per status), or the increment
             * cannot carry the value up into the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so the increment decision
             * must be recomputed for the denormalized value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact halfway case (top bit of zSig2 set, rest clear) under
         * nearest-even: force the result LSB to zero. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4879
4880 /*----------------------------------------------------------------------------
4881 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4882 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4883 | returns the proper quadruple-precision floating-point value corresponding
4884 | to the abstract input. This routine is just like `roundAndPackFloat128'
4885 | except that the input significand has fewer bits and does not have to be
4886 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4887 | point exponent.
4888 *----------------------------------------------------------------------------*/
4889
4890 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4891 uint64_t zSig0, uint64_t zSig1,
4892 float_status *status)
4893 {
4894 int8_t shiftCount;
4895 uint64_t zSig2;
4896
4897 if ( zSig0 == 0 ) {
4898 zSig0 = zSig1;
4899 zSig1 = 0;
4900 zExp -= 64;
4901 }
4902 shiftCount = clz64(zSig0) - 15;
4903 if ( 0 <= shiftCount ) {
4904 zSig2 = 0;
4905 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4906 }
4907 else {
4908 shift128ExtraRightJamming(
4909 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4910 }
4911 zExp -= shiftCount;
4912 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4913
4914 }
4915
4916
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of converting the 32-bit two's complement integer `a'
4919 | to the extended double-precision floating-point format. The conversion
4920 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4921 | Arithmetic.
4922 *----------------------------------------------------------------------------*/
4923
4924 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4925 {
4926 bool zSign;
4927 uint32_t absA;
4928 int8_t shiftCount;
4929 uint64_t zSig;
4930
4931 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4932 zSign = ( a < 0 );
4933 absA = zSign ? - a : a;
4934 shiftCount = clz32(absA) + 32;
4935 zSig = absA;
4936 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4937
4938 }
4939
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the 32-bit two's complement integer `a' to
4942 | the quadruple-precision floating-point format. The conversion is performed
4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4945
4946 float128 int32_to_float128(int32_t a, float_status *status)
4947 {
4948 bool zSign;
4949 uint32_t absA;
4950 int8_t shiftCount;
4951 uint64_t zSig0;
4952
4953 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4954 zSign = ( a < 0 );
4955 absA = zSign ? - a : a;
4956 shiftCount = clz32(absA) + 17;
4957 zSig0 = absA;
4958 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4959
4960 }
4961
4962 /*----------------------------------------------------------------------------
4963 | Returns the result of converting the 64-bit two's complement integer `a'
4964 | to the extended double-precision floating-point format. The conversion
4965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4966 | Arithmetic.
4967 *----------------------------------------------------------------------------*/
4968
4969 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4970 {
4971 bool zSign;
4972 uint64_t absA;
4973 int8_t shiftCount;
4974
4975 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4976 zSign = ( a < 0 );
4977 absA = zSign ? - a : a;
4978 shiftCount = clz64(absA);
4979 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4980
4981 }
4982
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the 64-bit two's complement integer `a' to
4985 | the quadruple-precision floating-point format. The conversion is performed
4986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4987 *----------------------------------------------------------------------------*/
4988
4989 float128 int64_to_float128(int64_t a, float_status *status)
4990 {
4991 bool zSign;
4992 uint64_t absA;
4993 int8_t shiftCount;
4994 int32_t zExp;
4995 uint64_t zSig0, zSig1;
4996
4997 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4998 zSign = ( a < 0 );
4999 absA = zSign ? - a : a;
5000 shiftCount = clz64(absA) + 49;
5001 zExp = 0x406E - shiftCount;
5002 if ( 64 <= shiftCount ) {
5003 zSig1 = 0;
5004 zSig0 = absA;
5005 shiftCount -= 64;
5006 }
5007 else {
5008 zSig1 = absA;
5009 zSig0 = 0;
5010 }
5011 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5012 return packFloat128( zSign, zExp, zSig0, zSig1 );
5013
5014 }
5015
5016 /*----------------------------------------------------------------------------
5017 | Returns the result of converting the 64-bit unsigned integer `a'
5018 | to the quadruple-precision floating-point format. The conversion is performed
5019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 *----------------------------------------------------------------------------*/
5021
5022 float128 uint64_to_float128(uint64_t a, float_status *status)
5023 {
5024 if (a == 0) {
5025 return float128_zero;
5026 }
5027 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5028 }
5029
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the single-precision floating-point value
5032 | `a' to the extended double-precision floating-point format. The conversion
5033 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5034 | Arithmetic.
5035 *----------------------------------------------------------------------------*/
5036
5037 floatx80 float32_to_floatx80(float32 a, float_status *status)
5038 {
5039 bool aSign;
5040 int aExp;
5041 uint32_t aSig;
5042
5043 a = float32_squash_input_denormal(a, status);
5044 aSig = extractFloat32Frac( a );
5045 aExp = extractFloat32Exp( a );
5046 aSign = extractFloat32Sign( a );
5047 if ( aExp == 0xFF ) {
5048 if (aSig) {
5049 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5050 status);
5051 return floatx80_silence_nan(res, status);
5052 }
5053 return packFloatx80(aSign,
5054 floatx80_infinity_high,
5055 floatx80_infinity_low);
5056 }
5057 if ( aExp == 0 ) {
5058 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5059 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5060 }
5061 aSig |= 0x00800000;
5062 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5063
5064 }
5065
5066 /*----------------------------------------------------------------------------
5067 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
5069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5070 | Arithmetic.
5071 *----------------------------------------------------------------------------*/
5072
5073 float128 float32_to_float128(float32 a, float_status *status)
5074 {
5075 bool aSign;
5076 int aExp;
5077 uint32_t aSig;
5078
5079 a = float32_squash_input_denormal(a, status);
5080 aSig = extractFloat32Frac( a );
5081 aExp = extractFloat32Exp( a );
5082 aSign = extractFloat32Sign( a );
5083 if ( aExp == 0xFF ) {
5084 if (aSig) {
5085 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5086 }
5087 return packFloat128( aSign, 0x7FFF, 0, 0 );
5088 }
5089 if ( aExp == 0 ) {
5090 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5091 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5092 --aExp;
5093 }
5094 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5095
5096 }
5097
5098 /*----------------------------------------------------------------------------
5099 | Returns the remainder of the single-precision floating-point value `a'
5100 | with respect to the corresponding value `b'. The operation is performed
5101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 *----------------------------------------------------------------------------*/
5103
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is Inf or NaN: NaN operands propagate; Inf % x is invalid */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        /* b is NaN (propagate) or Inf (x % Inf == x) */
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* x % 0 is invalid; otherwise normalize subnormal b */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* 0 % b == 0 (sign preserved); otherwise normalize subnormal a */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* make the implicit integer bit explicit */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* exponents close: a single 32-bit division step suffices */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* large exponent gap: reduce with repeated estimated 128/64-bit
         * divisions, retiring 62 exponent bits per iteration */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* back the estimate off by 2 so it can never exceed the
             * true quotient */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* the quotient estimate was deliberately low: step it up until the
     * partial remainder goes negative */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* IEEE remainder: pick whichever of the two candidate remainders is
     * nearer, breaking exact ties toward an even quotient */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5199
5200
5201
5202 /*----------------------------------------------------------------------------
5203 | Returns the binary exponential of the single-precision floating-point value
5204 | `a'. The operation is performed according to the IEC/IEEE Standard for
5205 | Binary Floating-Point Arithmetic.
5206 |
5207 | Uses the following identities:
5208 |
5209 | 1. -------------------------------------------------------------------------
5210 | x x*ln(2)
5211 | 2 = e
5212 |
5213 | 2. -------------------------------------------------------------------------
5214 | 2 3 4 5 n
5215 | x x x x x x x
5216 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5217 | 1! 2! 3! 4! 5! n!
5218 *----------------------------------------------------------------------------*/
5219
/* Taylor-series coefficients for e^x: entry i holds 1/(i+1)! encoded as
 * a raw float64 bit pattern (the comment after each entry is the factorial
 * argument n). */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5238
5239 float32 float32_exp2(float32 a, float_status *status)
5240 {
5241 bool aSign;
5242 int aExp;
5243 uint32_t aSig;
5244 float64 r, x, xn;
5245 int i;
5246 a = float32_squash_input_denormal(a, status);
5247
5248 aSig = extractFloat32Frac( a );
5249 aExp = extractFloat32Exp( a );
5250 aSign = extractFloat32Sign( a );
5251
5252 if ( aExp == 0xFF) {
5253 if (aSig) {
5254 return propagateFloat32NaN(a, float32_zero, status);
5255 }
5256 return (aSign) ? float32_zero : a;
5257 }
5258 if (aExp == 0) {
5259 if (aSig == 0) return float32_one;
5260 }
5261
5262 float_raise(float_flag_inexact, status);
5263
5264 /* ******************************* */
5265 /* using float64 for approximation */
5266 /* ******************************* */
5267 x = float32_to_float64(a, status);
5268 x = float64_mul(x, float64_ln2, status);
5269
5270 xn = x;
5271 r = float64_one;
5272 for (i = 0 ; i < 15 ; i++) {
5273 float64 f;
5274
5275 f = float64_mul(xn, float32_exp2_coefficients[i], status);
5276 r = float64_add(r, f, status);
5277
5278 xn = float64_mul(xn, x, status);
5279 }
5280
5281 return float64_to_float32(r, status);
5282 }
5283
5284 /*----------------------------------------------------------------------------
5285 | Returns the binary log of the single-precision floating-point value `a'.
5286 | The operation is performed according to the IEC/IEEE Standard for Binary
5287 | Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) == -Inf; normalize subnormals before computing */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        /* log2(NaN) propagates; log2(+Inf) == +Inf */
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* the integer part of the result is the unbiased exponent */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    /* generate fraction bits MSB-first by repeated squaring: squaring the
     * significand doubles its log2, and a carry past 2.0 means the next
     * result bit is 1 */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
5333
5334 /*----------------------------------------------------------------------------
5335 | Returns the result of converting the double-precision floating-point value
5336 | `a' to the extended double-precision floating-point format. The conversion
5337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338 | Arithmetic.
5339 *----------------------------------------------------------------------------*/
5340
5341 floatx80 float64_to_floatx80(float64 a, float_status *status)
5342 {
5343 bool aSign;
5344 int aExp;
5345 uint64_t aSig;
5346
5347 a = float64_squash_input_denormal(a, status);
5348 aSig = extractFloat64Frac( a );
5349 aExp = extractFloat64Exp( a );
5350 aSign = extractFloat64Sign( a );
5351 if ( aExp == 0x7FF ) {
5352 if (aSig) {
5353 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5354 status);
5355 return floatx80_silence_nan(res, status);
5356 }
5357 return packFloatx80(aSign,
5358 floatx80_infinity_high,
5359 floatx80_infinity_low);
5360 }
5361 if ( aExp == 0 ) {
5362 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5363 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5364 }
5365 return
5366 packFloatx80(
5367 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5368
5369 }
5370
5371 /*----------------------------------------------------------------------------
5372 | Returns the result of converting the double-precision floating-point value
5373 | `a' to the quadruple-precision floating-point format. The conversion is
5374 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5375 | Arithmetic.
5376 *----------------------------------------------------------------------------*/
5377
5378 float128 float64_to_float128(float64 a, float_status *status)
5379 {
5380 bool aSign;
5381 int aExp;
5382 uint64_t aSig, zSig0, zSig1;
5383
5384 a = float64_squash_input_denormal(a, status);
5385 aSig = extractFloat64Frac( a );
5386 aExp = extractFloat64Exp( a );
5387 aSign = extractFloat64Sign( a );
5388 if ( aExp == 0x7FF ) {
5389 if (aSig) {
5390 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5391 }
5392 return packFloat128( aSign, 0x7FFF, 0, 0 );
5393 }
5394 if ( aExp == 0 ) {
5395 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5396 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5397 --aExp;
5398 }
5399 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5400 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5401
5402 }
5403
5404
5405 /*----------------------------------------------------------------------------
5406 | Returns the remainder of the double-precision floating-point value `a'
5407 | with respect to the corresponding value `b'. The operation is performed
5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5410
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is Inf or NaN: NaN operands propagate; Inf % x is invalid */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        /* b is NaN (propagate) or Inf (x % Inf == x) */
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* x % 0 is invalid; otherwise normalize subnormal b */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* 0 % b == 0 (sign preserved); otherwise normalize subnormal a */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* make the implicit integer bit explicit and left-align */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* reduce large exponent differences with repeated estimated
     * 128/64-bit divisions, retiring 62 exponent bits per iteration */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* back the estimate off by 2 so it never exceeds the true quotient */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* the quotient estimate was deliberately low: step it up until the
     * partial remainder goes negative */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* IEEE remainder: pick whichever of the two candidate remainders is
     * nearer, breaking exact ties toward an even quotient */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5492
5493 /*----------------------------------------------------------------------------
5494 | Returns the binary log of the double-precision floating-point value `a'.
5495 | The operation is performed according to the IEC/IEEE Standard for Binary
5496 | Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) == -Inf; normalize subnormals before computing */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        /* log2(NaN) propagates; log2(+Inf) == +Inf */
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;
    }

    /* the integer part of the result is the unbiased exponent */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    /* generate fraction bits MSB-first by repeated squaring: squaring the
     * significand doubles its log2, and a carry past 2.0 means the next
     * result bit is 1 */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5541
5542 /*----------------------------------------------------------------------------
5543 | Returns the result of converting the extended double-precision floating-
5544 | point value `a' to the 32-bit two's complement integer format. The
5545 | conversion is performed according to the IEC/IEEE Standard for Binary
5546 | Floating-Point Arithmetic---which means in particular that the conversion
5547 | is rounded according to the current rounding mode. If `a' is a NaN, the
5548 | largest positive integer is returned. Otherwise, if the conversion
5549 | overflows, the largest integer with the same sign as `a' is returned.
5550 *----------------------------------------------------------------------------*/
5551
5552 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5553 {
5554 bool aSign;
5555 int32_t aExp, shiftCount;
5556 uint64_t aSig;
5557
5558 if (floatx80_invalid_encoding(a)) {
5559 float_raise(float_flag_invalid, status);
5560 return 1 << 31;
5561 }
5562 aSig = extractFloatx80Frac( a );
5563 aExp = extractFloatx80Exp( a );
5564 aSign = extractFloatx80Sign( a );
5565 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5566 shiftCount = 0x4037 - aExp;
5567 if ( shiftCount <= 0 ) shiftCount = 1;
5568 shift64RightJamming( aSig, shiftCount, &aSig );
5569 return roundAndPackInt32(aSign, aSig, status);
5570
5571 }
5572
5573 /*----------------------------------------------------------------------------
5574 | Returns the result of converting the extended double-precision floating-
5575 | point value `a' to the 32-bit two's complement integer format. The
5576 | conversion is performed according to the IEC/IEEE Standard for Binary
5577 | Floating-Point Arithmetic, except that the conversion is always rounded
5578 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5579 | Otherwise, if the conversion overflows, the largest integer with the same
5580 | sign as `a' is returned.
5581 *----------------------------------------------------------------------------*/
5582
5583 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5584 {
5585 bool aSign;
5586 int32_t aExp, shiftCount;
5587 uint64_t aSig, savedASig;
5588 int32_t z;
5589
5590 if (floatx80_invalid_encoding(a)) {
5591 float_raise(float_flag_invalid, status);
5592 return 1 << 31;
5593 }
5594 aSig = extractFloatx80Frac( a );
5595 aExp = extractFloatx80Exp( a );
5596 aSign = extractFloatx80Sign( a );
5597 if ( 0x401E < aExp ) {
5598 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5599 goto invalid;
5600 }
5601 else if ( aExp < 0x3FFF ) {
5602 if (aExp || aSig) {
5603 float_raise(float_flag_inexact, status);
5604 }
5605 return 0;
5606 }
5607 shiftCount = 0x403E - aExp;
5608 savedASig = aSig;
5609 aSig >>= shiftCount;
5610 z = aSig;
5611 if ( aSign ) z = - z;
5612 if ( ( z < 0 ) ^ aSign ) {
5613 invalid:
5614 float_raise(float_flag_invalid, status);
5615 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5616 }
5617 if ( ( aSig<<shiftCount ) != savedASig ) {
5618 float_raise(float_flag_inexact, status);
5619 }
5620 return z;
5621
5622 }
5623
5624 /*----------------------------------------------------------------------------
5625 | Returns the result of converting the extended double-precision floating-
5626 | point value `a' to the 64-bit two's complement integer format. The
5627 | conversion is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic---which means in particular that the conversion
5629 | is rounded according to the current rounding mode. If `a' is a NaN,
5630 | the largest positive integer is returned. Otherwise, if the conversion
5631 | overflows, the largest integer with the same sign as `a' is returned.
5632 *----------------------------------------------------------------------------*/
5633
5634 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5635 {
5636 bool aSign;
5637 int32_t aExp, shiftCount;
5638 uint64_t aSig, aSigExtra;
5639
5640 if (floatx80_invalid_encoding(a)) {
5641 float_raise(float_flag_invalid, status);
5642 return 1ULL << 63;
5643 }
5644 aSig = extractFloatx80Frac( a );
5645 aExp = extractFloatx80Exp( a );
5646 aSign = extractFloatx80Sign( a );
5647 shiftCount = 0x403E - aExp;
5648 if ( shiftCount <= 0 ) {
5649 if ( shiftCount ) {
5650 float_raise(float_flag_invalid, status);
5651 if (!aSign || floatx80_is_any_nan(a)) {
5652 return INT64_MAX;
5653 }
5654 return INT64_MIN;
5655 }
5656 aSigExtra = 0;
5657 }
5658 else {
5659 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5660 }
5661 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5662
5663 }
5664
5665 /*----------------------------------------------------------------------------
5666 | Returns the result of converting the extended double-precision floating-
5667 | point value `a' to the 64-bit two's complement integer format. The
5668 | conversion is performed according to the IEC/IEEE Standard for Binary
5669 | Floating-Point Arithmetic, except that the conversion is always rounded
5670 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5671 | Otherwise, if the conversion overflows, the largest integer with the same
5672 | sign as `a' is returned.
5673 *----------------------------------------------------------------------------*/
5674
5675 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5676 {
5677 bool aSign;
5678 int32_t aExp, shiftCount;
5679 uint64_t aSig;
5680 int64_t z;
5681
5682 if (floatx80_invalid_encoding(a)) {
5683 float_raise(float_flag_invalid, status);
5684 return 1ULL << 63;
5685 }
5686 aSig = extractFloatx80Frac( a );
5687 aExp = extractFloatx80Exp( a );
5688 aSign = extractFloatx80Sign( a );
5689 shiftCount = aExp - 0x403E;
5690 if ( 0 <= shiftCount ) {
5691 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5692 if ( ( a.high != 0xC03E ) || aSig ) {
5693 float_raise(float_flag_invalid, status);
5694 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5695 return INT64_MAX;
5696 }
5697 }
5698 return INT64_MIN;
5699 }
5700 else if ( aExp < 0x3FFF ) {
5701 if (aExp | aSig) {
5702 float_raise(float_flag_inexact, status);
5703 }
5704 return 0;
5705 }
5706 z = aSig>>( - shiftCount );
5707 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5708 float_raise(float_flag_inexact, status);
5709 }
5710 if ( aSign ) z = - z;
5711 return z;
5712
5713 }
5714
5715 /*----------------------------------------------------------------------------
5716 | Returns the result of converting the extended double-precision floating-
5717 | point value `a' to the single-precision floating-point format. The
5718 | conversion is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5721
5722 float32 floatx80_to_float32(floatx80 a, float_status *status)
5723 {
5724 bool aSign;
5725 int32_t aExp;
5726 uint64_t aSig;
5727
5728 if (floatx80_invalid_encoding(a)) {
5729 float_raise(float_flag_invalid, status);
5730 return float32_default_nan(status);
5731 }
5732 aSig = extractFloatx80Frac( a );
5733 aExp = extractFloatx80Exp( a );
5734 aSign = extractFloatx80Sign( a );
5735 if ( aExp == 0x7FFF ) {
5736 if ( (uint64_t) ( aSig<<1 ) ) {
5737 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5738 status);
5739 return float32_silence_nan(res, status);
5740 }
5741 return packFloat32( aSign, 0xFF, 0 );
5742 }
5743 shift64RightJamming( aSig, 33, &aSig );
5744 if ( aExp || aSig ) aExp -= 0x3F81;
5745 return roundAndPackFloat32(aSign, aExp, aSig, status);
5746
5747 }
5748
5749 /*----------------------------------------------------------------------------
5750 | Returns the result of converting the extended double-precision floating-
5751 | point value `a' to the double-precision floating-point format. The
5752 | conversion is performed according to the IEC/IEEE Standard for Binary
5753 | Floating-Point Arithmetic.
5754 *----------------------------------------------------------------------------*/
5755
5756 float64 floatx80_to_float64(floatx80 a, float_status *status)
5757 {
5758 bool aSign;
5759 int32_t aExp;
5760 uint64_t aSig, zSig;
5761
5762 if (floatx80_invalid_encoding(a)) {
5763 float_raise(float_flag_invalid, status);
5764 return float64_default_nan(status);
5765 }
5766 aSig = extractFloatx80Frac( a );
5767 aExp = extractFloatx80Exp( a );
5768 aSign = extractFloatx80Sign( a );
5769 if ( aExp == 0x7FFF ) {
5770 if ( (uint64_t) ( aSig<<1 ) ) {
5771 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5772 status);
5773 return float64_silence_nan(res, status);
5774 }
5775 return packFloat64( aSign, 0x7FF, 0 );
5776 }
5777 shift64RightJamming( aSig, 1, &zSig );
5778 if ( aExp || aSig ) aExp -= 0x3C01;
5779 return roundAndPackFloat64(aSign, aExp, zSig, status);
5780
5781 }
5782
5783 /*----------------------------------------------------------------------------
5784 | Returns the result of converting the extended double-precision floating-
5785 | point value `a' to the quadruple-precision floating-point format. The
5786 | conversion is performed according to the IEC/IEEE Standard for Binary
5787 | Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5789
5790 float128 floatx80_to_float128(floatx80 a, float_status *status)
5791 {
5792 bool aSign;
5793 int aExp;
5794 uint64_t aSig, zSig0, zSig1;
5795
5796 if (floatx80_invalid_encoding(a)) {
5797 float_raise(float_flag_invalid, status);
5798 return float128_default_nan(status);
5799 }
5800 aSig = extractFloatx80Frac( a );
5801 aExp = extractFloatx80Exp( a );
5802 aSign = extractFloatx80Sign( a );
5803 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5804 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5805 status);
5806 return float128_silence_nan(res, status);
5807 }
5808 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5809 return packFloat128( aSign, aExp, zSig0, zSig1 );
5810
5811 }
5812
5813 /*----------------------------------------------------------------------------
5814 | Rounds the extended double-precision floating-point value `a'
5815 | to the precision provided by floatx80_rounding_precision and returns the
5816 | result as an extended double-precision floating-point value.
5817 | The operation is performed according to the IEC/IEEE Standard for Binary
5818 | Floating-Point Arithmetic.
5819 *----------------------------------------------------------------------------*/
5820
5821 floatx80 floatx80_round(floatx80 a, float_status *status)
5822 {
5823 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5824 extractFloatx80Sign(a),
5825 extractFloatx80Exp(a),
5826 extractFloatx80Frac(a), 0, status);
5827 }
5828
5829 /*----------------------------------------------------------------------------
5830 | Rounds the extended double-precision floating-point value `a' to an integer,
5831 | and returns the result as an extended quadruple-precision floating-point
5832 | value. The operation is performed according to the IEC/IEEE Standard for
5833 | Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5835
floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* |a| >= 2^63 (or Inf/NaN): already an integer */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: result is 0 or +/-1 depending on the rounding mode */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* 0.5 <= |a| < 1 with fraction bits set rounds to +/-1 */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* 0.5 <= |a| < 1 rounds away from zero to +/-1 */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                    : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                    : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2^63: round in place with a mask covering the fraction
     * bits below the binary point */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            /* exact tie: clear the last kept bit to make the result even */
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* rounding carried out of the significand: renormalize */
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
5931
5932 /*----------------------------------------------------------------------------
5933 | Returns the result of adding the absolute values of the extended double-
5934 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5935 | negated before being returned. `zSign' is ignored if the result is a NaN.
5936 | The addition is performed according to the IEC/IEEE Standard for Binary
5937 | Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* a subnormal's effective exponent is one above its field value */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* equal exponents: no alignment required */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* top bit still set means the explicit integer bit survived: no carry */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* carry out of the addition: shift right one, restore the integer
     * bit, and bump the exponent */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6011
6012 /*----------------------------------------------------------------------------
6013 | Returns the result of subtracting the absolute values of the extended
6014 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6015 | difference is negated before being returned. `zSign' is ignored if the
6016 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6017 | Standard for Binary Floating-Point Arithmetic.
6018 *----------------------------------------------------------------------------*/
6019
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* equal exponents from here down */
    if ( aExp == 0x7FFF ) {
        /* NaNs propagate; Inf - Inf is invalid */
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* subnormals have the same effective exponent as exponent 1 */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* exact cancellation: the zero is negative only when rounding down */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - Inf: infinity with the opposite sign */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, sign unchanged */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6080
6081 /*----------------------------------------------------------------------------
6082 | Returns the result of adding the extended double-precision floating-point
6083 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6084 | Standard for Binary Floating-Point Arithmetic.
6085 *----------------------------------------------------------------------------*/
6086
6087 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6088 {
6089 bool aSign, bSign;
6090
6091 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6092 float_raise(float_flag_invalid, status);
6093 return floatx80_default_nan(status);
6094 }
6095 aSign = extractFloatx80Sign( a );
6096 bSign = extractFloatx80Sign( b );
6097 if ( aSign == bSign ) {
6098 return addFloatx80Sigs(a, b, aSign, status);
6099 }
6100 else {
6101 return subFloatx80Sigs(a, b, aSign, status);
6102 }
6103
6104 }
6105
6106 /*----------------------------------------------------------------------------
6107 | Returns the result of subtracting the extended double-precision floating-
6108 | point values `a' and `b'. The operation is performed according to the
6109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6110 *----------------------------------------------------------------------------*/
6111
6112 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6113 {
6114 bool aSign, bSign;
6115
6116 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6117 float_raise(float_flag_invalid, status);
6118 return floatx80_default_nan(status);
6119 }
6120 aSign = extractFloatx80Sign( a );
6121 bSign = extractFloatx80Sign( b );
6122 if ( aSign == bSign ) {
6123 return subFloatx80Sigs(a, b, aSign, status);
6124 }
6125 else {
6126 return addFloatx80Sigs(a, b, aSign, status);
6127 }
6128
6129 }
6130
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of multiplying the extended double-precision floating-
6133 | point values `a' and `b'. The operation is performed according to the
6134 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6135 *----------------------------------------------------------------------------*/
6136
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Unnormal/pseudo encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;      /* product sign is the XOR of the signs */
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;   /* Inf * 0 */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        /* b is Inf or NaN. */
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
            /* 0 * Inf is an invalid operation. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /*
     * Both significands now have their top bit set, so the 128-bit
     * product lies in [2^126, 2^128); the 0x3FFE bias (not 0x3FFF)
     * allows for the product carrying one or two integer bits.
     */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        /* Top bit clear: renormalize by one position. */
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6192
6193 /*----------------------------------------------------------------------------
6194 | Returns the result of dividing the extended double-precision floating-point
6195 | value `a' by the corresponding value `b'. The operation is performed
6196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6197 *----------------------------------------------------------------------------*/
6198
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Unnormal/pseudo encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            goto invalid;       /* Inf / Inf */
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );     /* finite / Inf = 0 */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0 is invalid; any other x / 0 raises divbyzero. */
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    /* Pre-scale so aSig < bSig, i.e. the significand quotient is < 1. */
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /*
     * High 64 quotient bits: the estimate may be slightly too large, so
     * decrement until the partial remainder is non-negative.
     */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /*
     * Low 64 quotient bits; refine the estimate only when it is close
     * enough to a rounding boundary to matter.
     */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );      /* sticky bit */
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6279
6280 /*----------------------------------------------------------------------------
6281 | Returns the remainder of the extended double-precision floating-point value
6282 | `a' with respect to the corresponding value `b'. The operation is performed
6283 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6284 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6285 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6286 | the absolute value of the integer quotient.
6287 *----------------------------------------------------------------------------*/
6288
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Unnormal/pseudo encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* Inf % x is invalid; NaNs propagate. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;       /* finite % Inf = a */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x % 0 is an invalid operation. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|: for mod (or much smaller a) a itself is the answer. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* Long division of the significands, 62 quotient bits per pass. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;      /* bias low so the estimate never overshoots */
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial quotient of fewer than 64 bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Fix up the low-biased estimate by repeated subtraction. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /*
         * IEEE remainder rounds the quotient to nearest-even: if the
         * complementary remainder is smaller in magnitude (or equal with
         * q odd), switch to it and flip the result sign.
         */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6409
6410 /*----------------------------------------------------------------------------
6411 | Returns the remainder of the extended double-precision floating-point value
6412 | `a' with respect to the corresponding value `b'. The operation is performed
6413 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6414 *----------------------------------------------------------------------------*/
6415
6416 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6417 {
6418 uint64_t quotient;
6419 return floatx80_modrem(a, b, false, &quotient, status);
6420 }
6421
6422 /*----------------------------------------------------------------------------
6423 | Returns the remainder of the extended double-precision floating-point value
6424 | `a' with respect to the corresponding value `b', with the quotient truncated
6425 | toward zero.
6426 *----------------------------------------------------------------------------*/
6427
6428 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6429 {
6430 uint64_t quotient;
6431 return floatx80_modrem(a, b, true, &quotient, status);
6432 }
6433
6434 /*----------------------------------------------------------------------------
6435 | Returns the square root of the extended double-precision floating-point
6436 | value `a'. The operation is performed according to the IEC/IEEE Standard
6437 | for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6439
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unnormal/pseudo encodings are invalid operands. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;    /* sqrt(+Inf) = +Inf */
        goto invalid;
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 ) == 0 ) return a;  /* sqrt(-0) = -0 */
        /* Square root of a negative number is invalid. */
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent; odd exponents are absorbed below. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* 32-bit root estimate, refined to 64 bits via a division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Decrement zSig0 until rem = a - zSig0^2 is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        /* Adding 2*zSig0 + 1 undoes one decrement of the square. */
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 result bits; refine only when near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    /* Assemble the 128-bit result significand for rounding. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6504
6505 /*----------------------------------------------------------------------------
6506 | Returns the result of converting the quadruple-precision floating-point
6507 | value `a' to the 32-bit two's complement integer format. The conversion
6508 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6509 | Arithmetic---which means in particular that the conversion is rounded
6510 | according to the current rounding mode. If `a' is a NaN, the largest
6511 | positive integer is returned. Otherwise, if the conversion overflows, the
6512 | largest integer with the same sign as `a' is returned.
6513 *----------------------------------------------------------------------------*/
6514
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN: force the sign positive so saturation yields INT32_MAX. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    aSig0 |= ( aSig1 != 0 );    /* low fraction word becomes stickiness */
    /* Align so roundAndPackInt32 sees the value with 7 round bits. */
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}
6533
6534 /*----------------------------------------------------------------------------
6535 | Returns the result of converting the quadruple-precision floating-point
6536 | value `a' to the 32-bit two's complement integer format. The conversion
6537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6538 | Arithmetic, except that the conversion is always rounded toward zero. If
6539 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6540 | conversion overflows, the largest integer with the same sign as `a' is
6541 | returned.
6542 *----------------------------------------------------------------------------*/
6543
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    aSig0 |= ( aSig1 != 0 );    /* low fraction word becomes stickiness */
    if ( 0x401E < aExp ) {
        /* Magnitude too large for int32 (NaN saturates positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless a is exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);      /* implicit integer bit */
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6583
6584 /*----------------------------------------------------------------------------
6585 | Returns the result of converting the quadruple-precision floating-point
6586 | value `a' to the 64-bit two's complement integer format. The conversion
6587 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6588 | Arithmetic---which means in particular that the conversion is rounded
6589 | according to the current rounding mode. If `a' is a NaN, the largest
6590 | positive integer is returned. Otherwise, if the conversion overflows, the
6591 | largest integer with the same sign as `a' is returned.
6592 *----------------------------------------------------------------------------*/
6593
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude is at least 2^64 (or NaN): saturate.  Only the
               exact value -2^63 escapes below via INT64_MIN; NaNs take
               INT64_MAX regardless of sign. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right, folding lost bits into the sticky position. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6626
6627 /*----------------------------------------------------------------------------
6628 | Returns the result of converting the quadruple-precision floating-point
6629 | value `a' to the 64-bit two's complement integer format. The conversion
6630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6631 | Arithmetic, except that the conversion is always rounded toward zero.
6632 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6633 | the conversion overflows, the largest integer with the same sign as `a' is
6634 | returned.
6635 *----------------------------------------------------------------------------*/
6636
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Negative values that still truncate to exactly INT64_MIN
               are valid (possibly inexact); everything else this large
               is invalid and saturates. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Integer part spans both significand words. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Any discarded fraction bits make the truncation inexact. */
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6689
6690 /*----------------------------------------------------------------------------
6691 | Returns the result of converting the quadruple-precision floating-point value
6692 | `a' to the 64-bit unsigned integer format. The conversion is
6693 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6694 | Arithmetic---which means in particular that the conversion is rounded
6695 | according to the current rounding mode. If `a' is a NaN, the largest
6696 | positive integer is returned. If the conversion overflows, the
6697 | largest unsigned integer is returned. If 'a' is negative, the value is
6698 | rounded and zero is returned; negative values that do not round to zero
6699 | will raise the inexact exception.
6700 *----------------------------------------------------------------------------*/
6701
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative with |a| >= 1 (includes -Inf and negative NaN) cannot
       round into uint64 range; smaller negatives fall through so the
       rounding in roundAndPackUint64 decides. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);  /* implicit integer bit */
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Too large for uint64 (or positive NaN): saturate. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift right, folding lost bits into the sticky position. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6736
6737 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6738 {
6739 uint64_t v;
6740 signed char current_rounding_mode = status->float_rounding_mode;
6741
6742 set_float_rounding_mode(float_round_to_zero, status);
6743 v = float128_to_uint64(a, status);
6744 set_float_rounding_mode(current_rounding_mode, status);
6745
6746 return v;
6747 }
6748
6749 /*----------------------------------------------------------------------------
6750 | Returns the result of converting the quadruple-precision floating-point
6751 | value `a' to the 32-bit unsigned integer format. The conversion
6752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6753 | Arithmetic except that the conversion is always rounded toward zero.
6754 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6755 | if the conversion overflows, the largest unsigned integer is returned.
6756 | If 'a' is negative, the value is rounded and zero is returned; negative
6757 | values that do not round to zero will raise the inexact exception.
6758 *----------------------------------------------------------------------------*/
6759
6760 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6761 {
6762 uint64_t v;
6763 uint32_t res;
6764 int old_exc_flags = get_float_exception_flags(status);
6765
6766 v = float128_to_uint64_round_to_zero(a, status);
6767 if (v > 0xffffffff) {
6768 res = 0xffffffff;
6769 } else {
6770 return v;
6771 }
6772 set_float_exception_flags(old_exc_flags, status);
6773 float_raise(float_flag_invalid, status);
6774 return res;
6775 }
6776
6777 /*----------------------------------------------------------------------------
6778 | Returns the result of converting the quadruple-precision floating-point value
6779 | `a' to the 32-bit unsigned integer format. The conversion is
6780 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6781 | Arithmetic---which means in particular that the conversion is rounded
6782 | according to the current rounding mode. If `a' is a NaN, the largest
6783 | positive integer is returned. If the conversion overflows, the
6784 | largest unsigned integer is returned. If 'a' is negative, the value is
6785 | rounded and zero is returned; negative values that do not round to zero
6786 | will raise the inexact exception.
6787 *----------------------------------------------------------------------------*/
6788
6789 uint32_t float128_to_uint32(float128 a, float_status *status)
6790 {
6791 uint64_t v;
6792 uint32_t res;
6793 int old_exc_flags = get_float_exception_flags(status);
6794
6795 v = float128_to_uint64(a, status);
6796 if (v > 0xffffffff) {
6797 res = 0xffffffff;
6798 } else {
6799 return v;
6800 }
6801 set_float_exception_flags(old_exc_flags, status);
6802 float_raise(float_flag_invalid, status);
6803 return res;
6804 }
6805
6806 /*----------------------------------------------------------------------------
6807 | Returns the result of converting the quadruple-precision floating-point
6808 | value `a' to the single-precision floating-point format. The conversion
6809 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6810 | Arithmetic.
6811 *----------------------------------------------------------------------------*/
6812
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );   /* infinity */
    }
    aSig0 |= ( aSig1 != 0 );    /* low fraction word becomes stickiness */
    /* Narrow the fraction, jamming discarded bits into the sticky bit. */
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Make the integer bit explicit and rebias the exponent for
           roundAndPackFloat32. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6840
6841 /*----------------------------------------------------------------------------
6842 | Returns the result of converting the quadruple-precision floating-point
6843 | value `a' to the double-precision floating-point format. The conversion
6844 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6845 | Arithmetic.
6846 *----------------------------------------------------------------------------*/
6847
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );  /* infinity */
    }
    /* Pull high fraction bits up; the remaining low bits are sticky. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Make the integer bit explicit and rebias the exponent for
           roundAndPackFloat64. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6873
6874 /*----------------------------------------------------------------------------
6875 | Returns the result of converting the quadruple-precision floating-point
6876 | value `a' to the extended double-precision floating-point format. The
6877 | conversion is performed according to the IEC/IEEE Standard for Binary
6878 | Floating-Point Arithmetic.
6879 *----------------------------------------------------------------------------*/
6880
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN payload converts via the common format and is then
               silenced (floatx80_silence_nan). */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        aSig0 |= UINT64_C(0x0001000000000000);  /* implicit integer bit */
    }
    /* Shift the significand into floatx80's explicit-integer-bit layout
       and round at the full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6911
6912 /*----------------------------------------------------------------------------
6913 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6914 | returns the result as a quadruple-precision floating-point value. The
6915 | operation is performed according to the IEC/IEEE Standard for Binary
6916 | Floating-Point Arithmetic.
6917 *----------------------------------------------------------------------------*/
6918
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Exponent >= 48: the fraction's integer boundary lies within (or
           below) the low 64-bit word of the significand. */
        if ( 0x406F <= aExp ) {
            /* Exponent >= 112: already integral (or NaN/infinity). */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest integer-valued bit in z.low;
           it is 0 when that bit is the lsb of z.high (aExp == 0x406E
           makes the shift wrap to zero), and roundBitsMask covers all
           fractional bits below it. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp, then clear the last bit on exact ties. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The rounding bit is the msb of z.low. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as nearest-even but without the tie-breaking step. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Round away from zero only for positive values. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Round away from zero only for negative values. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fractional bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is zero or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round up to 1 only when strictly above one half. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* One half or more rounds away to 1. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: all fractional bits are in z.high and a.low. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* On an exact tie (including a.low == 0), force even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* a.low != 0 contributes a sticky bit before rounding up. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Raise inexact whenever any bits were discarded. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
7070
7071 /*----------------------------------------------------------------------------
7072 | Returns the result of multiplying the quadruple-precision floating-point
7073 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7074 | Standard for Binary Floating-Point Arithmetic.
7075 *----------------------------------------------------------------------------*/
7076
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    /* Unpack both operands into sign, biased exponent and fraction. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is NaN, or a is infinity and b is NaN: propagate. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * zero is invalid; infinity * finite is infinity. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* zero * infinity is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* Make a's implicit integer bit explicit, and shift b's fraction up
       so the 256-bit product lands with the right alignment for the
       rounding step below. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* b's implicit integer bit was not in the shifted bSig; it contributes
       aSig once into the high 128 bits of the product. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest 64 product bits into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product in [2,4): renormalize by one bit. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7133
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of dividing the quadruple-precision floating-point value
7136 | `a' by the corresponding value `b'. The operation is performed according to
7137 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7139
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack both operands into sign, biased exponent and fraction. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity is a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0/0 is invalid; x/0 raises divide-by-zero -> infinity. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Make both implicit integer bits explicit and shift the significands
       to the top of their 128-bit containers. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure the quotient is below 1 so each 64-bit estimate fits. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then correct downward while the
       remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64 quotient bits; only refine when the estimate is close
       enough to a rounding boundary for the error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Any nonzero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7220
7221 /*----------------------------------------------------------------------------
7222 | Returns the remainder of the quadruple-precision floating-point value `a'
7223 | with respect to the corresponding value `b'. The operation is performed
7224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7225 *----------------------------------------------------------------------------*/
7226
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack both operands (b's sign is irrelevant to the remainder). */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(infinity, x) is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(finite, infinity) is a itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    /* Align both significands (with explicit integer bits) near the top
       of the 128-bit containers; a is shifted one bit less when its
       exponent is smaller than b's. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Subtract b once if it fits, producing the first quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Peel off up to 61 quotient bits per iteration; the estimate is
       deliberately biased low (q - 4) so the partial remainder stays
       nonnegative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial quotient, scaled down to the remaining exponent gap. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, keeping the last
       nonnegative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Pick whichever candidate is nearer to zero (ties go to even q),
       giving the IEEE round-to-nearest remainder. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7327
7328 /*----------------------------------------------------------------------------
7329 | Returns the square root of the quadruple-precision floating-point value `a'.
7330 | The operation is performed according to the IEC/IEEE Standard for Binary
7331 | Floating-Point Arithmetic.
7332 *----------------------------------------------------------------------------*/
7333
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+inf) = +inf; sqrt(-inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit initial root estimate, refined to 64 bits via one
       Newton-like division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the significand; the exponent's parity picks the shift. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct zSig0 downward while rem = aSig - zSig0^2 is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Next 64 root bits; refine only when close to a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        /* rem -= 2*zSig0*zSig1 + zSig1^2; correct zSig1 while negative. */
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Nonzero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7395
/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b'.  When `is_quiet'
| is false, any NaN operand raises the invalid exception; when true, only
| signaling NaNs do.  Returns a FloatRelation (less/equal/greater/unordered).
*----------------------------------------------------------------------------*/
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Invalid encodings always signal and compare unordered. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* A NaN has an all-ones exponent and nonzero fraction bits below the
       explicit integer bit (hence the << 1). */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only if both are zeros (+0 == -0);
           otherwise the positive operand is the greater one. */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* 1 - 2*aSign maps sign 0 -> greater, 1 -> less. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude order, inverted when both operands are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7443
7444 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7445 {
7446 return floatx80_compare_internal(a, b, 0, status);
7447 }
7448
7449 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7450 float_status *status)
7451 {
7452 return floatx80_compare_internal(a, b, 1, status);
7453 }
7454
/*----------------------------------------------------------------------------
| Compares the quadruple-precision values `a' and `b'.  When `is_quiet' is
| false, any NaN operand raises the invalid exception; when true, only
| signaling NaNs do.  Returns a FloatRelation (less/equal/greater/unordered).
*----------------------------------------------------------------------------*/
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN operands (all-ones exponent, nonzero fraction) are unordered. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only for +0 vs -0 (the << 1 drops the
           sign bits); otherwise the positive operand is greater. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* 1 - 2*aSign maps sign 0 -> greater, 1 -> less. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude order, inverted when both operands are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7489
7490 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7491 {
7492 return float128_compare_internal(a, b, 0, status);
7493 }
7494
7495 FloatRelation float128_compare_quiet(float128 a, float128 b,
7496 float_status *status)
7497 {
7498 return float128_compare_internal(a, b, 1, status);
7499 }
7500
7501 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7502 {
7503 bool aSign;
7504 int32_t aExp;
7505 uint64_t aSig;
7506
7507 if (floatx80_invalid_encoding(a)) {
7508 float_raise(float_flag_invalid, status);
7509 return floatx80_default_nan(status);
7510 }
7511 aSig = extractFloatx80Frac( a );
7512 aExp = extractFloatx80Exp( a );
7513 aSign = extractFloatx80Sign( a );
7514
7515 if ( aExp == 0x7FFF ) {
7516 if ( aSig<<1 ) {
7517 return propagateFloatx80NaN(a, a, status);
7518 }
7519 return a;
7520 }
7521
7522 if (aExp == 0) {
7523 if (aSig == 0) {
7524 return a;
7525 }
7526 aExp++;
7527 }
7528
7529 if (n > 0x10000) {
7530 n = 0x10000;
7531 } else if (n < -0x10000) {
7532 n = -0x10000;
7533 }
7534
7535 aExp += n;
7536 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7537 aSign, aExp, aSig, 0, status);
7538 }
7539
7540 float128 float128_scalbn(float128 a, int n, float_status *status)
7541 {
7542 bool aSign;
7543 int32_t aExp;
7544 uint64_t aSig0, aSig1;
7545
7546 aSig1 = extractFloat128Frac1( a );
7547 aSig0 = extractFloat128Frac0( a );
7548 aExp = extractFloat128Exp( a );
7549 aSign = extractFloat128Sign( a );
7550 if ( aExp == 0x7FFF ) {
7551 if ( aSig0 | aSig1 ) {
7552 return propagateFloat128NaN(a, a, status);
7553 }
7554 return a;
7555 }
7556 if (aExp != 0) {
7557 aSig0 |= UINT64_C(0x0001000000000000);
7558 } else if (aSig0 == 0 && aSig1 == 0) {
7559 return a;
7560 } else {
7561 aExp++;
7562 }
7563
7564 if (n > 0x10000) {
7565 n = 0x10000;
7566 } else if (n < -0x10000) {
7567 n = -0x10000;
7568 }
7569
7570 aExp += n - 1;
7571 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7572 , status);
7573
7574 }
7575
7576 static void __attribute__((constructor)) softfloat_init(void)
7577 {
7578 union_float64 ua, ub, uc, ur;
7579
7580 if (QEMU_NO_HARDFLOAT) {
7581 return;
7582 }
7583 /*
7584 * Test that the host's FMA is not obviously broken. For example,
7585 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7586 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7587 */
7588 ua.s = 0x0020000000000001ULL;
7589 ub.s = 0x3ca0000000000000ULL;
7590 uc.s = 0x0020000000000000ULL;
7591 ur.h = fma(ua.h, ub.h, uc.h);
7592 if (ur.s != 0x0020000000000001ULL) {
7593 force_soft_fma = true;
7594 }
7595 }