/* QEMU fpu/softfloat.c */
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Generate a denormal-input flush helper: if *a is a denormal, replace it
 * with a zero of the same sign and raise the input_denormal exception.
 * This variant does NOT consult s->flush_inputs_to_zero; callers below do.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* One-operand flush, gated on the flush_inputs_to_zero status flag. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* Two-operand variant of GEN_INPUT_FLUSH1. */
#define GEN_INPUT_FLUSH2(name, soft_t)                             \
    static inline void name(soft_t *a, soft_t *b, float_status *s) \
    {                                                              \
        if (likely(!s->flush_inputs_to_zero)) {                    \
            return;                                                \
        }                                                          \
        soft_t ## _input_flush__nocheck(a, s);                     \
        soft_t ## _input_flush__nocheck(b, s);                     \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* Three-operand variant of GEN_INPUT_FLUSH1. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                        \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                         \
        if (likely(!s->flush_inputs_to_zero)) {                               \
            return;                                                           \
        }                                                                     \
        soft_t ## _input_flush__nocheck(a, s);                                \
        soft_t ## _input_flush__nocheck(b, s);                                \
        soft_t ## _input_flush__nocheck(c, s);                                \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
185 /*
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
189 */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205
206 /*
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211 */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF 0
216 #endif
217
218 /*
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
221 * already set.
222 */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226 IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
typedef union {
    float32 s; /* softfloat view: raw IEEE-754 bit pattern */
    float h;   /* host view of the same bits (used for host-FPU ops) */
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates that decide whether the hardfloat fast path may be taken. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Soft (emulated) and hard (host FPU) implementations of a 2-input op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic hardfloat dispatcher for a 2-input float32 operation.
 *
 * "hard" is the host-FPU implementation, "soft" the softfloat fallback.
 * "pre" decides whether the (flushed) inputs are safe for the hard path;
 * "post" decides whether a tiny hard result must be recomputed in soft.
 * NOTE(review): the overflow deduction below assumes pre() rejects
 * non-finite inputs (as the zon2 checks above do) — confirm for new callers.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs first so both paths see the same operands. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Finite inputs produced an infinity: that is an overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /*
         * Result magnitude at or below the smallest normal: defer to
         * softfloat, which handles underflow/subnormals and their flags.
         */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* double-precision twin of float32_gen2; see comments there. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
458 /*
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
461 */
462
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Convert a FloatClass into a one-bit mask, for set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Common combinations: "inf or zero" and "any kind of NaN". */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
/* Simple helpers for checking if, or what kind of, NaN we have */

/* True for both qNaN and sNaN: every class >= float_class_qnan is a NaN. */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
502 /*
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
505 *
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
509 */
510
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;   /* unbiased exponent (see comment above) */
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

/* 128-bit fraction variant; frac_hi is the most significant word. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;
535
/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 * exp_size: the size of the exponent field
 * exp_bias: the offset applied to the exponent field
 * exp_max: the maximum normalised exponent
 * frac_size: the size of the fraction field
 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of fraction
 * frac_lsb: least significant bit of fraction
 * frac_lsbm1: the bit below the least significant bit (for rounding)
 * round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 * arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/*
 * Expand fields based on the size of exponent and fraction.
 * Note: (-F - 1) & 63 computes (63 - F) modulo 64, so the shift counts
 * below stay in [0, 63] even for F >= 64 (float128's high word).
 */
#define FLOAT_PARAMS(E, F)                                \
    .exp_size = E,                                        \
    .exp_bias = ((1 << E) - 1) >> 1,                      \
    .exp_max = (1 << E) - 1,                              \
    .frac_size = F,                                       \
    .frac_shift = (-F - 1) & 63,                          \
    .frac_lsb = 1ull << ((-F - 1) & 63),                  \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),                \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,          \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same layout as float16, but with ARM Alternative Half Precision rules. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
602
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606 const int f_size = fmt->frac_size;
607 const int e_size = fmt->exp_size;
608
609 *r = (FloatParts64) {
610 .cls = float_class_unclassified,
611 .sign = extract64(raw, f_size + e_size, 1),
612 .exp = extract64(raw, f_size, e_size),
613 .frac = extract64(raw, 0, f_size)
614 };
615 }
616
/* Per-format wrappers around unpack_raw64 (no canonicalization). */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
636
/*
 * Unpack a float128 to parts without canonicalizing.  Sign, exponent and
 * the top fraction bits (112 - 64 = 48 of them) live in f.high; the low
 * 64 fraction bits are f.low verbatim.
 */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}
650
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654 const int f_size = fmt->frac_size;
655 const int e_size = fmt->exp_size;
656 uint64_t ret;
657
658 ret = (uint64_t)p->sign << (f_size + e_size);
659 ret = deposit64(ret, f_size, e_size, p->exp);
660 ret = deposit64(ret, 0, f_size, p->frac);
661 return ret;
662 }
663
/* Per-format wrappers around pack_raw64 (no canonicalization). */
static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
683
/* Pack a float128: mirror of float128_unpack_raw, no canonicalization. */
static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}
695
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705
706 #define PARTS_GENERIC_64_128(NAME, P) \
707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
708
709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
711
712 static void parts64_return_nan(FloatParts64 *a, float_status *s);
713 static void parts128_return_nan(FloatParts128 *a, float_status *s);
714
715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
716
717 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
718 float_status *s);
719 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
720 float_status *s);
721
722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
723
724 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
725 FloatParts64 *c, float_status *s,
726 int ab_mask, int abc_mask);
727 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
728 FloatParts128 *b,
729 FloatParts128 *c,
730 float_status *s,
731 int ab_mask, int abc_mask);
732
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
735
736 static void parts64_canonicalize(FloatParts64 *p, float_status *status,
737 const FloatFmt *fmt);
738 static void parts128_canonicalize(FloatParts128 *p, float_status *status,
739 const FloatFmt *fmt);
740
741 #define parts_canonicalize(A, S, F) \
742 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
743
744 static void parts64_uncanon(FloatParts64 *p, float_status *status,
745 const FloatFmt *fmt);
746 static void parts128_uncanon(FloatParts128 *p, float_status *status,
747 const FloatFmt *fmt);
748
749 #define parts_uncanon(A, S, F) \
750 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
751
752 static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
753 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
754
755 #define parts_add_normal(A, B) \
756 PARTS_GENERIC_64_128(add_normal, A)(A, B)
757
758 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
759 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
760
761 #define parts_sub_normal(A, B) \
762 PARTS_GENERIC_64_128(sub_normal, A)(A, B)
763
764 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
765 float_status *s, bool subtract);
766 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
767 float_status *s, bool subtract);
768
769 #define parts_addsub(A, B, S, Z) \
770 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
771
772 /*
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
774 */
775
776 #define FRAC_GENERIC_64_128(NAME, P) \
777 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
778
779 static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
780 {
781 return uadd64_overflow(a->frac, b->frac, &r->frac);
782 }
783
784 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
785 {
786 bool c = 0;
787 r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
788 r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
789 return c;
790 }
791
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
793
/* r = a + c for a 64-bit constant c; returns true on carry-out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* c is reused as the (0/1) carry out of the low-word addition. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
806
/* Set every fraction bit (assigning -1 to uint64_t yields all ones). */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
818
819 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
820 {
821 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
822 }
823
824 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
825 {
826 uint64_t ta = a->frac_hi, tb = b->frac_hi;
827 if (ta == tb) {
828 ta = a->frac_lo, tb = b->frac_lo;
829 if (ta == tb) {
830 return 0;
831 }
832 }
833 return ta < tb ? -1 : 1;
834 }
835
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
837
/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
849
850 static bool frac64_eqz(FloatParts64 *a)
851 {
852 return a->frac == 0;
853 }
854
855 static bool frac128_eqz(FloatParts128 *a)
856 {
857 return (a->frac_hi | a->frac_lo) == 0;
858 }
859
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
861
862 static void frac64_neg(FloatParts64 *a)
863 {
864 a->frac = -a->frac;
865 }
866
867 static void frac128_neg(FloatParts128 *a)
868 {
869 bool c = 0;
870 a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
871 a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
872 }
873
874 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A)
875
876 static int frac64_normalize(FloatParts64 *a)
877 {
878 if (a->frac) {
879 int shift = clz64(a->frac);
880 a->frac <<= shift;
881 return shift;
882 }
883 return 64;
884 }
885
886 static int frac128_normalize(FloatParts128 *a)
887 {
888 if (a->frac_hi) {
889 int shl = clz64(a->frac_hi);
890 if (shl) {
891 int shr = 64 - shl;
892 a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
893 a->frac_lo = (a->frac_lo << shl);
894 }
895 return shl;
896 } else if (a->frac_lo) {
897 int shl = clz64(a->frac_lo);
898 a->frac_hi = (a->frac_lo << shl);
899 a->frac_lo = 0;
900 return shl + 64;
901 }
902 return 128;
903 }
904
905 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
906
/* Left-shift the fraction by c bits. */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)

/* Right-shift the fraction by c bits, discarding shifted-out bits. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
930
/*
 * Right-shift with "jamming": any bits shifted out are ORed into the
 * least significant bit, preserving inexactness for later rounding.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C)
942
/* R = A - B over the 64-bit fraction; returns true on borrow. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

/* R = A - B over the 128-bit fraction, propagating the borrow from the
 * low word into the high word; returns the final borrow out. */
static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B)
957
958 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
959 #define FloatPartsN glue(FloatParts,N)
960
961 #define N 64
962
963 #include "softfloat-parts-addsub.c.inc"
964 #include "softfloat-parts.c.inc"
965
966 #undef N
967 #define N 128
968
969 #include "softfloat-parts-addsub.c.inc"
970 #include "softfloat-parts.c.inc"
971
972 #undef N
973 #undef partsN
974 #undef FloatPartsN
975
976 /*
977 * Pack/unpack routines with a specific FloatFmt.
978 */
979
/* Unpack the raw float16 F into canonical decomposed form *P, using the
 * caller-supplied format description. */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

/* As above, with the standard float16 format implied. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

/* Unpack the raw bfloat16 F into canonical decomposed form *P. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

/* Round the canonical parts *P and repack as float16 in the
 * caller-supplied format. */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

/* As above, with the standard float16 format implied. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Round the canonical parts *P and repack as bfloat16. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

/* Unpack the raw float32 F into canonical decomposed form *P. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

/* Round the canonical parts *P and repack as float32. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

/* Unpack the raw float64 F into canonical decomposed form *P. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

/* Round the canonical parts *P and repack as float64. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}
1048
1049 /*
1050 * Addition and subtraction
1051 */
1052
/*
 * float16 addition/subtraction: unpack both operands to canonical
 * parts, hand off to the generic parts_addsub, and repack the result.
 */
static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}
1074
/* Softfloat fallback for float32 add/sub, used when the hardfloat
 * fast path below does not apply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

/* Softfloat fallback for float64 add/sub. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}
1118
/*
 * Host-FPU two-operand add/sub, used as the fast path by the
 * hardfloat dispatch machinery below.
 */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;
    return sum;
}

static float hard_f32_sub(float a, float b)
{
    float diff = a - b;
    return diff;
}

static double hard_f64_add(double a, double b)
{
    double sum = a + b;
    return sum;
}

static double hard_f64_sub(double a, double b)
{
    double diff = a - b;
    return diff;
}
1138
1139 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1140 {
1141 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1142 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1143 }
1144 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1145 }
1146
1147 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1148 {
1149 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1150 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1151 } else {
1152 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1153 }
1154 }
1155
/* Dispatch float32 add/sub: try the host-FPU fast path via
 * float32_gen2, falling back to the given softfloat routine. */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

/* Same dispatch for float64. */
static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
1169
/* Public float32/float64 add and sub: hardfloat fast path with
 * softfloat fallback. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1193
/* bfloat16 add/sub via the generic decomposed path (no hardfloat
 * fast path is provided for this format). */
static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}
1215
1216 /*
1217 * Returns the result of multiplying the floating-point values `a' and
1218 * `b'. The operation is performed according to the IEC/IEEE Standard
1219 * for Binary Floating-Point Arithmetic.
1220 */
1221
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 64x64->128-bit product of the fractions. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        /* The product's msb lands either on the implicit-bit position
         * or one below it; bump the exponent or shift up accordingly. */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low 64 bits into the lsb as a sticky bit,
         * so rounding later still sees the inexactness. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf: the result keeps that class, with the
     * product's sign. */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1266
/* float16 multiply via the generic decomposed path. */
float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 multiply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 multiply. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}
1301
/* Host-FPU multiplication, used as the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;
    return prod;
}

static double hard_f64_mul(double a, double b)
{
    double prod = a * b;
    return prod;
}
1311
/* Public float32 multiply: hardfloat fast path with softfloat fallback. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

/* Public float64 multiply: hardfloat fast path with softfloat fallback. */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1325
1326 /*
1327 * Returns the result of multiplying the bfloat16
1328 * values `a' and `b'.
1329 */
1330
/* bfloat16 multiply via the generic decomposed path. */
bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1341
1342 /*
1343 * Returns the result of multiplying the floating-point values `a' and
1344 * `b' then adding 'c', with no intermediate rounding step after the
1345 * multiplication. The operation is performed according to the
1346 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1347 * The flags argument allows the caller to select negation of the
1348 * addend, the intermediate product, or the final result. (The
1349 * difference between this and having the caller do a separate
1350 * negation is that negating externally will flip the sign bit on
1351 * NaNs.)
1352 */
1353
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let us test several class combinations at once. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* (Inf * 0) + c with no NaN inputs: invalid, return the default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    /* Sign of the product a*b, before any requested negation. */
    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product: Inf and zero dominate over normal
     * (the invalid Inf*0 case was already handled above). */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: Inf - Inf is invalid; otherwise the Inf wins. */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    /* Infinite product with finite addend: the product wins. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is just the addend, with the IEEE rule
     * for the sign of 0 + 0 when the signs differ. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two fractions. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend has the larger exponent: align the product. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    /* Carry out: renormalize by one bit. */
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                uint64_t c_hi, c_lo, over;
                /* Product has the larger exponent: align the addend
                 * at 128-bit precision. */
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger;
                 * when the addend wins, the result's sign flips. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            /* Exact cancellation: IEEE yields +0, or -0 when rounding
             * toward negative infinity. */
            if (hi == 0 && lo == 0) {
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the discarded low word into the lsb as a sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1527
/* float16 fused multiply-add via the generic decomposed path. */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 fused multiply-add. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 fused multiply-add. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}
1568
/* When true, always take the softfloat path for fma.  NOTE(review): no
 * writer is visible in this chunk -- confirm where it is set. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* Hardfloat preconditions on the FP status must hold. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The halve-result variant has no host-FPU equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* All three inputs must be zero or normal for the fast path. */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build a correctly-signed zero product and add the addend. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may be subnormal/underflowing: redo in softfloat
             * with the original (un-negated) inputs. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1641
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* Hardfloat preconditions on the FP status must hold. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The halve-result variant has no host-FPU equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* All three inputs must be zero or normal for the fast path. */
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build a correctly-signed zero product and add the addend. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): FLT_MIN (not DBL_MIN) makes this threshold
             * very conservative for doubles -- any |result| <= ~1.18e-38
             * falls back to softfloat.  Behavior is correct, but confirm
             * whether DBL_MIN was intended. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1712
1713 /*
1714 * Returns the result of multiplying the bfloat16 values `a'
1715 * and `b' then adding 'c', with no intermediate rounding step after the
1716 * multiplication.
1717 */
1718
/* bfloat16 fused multiply-add via the generic decomposed path. */
bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1731
1732 /*
1733 * Returns the result of dividing the floating-point value `a' by the
1734 * corresponding value `b'. The operation is performed according to
1735 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1736 */
1737
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x: result keeps the dividend's class. */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}
1803
/* float16 division via the generic decomposed path. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 division. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 division. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}
1838
/* Host-FPU division, used as the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    float quot = a / b;
    return quot;
}

static double hard_f64_div(double a, double b)
{
    double quot = a / b;
    return quot;
}
1848
/* Hardfloat pre-check for division: take the fast path only when the
 * first operand is zero or normal and the second is normal. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Hardfloat post-check for division: accept unless 'a' is zero.
 * 'b' is unused but required by the shared check-fn signature.
 * NOTE(review): confirm against float32_gen2 which operand pair the
 * post callback actually receives. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}
1882
/* Public float32 division: hardfloat fast path with softfloat fallback. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

/* Public float64 division: hardfloat fast path with softfloat fallback. */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
1896
1897 /*
1898 * Returns the result of dividing the bfloat16
1899 * value `a' by the corresponding value `b'.
1900 */
1901
/* bfloat16 division via the generic decomposed path. */
bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}
1912
1913 /*
1914 * Float to Float conversions
1915 *
1916 * Returns the result of converting one float format to another. The
1917 * conversion is performed according to the IEC/IEEE Standard for
1918 * Binary Floating-Point Arithmetic.
1919 *
1920 * The float_to_float helper only needs to take care of raising
1921 * invalid exceptions and handling the conversion on NaNs.
1922 */
1923
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        /* Arm alternative half-precision destination: the format has
         * no Inf or NaN encodings, so those inputs need special care. */
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        /* NaN input: let the target-specific NaN handling produce
         * the result. */
        parts_return_nan(&a, s);
    }
    return a;
}
1958
/* Widen float16 to float32; 'ieee' selects IEEE half vs. the Arm
 * alternative half-precision input layout. */
float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

/* Widen float16 to float64; 'ieee' selects the input layout as above. */
float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

/* Narrow float32 to float16; 'ieee' selects the output layout. */
float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Slow path for float32->float64, used for non-normal, non-zero inputs. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results. */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Neither normal nor zero: take the generic path. */
        return soft_float32_to_float64(a, s);
    }
}

/* Narrow float64 to float16; 'ieee' selects the output layout. */
float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Narrow float64 to float32. */
float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

/* Widen bfloat16 to float32. */
float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

/* Widen bfloat16 to float64. */
float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

/* Narrow float32 to bfloat16. */
bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/* Narrow float64 to bfloat16. */
bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}
2069
2070 /*
2071 * Rounds the floating-point value `a' to an integer, and returns the
2072 * result as a floating-point value. The operation is performed
2073 * according to the IEC/IEEE Standard for Binary Floating-Point
2074 * Arithmetic.
2075 */
2076
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so the exponent arithmetic cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is either 0 or 1, chosen per
             * the rounding mode. */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* 1 only when strictly above one half. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* 1 when at least one half. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb is the weight of one integer unit within the
             * fraction; bits below it are the fractional part. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Exactly half rounds to even: do not increment then. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                /* Carry out of the top bit: renormalize by one. */
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the now-rounded-away fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2178
2179 float16 float16_round_to_int(float16 a, float_status *s)
2180 {
2181 FloatParts64 pa, pr;
2182
2183 float16_unpack_canonical(&pa, a, s);
2184 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2185 return float16_round_pack_canonical(&pr, s);
2186 }
2187
2188 float32 float32_round_to_int(float32 a, float_status *s)
2189 {
2190 FloatParts64 pa, pr;
2191
2192 float32_unpack_canonical(&pa, a, s);
2193 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2194 return float32_round_pack_canonical(&pr, s);
2195 }
2196
2197 float64 float64_round_to_int(float64 a, float_status *s)
2198 {
2199 FloatParts64 pa, pr;
2200
2201 float64_unpack_canonical(&pa, a, s);
2202 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2203 return float64_round_pack_canonical(&pr, s);
2204 }
2205
2206 /*
2207 * Rounds the bfloat16 value `a' to an integer, and returns the
2208 * result as a bfloat16 value.
2209 */
2210
2211 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2212 {
2213 FloatParts64 pa, pr;
2214
2215 bfloat16_unpack_canonical(&pa, a, s);
2216 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2217 return bfloat16_round_pack_canonical(&pr, s);
2218 }
2219
2220 /*
2221 * Returns the result of converting the floating-point value `a' to
2222 * the two's complement integer format. The conversion is performed
2223 * according to the IEC/IEEE Standard for Binary Floating-Point
2224 * Arithmetic---which means in particular that the conversion is
2225 * rounded according to the current rounding mode. If `a' is a NaN,
2226 * the largest positive integer is returned. Otherwise, if the
2227 * conversion overflows, the largest integer with the same sign as `a'
2228 * is returned.
2229 */
2230
/*
 * Round `in' to an integral value with round_to_int(), then convert
 * to a signed integer saturated to [min, max].  NaNs return max,
 * infinities return the bound matching their sign.  On any invalid
 * conversion the flags accumulated during rounding are replaced by
 * `invalid' (the orig_flags snapshot drops a now-meaningless inexact).
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* snapshot so flags raised by rounding can be discarded on overflow */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* shift the integral value down to bit 0 */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* magnitude too large for 64 bits: force the overflow path */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /*
             * -(uint64_t)min is the magnitude of the most negative
             * representable value; comparing as unsigned avoids any
             * signed-overflow UB.
             */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2274
2275 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2276 float_status *s)
2277 {
2278 FloatParts64 p;
2279
2280 float16_unpack_canonical(&p, a, s);
2281 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2282 }
2283
2284 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2285 float_status *s)
2286 {
2287 FloatParts64 p;
2288
2289 float16_unpack_canonical(&p, a, s);
2290 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2291 }
2292
2293 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2294 float_status *s)
2295 {
2296 FloatParts64 p;
2297
2298 float16_unpack_canonical(&p, a, s);
2299 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2300 }
2301
2302 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2303 float_status *s)
2304 {
2305 FloatParts64 p;
2306
2307 float16_unpack_canonical(&p, a, s);
2308 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2309 }
2310
2311 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2312 float_status *s)
2313 {
2314 FloatParts64 p;
2315
2316 float32_unpack_canonical(&p, a, s);
2317 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2318 }
2319
2320 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2321 float_status *s)
2322 {
2323 FloatParts64 p;
2324
2325 float32_unpack_canonical(&p, a, s);
2326 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2327 }
2328
2329 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2330 float_status *s)
2331 {
2332 FloatParts64 p;
2333
2334 float32_unpack_canonical(&p, a, s);
2335 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2336 }
2337
2338 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2339 float_status *s)
2340 {
2341 FloatParts64 p;
2342
2343 float64_unpack_canonical(&p, a, s);
2344 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2345 }
2346
2347 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2348 float_status *s)
2349 {
2350 FloatParts64 p;
2351
2352 float64_unpack_canonical(&p, a, s);
2353 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2354 }
2355
2356 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2357 float_status *s)
2358 {
2359 FloatParts64 p;
2360
2361 float64_unpack_canonical(&p, a, s);
2362 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2363 }
2364
/*
 * Signed-integer conversions using the rounding mode currently
 * selected in *s, with no exponent scaling.
 */

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
2414
/*
 * Signed-integer conversions that always truncate (round toward
 * zero), regardless of the rounding mode in *s.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2459
2460 /*
2461 * Returns the result of converting the floating-point value `a' to
2462 * the two's complement integer format.
2463 */
2464
2465 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2466 float_status *s)
2467 {
2468 FloatParts64 p;
2469
2470 bfloat16_unpack_canonical(&p, a, s);
2471 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2472 }
2473
2474 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2475 float_status *s)
2476 {
2477 FloatParts64 p;
2478
2479 bfloat16_unpack_canonical(&p, a, s);
2480 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2481 }
2482
2483 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2484 float_status *s)
2485 {
2486 FloatParts64 p;
2487
2488 bfloat16_unpack_canonical(&p, a, s);
2489 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2490 }
2491
/*
 * bfloat16 signed-integer conversions: first three use the current
 * rounding mode from *s, the rest always truncate toward zero.
 */

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2521
2522 /*
2523 * Returns the result of converting the floating-point value `a' to
2524 * the unsigned integer format. The conversion is performed according
2525 * to the IEC/IEEE Standard for Binary Floating-Point
2526 * Arithmetic---which means in particular that the conversion is
2527 * rounded according to the current rounding mode. If `a' is a NaN,
2528 * the largest unsigned integer is returned. Otherwise, if the
2529 * conversion overflows, the largest unsigned integer is returned. If
 * the `a' is negative, the result is rounded and zero is returned;
 * negative values that do not round to zero will raise the invalid
 * exception flag.
2533 */
2534
/*
 * Round `in' to an integral value with round_to_int(), then convert
 * to an unsigned integer saturated to [0, max].  NaNs return max and
 * negative values return 0; in both invalid cases the flags raised
 * during rounding are replaced by `invalid' via the orig_flags
 * snapshot.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* snapshot so flags raised by rounding can be discarded on overflow */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* still negative (i.e. nonzero) after rounding: invalid */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* shift the integral value down to bit 0 */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2579
2580 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2581 float_status *s)
2582 {
2583 FloatParts64 p;
2584
2585 float16_unpack_canonical(&p, a, s);
2586 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2587 }
2588
2589 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2590 float_status *s)
2591 {
2592 FloatParts64 p;
2593
2594 float16_unpack_canonical(&p, a, s);
2595 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2596 }
2597
2598 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2599 float_status *s)
2600 {
2601 FloatParts64 p;
2602
2603 float16_unpack_canonical(&p, a, s);
2604 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2605 }
2606
2607 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2608 float_status *s)
2609 {
2610 FloatParts64 p;
2611
2612 float16_unpack_canonical(&p, a, s);
2613 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2614 }
2615
2616 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2617 float_status *s)
2618 {
2619 FloatParts64 p;
2620
2621 float32_unpack_canonical(&p, a, s);
2622 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2623 }
2624
2625 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2626 float_status *s)
2627 {
2628 FloatParts64 p;
2629
2630 float32_unpack_canonical(&p, a, s);
2631 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2632 }
2633
2634 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2635 float_status *s)
2636 {
2637 FloatParts64 p;
2638
2639 float32_unpack_canonical(&p, a, s);
2640 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2641 }
2642
2643 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2644 float_status *s)
2645 {
2646 FloatParts64 p;
2647
2648 float64_unpack_canonical(&p, a, s);
2649 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2650 }
2651
2652 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2653 float_status *s)
2654 {
2655 FloatParts64 p;
2656
2657 float64_unpack_canonical(&p, a, s);
2658 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2659 }
2660
2661 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2662 float_status *s)
2663 {
2664 FloatParts64 p;
2665
2666 float64_unpack_canonical(&p, a, s);
2667 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2668 }
2669
/*
 * Unsigned-integer conversions using the rounding mode currently
 * selected in *s, with no exponent scaling.
 */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
2719
/*
 * Unsigned-integer conversions that always truncate (round toward
 * zero), regardless of the rounding mode in *s.
 */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2764
2765 /*
2766 * Returns the result of converting the bfloat16 value `a' to
2767 * the unsigned integer format.
2768 */
2769
2770 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2771 int scale, float_status *s)
2772 {
2773 FloatParts64 p;
2774
2775 bfloat16_unpack_canonical(&p, a, s);
2776 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2777 }
2778
2779 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2780 int scale, float_status *s)
2781 {
2782 FloatParts64 p;
2783
2784 bfloat16_unpack_canonical(&p, a, s);
2785 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2786 }
2787
2788 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2789 int scale, float_status *s)
2790 {
2791 FloatParts64 p;
2792
2793 bfloat16_unpack_canonical(&p, a, s);
2794 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2795 }
2796
/*
 * bfloat16 unsigned-integer conversions: first three use the current
 * rounding mode from *s, the rest always truncate toward zero.
 */

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2826
2827 /*
2828 * Integer to float conversions
2829 *
2830 * Returns the result of converting the two's complement integer `a'
2831 * to the floating-point format. The conversion is performed according
2832 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 */
2834
2835 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2836 {
2837 FloatParts64 r = { .sign = false };
2838
2839 if (a == 0) {
2840 r.cls = float_class_zero;
2841 } else {
2842 uint64_t f = a;
2843 int shift;
2844
2845 r.cls = float_class_normal;
2846 if (a < 0) {
2847 f = -f;
2848 r.sign = true;
2849 }
2850 shift = clz64(f);
2851 scale = MIN(MAX(scale, -0x10000), 0x10000);
2852
2853 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2854 r.frac = f << shift;
2855 }
2856
2857 return r;
2858 }
2859
/*
 * Signed integer to float16: the int64 variant is the real
 * implementation; narrower inputs are widened (value-preserving)
 * and share it.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
2895
/*
 * Signed integer to float32: the int64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
2926
/*
 * Signed integer to float64: the int64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
2957
2958 /*
2959 * Returns the result of converting the two's complement integer `a'
2960 * to the bfloat16 format.
2961 */
2962
/*
 * Signed integer to bfloat16: the int64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
2993
2994 /*
2995 * Unsigned Integer to float conversions
2996 *
2997 * Returns the result of converting the unsigned integer `a' to the
2998 * floating-point format. The conversion is performed according to the
2999 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000 */
3001
3002 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3003 {
3004 FloatParts64 r = { .sign = false };
3005 int shift;
3006
3007 if (a == 0) {
3008 r.cls = float_class_zero;
3009 } else {
3010 scale = MIN(MAX(scale, -0x10000), 0x10000);
3011 shift = clz64(a);
3012 r.cls = float_class_normal;
3013 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3014 r.frac = a << shift;
3015 }
3016
3017 return r;
3018 }
3019
/*
 * Unsigned integer to float16: the uint64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
3055
/*
 * Unsigned integer to float32: the uint64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
3086
/*
 * Unsigned integer to float64: the uint64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3117
3118 /*
3119 * Returns the result of converting the unsigned integer `a' to the
3120 * bfloat16 format.
3121 */
3122
/*
 * Unsigned integer to bfloat16: the uint64 variant is the real
 * implementation; narrower inputs are widened and share it.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3153
3154 /* Float Min/Max */
3155 /* min() and max() functions. These can't be implemented as
3156 * 'compare and pick one input' because that would mishandle
3157 * NaNs and +0 vs -0.
3158 *
3159 * minnum() and maxnum() functions. These are similar to the min()
3160 * and max() functions but if one of the arguments is a QNaN and
3161 * the other is numerical then the numerical argument is returned.
3162 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3164 * and maxNum() operations. min() and max() are the typical min/max
3165 * semantics provided by many CPUs which predate that specification.
3166 *
3167 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3169 */
/*
 * Common implementation for the min/max family.  `ismin' selects min
 * vs max, `ieee' selects IEEE-754 2008 minNum/maxNum NaN handling
 * (prefer the numerical operand over a quiet NaN), and `ismag'
 * compares magnitudes first (minNumMag/maxNumMag), falling back to
 * the signed comparison when the magnitudes are equal.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* exactly one quiet NaN: return the numerical operand */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* non-ieee semantics, or both operands are NaN */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /*
         * Map each class onto a comparable exponent: infinities
         * order above all normals, zeros below all normals, so a
         * plain (exp, frac) comparison orders the magnitudes.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison (signs ignored); equal magnitudes
         * fall through to the sign-aware comparison below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: a larger magnitude means a larger value for
             * positives but a smaller value for negatives, hence the
             * extra a.sign flip. */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs (this also orders -0 below +0): the
             * negative operand is the smaller one. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3240
3241 #define MINMAX(sz, name, ismin, isiee, ismag) \
3242 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3243 float_status *s) \
3244 { \
3245 FloatParts64 pa, pb, pr; \
3246 float ## sz ## _unpack_canonical(&pa, a, s); \
3247 float ## sz ## _unpack_canonical(&pb, b, s); \
3248 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3249 return float ## sz ## _round_pack_canonical(&pr, s); \
3250 }
3251
3252 MINMAX(16, min, true, false, false)
3253 MINMAX(16, minnum, true, true, false)
3254 MINMAX(16, minnummag, true, true, true)
3255 MINMAX(16, max, false, false, false)
3256 MINMAX(16, maxnum, false, true, false)
3257 MINMAX(16, maxnummag, false, true, true)
3258
3259 MINMAX(32, min, true, false, false)
3260 MINMAX(32, minnum, true, true, false)
3261 MINMAX(32, minnummag, true, true, true)
3262 MINMAX(32, max, false, false, false)
3263 MINMAX(32, maxnum, false, true, false)
3264 MINMAX(32, maxnummag, false, true, true)
3265
3266 MINMAX(64, min, true, false, false)
3267 MINMAX(64, minnum, true, true, false)
3268 MINMAX(64, minnummag, true, true, true)
3269 MINMAX(64, max, false, false, false)
3270 MINMAX(64, maxnum, false, true, false)
3271 MINMAX(64, maxnummag, false, true, true)
3272
3273 #undef MINMAX
3274
3275 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3276 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3277 { \
3278 FloatParts64 pa, pb, pr; \
3279 bfloat16_unpack_canonical(&pa, a, s); \
3280 bfloat16_unpack_canonical(&pb, b, s); \
3281 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3282 return bfloat16_round_pack_canonical(&pr, s); \
3283 }
3284
3285 BF16_MINMAX(min, true, false, false)
3286 BF16_MINMAX(minnum, true, true, false)
3287 BF16_MINMAX(minnummag, true, true, true)
3288 BF16_MINMAX(max, false, false, false)
3289 BF16_MINMAX(maxnum, false, true, false)
3290 BF16_MINMAX(maxnummag, false, true, true)
3291
3292 #undef BF16_MINMAX
3293
3294 /* Floating point compare */
3295 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3296 float_status *s)
3297 {
3298 if (is_nan(a.cls) || is_nan(b.cls)) {
3299 if (!is_quiet ||
3300 a.cls == float_class_snan ||
3301 b.cls == float_class_snan) {
3302 float_raise(float_flag_invalid, s);
3303 }
3304 return float_relation_unordered;
3305 }
3306
3307 if (a.cls == float_class_zero) {
3308 if (b.cls == float_class_zero) {
3309 return float_relation_equal;
3310 }
3311 return b.sign ? float_relation_greater : float_relation_less;
3312 } else if (b.cls == float_class_zero) {
3313 return a.sign ? float_relation_less : float_relation_greater;
3314 }
3315
3316 /* The only really important thing about infinity is its sign. If
3317 * both are infinities the sign marks the smallest of the two.
3318 */
3319 if (a.cls == float_class_inf) {
3320 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3321 return float_relation_equal;
3322 }
3323 return a.sign ? float_relation_less : float_relation_greater;
3324 } else if (b.cls == float_class_inf) {
3325 return b.sign ? float_relation_greater : float_relation_less;
3326 }
3327
3328 if (a.sign != b.sign) {
3329 return a.sign ? float_relation_less : float_relation_greater;
3330 }
3331
3332 if (a.exp == b.exp) {
3333 if (a.frac == b.frac) {
3334 return float_relation_equal;
3335 }
3336 if (a.sign) {
3337 return a.frac > b.frac ?
3338 float_relation_less : float_relation_greater;
3339 } else {
3340 return a.frac > b.frac ?
3341 float_relation_greater : float_relation_less;
3342 }
3343 } else {
3344 if (a.sign) {
3345 return a.exp > b.exp ? float_relation_less : float_relation_greater;
3346 } else {
3347 return a.exp > b.exp ? float_relation_greater : float_relation_less;
3348 }
3349 }
3350 }
3351
3352 #define COMPARE(name, attr, sz) \
3353 static int attr \
3354 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3355 { \
3356 FloatParts64 pa, pb; \
3357 float ## sz ## _unpack_canonical(&pa, a, s); \
3358 float ## sz ## _unpack_canonical(&pb, b, s); \
3359 return compare_floats(pa, pb, is_quiet, s); \
3360 }
3361
3362 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3363 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3364 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3365
3366 #undef COMPARE
3367
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    /* Signaling compare: any NaN operand raises float_flag_invalid. */
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    /* Quiet compare: only signaling NaNs raise float_flag_invalid. */
    return soft_f16_compare(a, b, true, s);
}
3377
/*
 * Hardfloat fast path for float32 compare.  Ordered results come from
 * the host FPU via the C99 quiet comparison macros (which do not trap
 * on quiet NaNs); unordered operands fall back to the softfloat path
 * so QEMU's own status flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3406
FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    /* Signaling compare: any NaN operand raises float_flag_invalid. */
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    /* Quiet compare: only signaling NaNs raise float_flag_invalid. */
    return f32_compare(a, b, true, s);
}
3416
/*
 * Hardfloat fast path for float64 compare.  Ordered results come from
 * the host FPU via the C99 quiet comparison macros (which do not trap
 * on quiet NaNs); unordered operands fall back to the softfloat path
 * so QEMU's own status flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3445
FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    /* Signaling compare: any NaN operand raises float_flag_invalid. */
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    /* Quiet compare: only signaling NaNs raise float_flag_invalid. */
    return f64_compare(a, b, true, s);
}
3455
3456 static FloatRelation QEMU_FLATTEN
3457 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3458 {
3459 FloatParts64 pa, pb;
3460
3461 bfloat16_unpack_canonical(&pa, a, s);
3462 bfloat16_unpack_canonical(&pb, b, s);
3463 return compare_floats(pa, pb, is_quiet, s);
3464 }
3465
FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    /* Signaling compare: any NaN operand raises float_flag_invalid. */
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    /* Quiet compare: only signaling NaNs raise float_flag_invalid. */
    return soft_bf16_compare(a, b, true, s);
}
3475
3476 /* Multiply A by 2 raised to the power N. */
3477 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3478 {
3479 if (unlikely(is_nan(a.cls))) {
3480 parts_return_nan(&a, s);
3481 }
3482 if (a.cls == float_class_normal) {
3483 /* The largest float type (even though not supported by FloatParts64)
3484 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3485 * still allows rounding to infinity, without allowing overflow
3486 * within the int32_t that backs FloatParts64.exp.
3487 */
3488 n = MIN(MAX(n, -0x10000), 0x10000);
3489 a.exp += n;
3490 }
3491 return a;
3492 }
3493
3494 float16 float16_scalbn(float16 a, int n, float_status *status)
3495 {
3496 FloatParts64 pa, pr;
3497
3498 float16_unpack_canonical(&pa, a, status);
3499 pr = scalbn_decomposed(pa, n, status);
3500 return float16_round_pack_canonical(&pr, status);
3501 }
3502
3503 float32 float32_scalbn(float32 a, int n, float_status *status)
3504 {
3505 FloatParts64 pa, pr;
3506
3507 float32_unpack_canonical(&pa, a, status);
3508 pr = scalbn_decomposed(pa, n, status);
3509 return float32_round_pack_canonical(&pr, status);
3510 }
3511
3512 float64 float64_scalbn(float64 a, int n, float_status *status)
3513 {
3514 FloatParts64 pa, pr;
3515
3516 float64_unpack_canonical(&pa, a, status);
3517 pr = scalbn_decomposed(pa, n, status);
3518 return float64_round_pack_canonical(&pr, status);
3519 }
3520
3521 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3522 {
3523 FloatParts64 pa, pr;
3524
3525 bfloat16_unpack_canonical(&pa, a, status);
3526 pr = scalbn_decomposed(pa, n, status);
3527 return bfloat16_round_pack_canonical(&pr, status);
3528 }
3529
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    /* NaN input: propagate/quieten via the usual NaN machinery. */
    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    /* Negative non-zero (including -inf): invalid, default NaN. */
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;  /* root bits accumulated so far */
    s_frac = 0;  /* partial-root accumulator used in the trial subtraction */

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            /* This root bit is set: update remainder and partial root. */
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3601
3602 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3603 {
3604 FloatParts64 pa, pr;
3605
3606 float16_unpack_canonical(&pa, a, status);
3607 pr = sqrt_float(pa, status, &float16_params);
3608 return float16_round_pack_canonical(&pr, status);
3609 }
3610
3611 static float32 QEMU_SOFTFLOAT_ATTR
3612 soft_f32_sqrt(float32 a, float_status *status)
3613 {
3614 FloatParts64 pa, pr;
3615
3616 float32_unpack_canonical(&pa, a, status);
3617 pr = sqrt_float(pa, status, &float32_params);
3618 return float32_round_pack_canonical(&pr, status);
3619 }
3620
3621 static float64 QEMU_SOFTFLOAT_ATTR
3622 soft_f64_sqrt(float64 a, float_status *status)
3623 {
3624 FloatParts64 pa, pr;
3625
3626 float64_unpack_canonical(&pa, a, status);
3627 pr = sqrt_float(pa, status, &float64_params);
3628 return float64_round_pack_canonical(&pr, status);
3629 }
3630
/*
 * Hardfloat fast path for float32 sqrt: only non-negative zero or
 * normal inputs use the host sqrtf(); NaNs, infinities, negatives
 * (including -0) and denormals take the soft path.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify with the host's fpclassify/signbit. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3657
/*
 * Hardfloat fast path for float64 sqrt: only non-negative zero or
 * normal inputs use the host sqrt(); NaNs, infinities, negatives
 * (including -0) and denormals take the soft path.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify with the host's fpclassify/signbit. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3684
3685 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3686 {
3687 FloatParts64 pa, pr;
3688
3689 bfloat16_unpack_canonical(&pa, a, status);
3690 pr = sqrt_float(pa, status, &bfloat16_params);
3691 return bfloat16_round_pack_canonical(&pr, status);
3692 }
3693
3694 /*----------------------------------------------------------------------------
3695 | The pattern for a default generated NaN.
3696 *----------------------------------------------------------------------------*/
3697
3698 float16 float16_default_nan(float_status *status)
3699 {
3700 FloatParts64 p;
3701
3702 parts_default_nan(&p, status);
3703 p.frac >>= float16_params.frac_shift;
3704 return float16_pack_raw(&p);
3705 }
3706
3707 float32 float32_default_nan(float_status *status)
3708 {
3709 FloatParts64 p;
3710
3711 parts_default_nan(&p, status);
3712 p.frac >>= float32_params.frac_shift;
3713 return float32_pack_raw(&p);
3714 }
3715
3716 float64 float64_default_nan(float_status *status)
3717 {
3718 FloatParts64 p;
3719
3720 parts_default_nan(&p, status);
3721 p.frac >>= float64_params.frac_shift;
3722 return float64_pack_raw(&p);
3723 }
3724
3725 float128 float128_default_nan(float_status *status)
3726 {
3727 FloatParts128 p;
3728
3729 parts_default_nan(&p, status);
3730 frac_shr(&p, float128_params.frac_shift);
3731 return float128_pack_raw(&p);
3732 }
3733
3734 bfloat16 bfloat16_default_nan(float_status *status)
3735 {
3736 FloatParts64 p;
3737
3738 parts_default_nan(&p, status);
3739 p.frac >>= bfloat16_params.frac_shift;
3740 return bfloat16_pack_raw(&p);
3741 }
3742
3743 /*----------------------------------------------------------------------------
3744 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3745 *----------------------------------------------------------------------------*/
3746
3747 float16 float16_silence_nan(float16 a, float_status *status)
3748 {
3749 FloatParts64 p;
3750
3751 float16_unpack_raw(&p, a);
3752 p.frac <<= float16_params.frac_shift;
3753 parts_silence_nan(&p, status);
3754 p.frac >>= float16_params.frac_shift;
3755 return float16_pack_raw(&p);
3756 }
3757
3758 float32 float32_silence_nan(float32 a, float_status *status)
3759 {
3760 FloatParts64 p;
3761
3762 float32_unpack_raw(&p, a);
3763 p.frac <<= float32_params.frac_shift;
3764 parts_silence_nan(&p, status);
3765 p.frac >>= float32_params.frac_shift;
3766 return float32_pack_raw(&p);
3767 }
3768
3769 float64 float64_silence_nan(float64 a, float_status *status)
3770 {
3771 FloatParts64 p;
3772
3773 float64_unpack_raw(&p, a);
3774 p.frac <<= float64_params.frac_shift;
3775 parts_silence_nan(&p, status);
3776 p.frac >>= float64_params.frac_shift;
3777 return float64_pack_raw(&p);
3778 }
3779
3780 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3781 {
3782 FloatParts64 p;
3783
3784 bfloat16_unpack_raw(&p, a);
3785 p.frac <<= bfloat16_params.frac_shift;
3786 parts_silence_nan(&p, status);
3787 p.frac >>= bfloat16_params.frac_shift;
3788 return bfloat16_pack_raw(&p);
3789 }
3790
3791 float128 float128_silence_nan(float128 a, float_status *status)
3792 {
3793 FloatParts128 p;
3794
3795 float128_unpack_raw(&p, a);
3796 frac_shl(&p, float128_params.frac_shift);
3797 parts_silence_nan(&p, status);
3798 frac_shr(&p, float128_params.frac_shift);
3799 return float128_pack_raw(&p);
3800 }
3801
3802 /*----------------------------------------------------------------------------
3803 | If `a' is denormal and we are in flush-to-zero mode then set the
3804 | input-denormal exception and return zero. Otherwise just return the value.
3805 *----------------------------------------------------------------------------*/
3806
3807 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3808 {
3809 if (p.exp == 0 && p.frac != 0) {
3810 float_raise(float_flag_input_denormal, status);
3811 return true;
3812 }
3813
3814 return false;
3815 }
3816
3817 float16 float16_squash_input_denormal(float16 a, float_status *status)
3818 {
3819 if (status->flush_inputs_to_zero) {
3820 FloatParts64 p;
3821
3822 float16_unpack_raw(&p, a);
3823 if (parts_squash_denormal(p, status)) {
3824 return float16_set_sign(float16_zero, p.sign);
3825 }
3826 }
3827 return a;
3828 }
3829
3830 float32 float32_squash_input_denormal(float32 a, float_status *status)
3831 {
3832 if (status->flush_inputs_to_zero) {
3833 FloatParts64 p;
3834
3835 float32_unpack_raw(&p, a);
3836 if (parts_squash_denormal(p, status)) {
3837 return float32_set_sign(float32_zero, p.sign);
3838 }
3839 }
3840 return a;
3841 }
3842
3843 float64 float64_squash_input_denormal(float64 a, float_status *status)
3844 {
3845 if (status->flush_inputs_to_zero) {
3846 FloatParts64 p;
3847
3848 float64_unpack_raw(&p, a);
3849 if (parts_squash_denormal(p, status)) {
3850 return float64_set_sign(float64_zero, p.sign);
3851 }
3852 }
3853 return a;
3854 }
3855
3856 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3857 {
3858 if (status->flush_inputs_to_zero) {
3859 FloatParts64 p;
3860
3861 bfloat16_unpack_raw(&p, a);
3862 if (parts_squash_denormal(p, status)) {
3863 return bfloat16_set_sign(bfloat16_zero, p.sign);
3864 }
3865 }
3866 return a;
3867 }
3868
/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
| input.  If `zSign' is 1, the input is negated before being converted to an
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer.  However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 0x40 is a half-ulp of the 7 fraction bits; 0x7f rounds away. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only if the result's low bit would otherwise be 0. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie with nearest-even: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range if high bits survived the shift, or the
     * truncation to int32 flipped the expected sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3927
/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* `absZ1' holds the bits below the binary point; decide whether they
     * push the integer part up by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top fraction bit set: at least half way to the next integer. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Carry out of the top word: magnitude exceeds 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie with nearest-even: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* A sign mismatch after negation means the int64 range was exceeded. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3989
/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
|
| NOTE(review): the return type is int64_t even though the value produced is
| an unsigned 64-bit quantity (UINT64_MAX on overflow); callers convert it
| back to uint64_t.  Changing the type to uint64_t looks safe — confirm all
| callers before doing so.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* `absZ1' holds the bits below the binary point. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top fraction bit set: at least half way to the next integer. */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Carry out of the top word: result exceeds UINT64_MAX. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie with nearest-even: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4049
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift the leading set bit up into the implicit-bit position. */
    int8_t shiftCount = clz32(aSig) - 8;

    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4067
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 0x40 is a half-ulp of the 7 extra fraction bits; 0x7f rounds away. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only if the result's low significand bit would be 0. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The (uint16_t) cast makes a negative zExp compare large, so this
     * one test funnels both overflow and underflow into the slow path.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd, and any mode whose increment is zero,
             * overflow to the maximum finite value instead of infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* packFloat32() adds its fields (like packFloat64() below),
             * so an all-ones significand (-1) wraps exponent 0xFF down
             * to the maximum finite number, while 0 yields infinity.
             */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tiny before rounding, or tiny after: the latter holds
             * unless zExp == -1 and rounding carries the significand
             * up into bit 31 (the minimum normal).
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie with nearest-even: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4166
4167 /*----------------------------------------------------------------------------
4168 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4169 | and significand `zSig', and returns the proper single-precision floating-
4170 | point value corresponding to the abstract input. This routine is just like
4171 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4172 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4173 | floating-point exponent.
4174 *----------------------------------------------------------------------------*/
4175
4176 static float32
4177 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4178 float_status *status)
4179 {
4180 int8_t shiftCount;
4181
4182 shiftCount = clz32(zSig) - 1;
4183 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4184 status);
4185
4186 }
4187
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift the leading set bit up into the implicit-bit position. */
    int8_t shiftCount = clz64(aSig) - 11;

    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4205
4206 /*----------------------------------------------------------------------------
4207 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4208 | double-precision floating-point value, returning the result. After being
4209 | shifted into the proper positions, the three fields are simply added
4210 | together to form the result. This means that any integer portion of `zSig'
4211 | will be added into the exponent. Since a properly normalized significand
4212 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4213 | than the desired result exponent whenever `zSig' is a complete, normalized
4214 | significand.
4215 *----------------------------------------------------------------------------*/
4216
4217 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4218 {
4219
4220 return make_float64(
4221 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4222
4223 }
4224
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * The result's LSB is bit 10 of zSig; rounding adds roundIncrement
     * just below it.  0x200 is half an ulp (round-to-nearest); 0x3ff is
     * one ulp minus one (round toward the chosen direction).
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the result's LSB (bit 10) would be 0. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Unsigned compare catches both zExp too large and zExp negative. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow.  Round-to-odd never overflows to infinity; other
             * modes produce infinity only when rounding moves away from
             * zero (roundIncrement != 0), else the maximal finite value.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            /* Result is subnormal (or zero after flushing). */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess may be detected before or after rounding,
             * depending on the target's configuration. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Apply the rounding increment and drop the 10 extra bits. */
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie in nearest-even mode: clear the LSB to make it even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4322
4323 /*----------------------------------------------------------------------------
4324 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4325 | and significand `zSig', and returns the proper double-precision floating-
4326 | point value corresponding to the abstract input. This routine is just like
4327 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4328 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4329 | floating-point exponent.
4330 *----------------------------------------------------------------------------*/
4331
4332 static float64
4333 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4334 float_status *status)
4335 {
4336 int8_t shiftCount;
4337
4338 shiftCount = clz64(zSig) - 1;
4339 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4340 status);
4341
4342 }
4343
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized
| exponent is stored at `zExpPtr' and the normalized significand at
| `zSigPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* The x80 significand has an explicit integer bit at bit 63, so the
     * leading set bit is shifted all the way to the top. */
    int shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4360
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced-precision (x87 PC=0/1 style) path: round to 24 or 53
     * significant bits within the 64-bit significand.  roundIncrement is
     * half an ulp at the chosen precision; roundMask covers the bits that
     * will be discarded.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold any bits in the low word into the sticky bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare catches both overflow range and zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* Subnormal result at reduced precision. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit (bit 63),
             * making the result normal again with exponent 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Nearest-even tie: also clear the result's LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full 64-bit-significand path: zSig1 holds the guard/round/sticky
     * bits; `increment' decides whether zSig0 is bumped by one ulp.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed rounding toward zero yields the maximal finite
             * value; otherwise the result is infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Subnormal result at full precision. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the increment: the shift above changed zSig1. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: force the LSB even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Carry into bit 63 makes the result normal again. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize upward. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4572
4573 /*----------------------------------------------------------------------------
4574 | Takes an abstract floating-point value having sign `zSign', exponent
4575 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4576 | and returns the proper extended double-precision floating-point value
4577 | corresponding to the abstract input. This routine is just like
4578 | `roundAndPackFloatx80' except that the input significand does not have to be
4579 | normalized.
4580 *----------------------------------------------------------------------------*/
4581
4582 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4583 bool zSign, int32_t zExp,
4584 uint64_t zSig0, uint64_t zSig1,
4585 float_status *status)
4586 {
4587 int8_t shiftCount;
4588
4589 if ( zSig0 == 0 ) {
4590 zSig0 = zSig1;
4591 zSig1 = 0;
4592 zExp -= 64;
4593 }
4594 shiftCount = clz64(zSig0);
4595 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4596 zExp -= shiftCount;
4597 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4598 zSig0, zSig1, status);
4599
4600 }
4601
4602 /*----------------------------------------------------------------------------
4603 | Returns the least-significant 64 fraction bits of the quadruple-precision
4604 | floating-point value `a'.
4605 *----------------------------------------------------------------------------*/
4606
4607 static inline uint64_t extractFloat128Frac1( float128 a )
4608 {
4609
4610 return a.low;
4611
4612 }
4613
4614 /*----------------------------------------------------------------------------
4615 | Returns the most-significant 48 fraction bits of the quadruple-precision
4616 | floating-point value `a'.
4617 *----------------------------------------------------------------------------*/
4618
4619 static inline uint64_t extractFloat128Frac0( float128 a )
4620 {
4621
4622 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4623
4624 }
4625
4626 /*----------------------------------------------------------------------------
4627 | Returns the exponent bits of the quadruple-precision floating-point value
4628 | `a'.
4629 *----------------------------------------------------------------------------*/
4630
4631 static inline int32_t extractFloat128Exp( float128 a )
4632 {
4633
4634 return ( a.high>>48 ) & 0x7FFF;
4635
4636 }
4637
4638 /*----------------------------------------------------------------------------
4639 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
4641
4642 static inline bool extractFloat128Sign(float128 a)
4643 {
4644 return a.high >> 63;
4645 }
4646
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at `zExpPtr'; the
| most significant 49 bits of the normalized significand at `zSig0Ptr', and
| the least significant 64 bits at `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int shift;

    if (aSig0 != 0) {
        /* Leading bit lives in the high word: bring it up to bit 48. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is zero: normalize from the low word alone.  A negative
     * shift means the leading bit of aSig1 must move down into zSig1. */
    shift = clz64(aSig1) - 15;
    if (shift < 0) {
        *zSig0Ptr = aSig1 >> -shift;
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
4687
4688 /*----------------------------------------------------------------------------
4689 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4690 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4691 | floating-point value, returning the result. After being shifted into the
4692 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4693 | added together to form the most significant 32 bits of the result. This
4694 | means that any integer portion of `zSig0' will be added into the exponent.
4695 | Since a properly normalized significand will have an integer portion equal
4696 | to 1, the `zExp' input should be 1 less than the desired result exponent
4697 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4698 | significand.
4699 *----------------------------------------------------------------------------*/
4700
4701 static inline float128
4702 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4703 {
4704 float128 z;
4705
4706 z.low = zSig1;
4707 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4708 return z;
4709 }
4710
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the guard/round/sticky bits; `increment' decides
     * whether the 128-bit significand zSig0:zSig1 is bumped by one ulp.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the result's LSB would be 0. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches both overflow range and zExp < 0. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /* Overflow: directed rounding toward zero (and round-to-odd)
             * yields the maximal finite value, otherwise infinity. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* Subnormal result (or zero after flushing). */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the increment: the shift above changed zSig1
             * and zSig2. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie in nearest-even mode: force the LSB even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4843
4844 /*----------------------------------------------------------------------------
4845 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4846 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4847 | returns the proper quadruple-precision floating-point value corresponding
4848 | to the abstract input. This routine is just like `roundAndPackFloat128'
4849 | except that the input significand has fewer bits and does not have to be
4850 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4851 | point exponent.
4852 *----------------------------------------------------------------------------*/
4853
4854 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4855 uint64_t zSig0, uint64_t zSig1,
4856 float_status *status)
4857 {
4858 int8_t shiftCount;
4859 uint64_t zSig2;
4860
4861 if ( zSig0 == 0 ) {
4862 zSig0 = zSig1;
4863 zSig1 = 0;
4864 zExp -= 64;
4865 }
4866 shiftCount = clz64(zSig0) - 15;
4867 if ( 0 <= shiftCount ) {
4868 zSig2 = 0;
4869 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4870 }
4871 else {
4872 shift128ExtraRightJamming(
4873 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4874 }
4875 zExp -= shiftCount;
4876 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4877
4878 }
4879
4880
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the 32-bit two's complement integer `a'
4883 | to the extended double-precision floating-point format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic.
4886 *----------------------------------------------------------------------------*/
4887
4888 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4889 {
4890 bool zSign;
4891 uint32_t absA;
4892 int8_t shiftCount;
4893 uint64_t zSig;
4894
4895 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4896 zSign = ( a < 0 );
4897 absA = zSign ? - a : a;
4898 shiftCount = clz32(absA) + 32;
4899 zSig = absA;
4900 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4901
4902 }
4903
4904 /*----------------------------------------------------------------------------
4905 | Returns the result of converting the 32-bit two's complement integer `a' to
4906 | the quadruple-precision floating-point format. The conversion is performed
4907 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4908 *----------------------------------------------------------------------------*/
4909
4910 float128 int32_to_float128(int32_t a, float_status *status)
4911 {
4912 bool zSign;
4913 uint32_t absA;
4914 int8_t shiftCount;
4915 uint64_t zSig0;
4916
4917 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4918 zSign = ( a < 0 );
4919 absA = zSign ? - a : a;
4920 shiftCount = clz32(absA) + 17;
4921 zSig0 = absA;
4922 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4923
4924 }
4925
4926 /*----------------------------------------------------------------------------
4927 | Returns the result of converting the 64-bit two's complement integer `a'
4928 | to the extended double-precision floating-point format. The conversion
4929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4930 | Arithmetic.
4931 *----------------------------------------------------------------------------*/
4932
4933 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4934 {
4935 bool zSign;
4936 uint64_t absA;
4937 int8_t shiftCount;
4938
4939 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4940 zSign = ( a < 0 );
4941 absA = zSign ? - a : a;
4942 shiftCount = clz64(absA);
4943 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4944
4945 }
4946
4947 /*----------------------------------------------------------------------------
4948 | Returns the result of converting the 64-bit two's complement integer `a' to
4949 | the quadruple-precision floating-point format. The conversion is performed
4950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4951 *----------------------------------------------------------------------------*/
4952
4953 float128 int64_to_float128(int64_t a, float_status *status)
4954 {
4955 bool zSign;
4956 uint64_t absA;
4957 int8_t shiftCount;
4958 int32_t zExp;
4959 uint64_t zSig0, zSig1;
4960
4961 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4962 zSign = ( a < 0 );
4963 absA = zSign ? - a : a;
4964 shiftCount = clz64(absA) + 49;
4965 zExp = 0x406E - shiftCount;
4966 if ( 64 <= shiftCount ) {
4967 zSig1 = 0;
4968 zSig0 = absA;
4969 shiftCount -= 64;
4970 }
4971 else {
4972 zSig1 = absA;
4973 zSig0 = 0;
4974 }
4975 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4976 return packFloat128( zSign, zExp, zSig0, zSig1 );
4977
4978 }
4979
4980 /*----------------------------------------------------------------------------
4981 | Returns the result of converting the 64-bit unsigned integer `a'
4982 | to the quadruple-precision floating-point format. The conversion is performed
4983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4984 *----------------------------------------------------------------------------*/
4985
4986 float128 uint64_to_float128(uint64_t a, float_status *status)
4987 {
4988 if (a == 0) {
4989 return float128_zero;
4990 }
4991 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4992 }
4993
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input is a NaN: convert and return it silenced (the x80
             * result is a quiet NaN). */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity converts to infinity of the same sign. */
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        /* Subnormal input: normalize so the implicit bit can be set. */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the implicit integer bit explicit (bit 23), rebias the
     * exponent, and left-justify the 24-bit significand in 64 bits. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}
5029
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input is a NaN: convert its payload to float128. */
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        /* Infinity converts to infinity of the same sign. */
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        /* Subnormal input: normalize, then compensate for the implicit
         * bit packFloat128 will add in from the exponent field. */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        --aExp;
    }
    /* Rebias the exponent and align the 23 fraction bits at the top of
     * the 48-bit high fraction word; the conversion is exact. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}
5061
5062 /*----------------------------------------------------------------------------
5063 | Returns the remainder of the single-precision floating-point value `a'
5064 | with respect to the corresponding value `b'. The operation is performed
5065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5066 *----------------------------------------------------------------------------*/
5067
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is Inf or NaN: NaN operands propagate; Inf rem anything is
         * invalid. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;    /* finite rem Inf = a, exactly */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* a rem 0 is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;    /* 0 rem b = 0, sign preserved */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: the whole quotient fits in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: the remainder is a itself. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce up to 62 quotient bits per
         * iteration using a 64-bit reciprocal estimate.  The estimate
         * may be up to 2 too high, so subtract 2 to keep the partial
         * remainder non-negative. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step the quotient up until the remainder goes negative, then keep
     * whichever of the last two remainders is nearer to zero (ties go
     * to the even quotient), giving the round-to-nearest remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5163
5164
5165
5166 /*----------------------------------------------------------------------------
5167 | Returns the binary exponential of the single-precision floating-point value
5168 | `a'. The operation is performed according to the IEC/IEEE Standard for
5169 | Binary Floating-Point Arithmetic.
5170 |
5171 | Uses the following identities:
5172 |
5173 | 1. -------------------------------------------------------------------------
5174 | x x*ln(2)
5175 | 2 = e
5176 |
5177 | 2. -------------------------------------------------------------------------
5178 | 2 3 4 5 n
5179 | x x x x x x x
5180 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5181 | 1! 2! 3! 4! 5! n!
5182 *----------------------------------------------------------------------------*/
5183
/*
 * Taylor-series coefficients for e^x: entry i holds 1/(i + 1)! encoded
 * as an IEEE double constant.  Consumed by float32_exp2() below.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1/1! */
    const_float64( 0x3fe0000000000000ll ), /* 1/2! */
    const_float64( 0x3fc5555555555555ll ), /* 1/3! */
    const_float64( 0x3fa5555555555555ll ), /* 1/4! */
    const_float64( 0x3f81111111111111ll ), /* 1/5! */
    const_float64( 0x3f56c16c16c16c17ll ), /* 1/6! */
    const_float64( 0x3f2a01a01a01a01all ), /* 1/7! */
    const_float64( 0x3efa01a01a01a01all ), /* 1/8! */
    const_float64( 0x3ec71de3a556c734ll ), /* 1/9! */
    const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
    const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
};
5202
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return (aSign) ? float32_zero : a;    /* 2^-Inf = 0, 2^+Inf = +Inf */
    }
    if (aExp == 0) {
        if (aSig == 0) return float32_one;    /* 2^+-0 = 1, exactly */
    }

    /* Any finite non-zero input gives an inexact result. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);    /* 2^a = e^(a * ln 2) */

    /* Taylor series: e^x = 1 + sum x^n / n!, with xn tracking x^(i+1)
     * and the coefficient table supplying 1/(i+1)!. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5247
5248 /*----------------------------------------------------------------------------
5249 | Returns the binary log of the single-precision floating-point value `a'.
5250 | The operation is performed according to the IEC/IEEE Standard for Binary
5251 | Floating-Point Arithmetic.
5252 *----------------------------------------------------------------------------*/
5253 float32 float32_log2(float32 a, float_status *status)
5254 {
5255 bool aSign, zSign;
5256 int aExp;
5257 uint32_t aSig, zSig, i;
5258
5259 a = float32_squash_input_denormal(a, status);
5260 aSig = extractFloat32Frac( a );
5261 aExp = extractFloat32Exp( a );
5262 aSign = extractFloat32Sign( a );
5263
5264 if ( aExp == 0 ) {
5265 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5266 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5267 }
5268 if ( aSign ) {
5269 float_raise(float_flag_invalid, status);
5270 return float32_default_nan(status);
5271 }
5272 if ( aExp == 0xFF ) {
5273 if (aSig) {
5274 return propagateFloat32NaN(a, float32_zero, status);
5275 }
5276 return a;
5277 }
5278
5279 aExp -= 0x7F;
5280 aSig |= 0x00800000;
5281 zSign = aExp < 0;
5282 zSig = aExp << 23;
5283
5284 for (i = 1 << 22; i > 0; i >>= 1) {
5285 aSig = ( (uint64_t)aSig * aSig ) >> 23;
5286 if ( aSig & 0x01000000 ) {
5287 aSig >>= 1;
5288 zSig |= i;
5289 }
5290 }
5291
5292 if ( zSign )
5293 zSig = -zSig;
5294
5295 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5296 }
5297
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of converting the double-precision floating-point value
5300 | `a' to the extended double-precision floating-point format. The conversion
5301 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5302 | Arithmetic.
5303 *----------------------------------------------------------------------------*/
5304
5305 floatx80 float64_to_floatx80(float64 a, float_status *status)
5306 {
5307 bool aSign;
5308 int aExp;
5309 uint64_t aSig;
5310
5311 a = float64_squash_input_denormal(a, status);
5312 aSig = extractFloat64Frac( a );
5313 aExp = extractFloat64Exp( a );
5314 aSign = extractFloat64Sign( a );
5315 if ( aExp == 0x7FF ) {
5316 if (aSig) {
5317 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5318 status);
5319 return floatx80_silence_nan(res, status);
5320 }
5321 return packFloatx80(aSign,
5322 floatx80_infinity_high,
5323 floatx80_infinity_low);
5324 }
5325 if ( aExp == 0 ) {
5326 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5327 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5328 }
5329 return
5330 packFloatx80(
5331 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5332
5333 }
5334
5335 /*----------------------------------------------------------------------------
5336 | Returns the result of converting the double-precision floating-point value
5337 | `a' to the quadruple-precision floating-point format. The conversion is
5338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5339 | Arithmetic.
5340 *----------------------------------------------------------------------------*/
5341
5342 float128 float64_to_float128(float64 a, float_status *status)
5343 {
5344 bool aSign;
5345 int aExp;
5346 uint64_t aSig, zSig0, zSig1;
5347
5348 a = float64_squash_input_denormal(a, status);
5349 aSig = extractFloat64Frac( a );
5350 aExp = extractFloat64Exp( a );
5351 aSign = extractFloat64Sign( a );
5352 if ( aExp == 0x7FF ) {
5353 if (aSig) {
5354 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5355 }
5356 return packFloat128( aSign, 0x7FFF, 0, 0 );
5357 }
5358 if ( aExp == 0 ) {
5359 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5360 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5361 --aExp;
5362 }
5363 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5364 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5365
5366 }
5367
5368
5369 /*----------------------------------------------------------------------------
5370 | Returns the remainder of the double-precision floating-point value `a'
5371 | with respect to the corresponding value `b'. The operation is performed
5372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5373 *----------------------------------------------------------------------------*/
5374
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is Inf or NaN: NaN operands propagate; Inf rem anything is
         * invalid. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;    /* finite rem Inf = a, exactly */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* a rem 0 is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;    /* 0 rem b = 0, sign preserved */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-align. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: the remainder is a itself. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce up to 62 quotient bits per iteration with a reciprocal
     * estimate; the estimate may be up to 2 too high, so subtract 2 to
     * keep the partial remainder non-negative. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step the quotient up until the remainder goes negative, then keep
     * whichever of the last two remainders is nearer to zero (ties go
     * to the even quotient), giving the round-to-nearest remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5456
5457 /*----------------------------------------------------------------------------
5458 | Returns the binary log of the double-precision floating-point value `a'.
5459 | The operation is performed according to the IEC/IEEE Standard for Binary
5460 | Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );  /* log2(0) = -Inf */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;    /* log2(+Inf) = +Inf */
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);    /* make the integer bit explicit */
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;    /* integer part, 12.52 fixed point */
    /* Generate 52 fraction bits by repeated squaring: squaring doubles
     * the logarithm; when the square reaches 2 the next bit is 1 and
     * the value is halved back into [1, 2). */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    /* Exponent 0x408 rescales the 12.52 fixed-point zSig to a double. */
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5505
5506 /*----------------------------------------------------------------------------
5507 | Returns the result of converting the extended double-precision floating-
5508 | point value `a' to the 32-bit two's complement integer format. The
5509 | conversion is performed according to the IEC/IEEE Standard for Binary
5510 | Floating-Point Arithmetic---which means in particular that the conversion
5511 | is rounded according to the current rounding mode. If `a' is a NaN, the
5512 | largest positive integer is returned. Otherwise, if the conversion
5513 | overflows, the largest integer with the same sign as `a' is returned.
5514 *----------------------------------------------------------------------------*/
5515
5516 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5517 {
5518 bool aSign;
5519 int32_t aExp, shiftCount;
5520 uint64_t aSig;
5521
5522 if (floatx80_invalid_encoding(a)) {
5523 float_raise(float_flag_invalid, status);
5524 return 1 << 31;
5525 }
5526 aSig = extractFloatx80Frac( a );
5527 aExp = extractFloatx80Exp( a );
5528 aSign = extractFloatx80Sign( a );
5529 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5530 shiftCount = 0x4037 - aExp;
5531 if ( shiftCount <= 0 ) shiftCount = 1;
5532 shift64RightJamming( aSig, shiftCount, &aSig );
5533 return roundAndPackInt32(aSign, aSig, status);
5534
5535 }
5536
5537 /*----------------------------------------------------------------------------
5538 | Returns the result of converting the extended double-precision floating-
5539 | point value `a' to the 32-bit two's complement integer format. The
5540 | conversion is performed according to the IEC/IEEE Standard for Binary
5541 | Floating-Point Arithmetic, except that the conversion is always rounded
5542 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5543 | Otherwise, if the conversion overflows, the largest integer with the same
5544 | sign as `a' is returned.
5545 *----------------------------------------------------------------------------*/
5546
5547 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5548 {
5549 bool aSign;
5550 int32_t aExp, shiftCount;
5551 uint64_t aSig, savedASig;
5552 int32_t z;
5553
5554 if (floatx80_invalid_encoding(a)) {
5555 float_raise(float_flag_invalid, status);
5556 return 1 << 31;
5557 }
5558 aSig = extractFloatx80Frac( a );
5559 aExp = extractFloatx80Exp( a );
5560 aSign = extractFloatx80Sign( a );
5561 if ( 0x401E < aExp ) {
5562 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5563 goto invalid;
5564 }
5565 else if ( aExp < 0x3FFF ) {
5566 if (aExp || aSig) {
5567 float_raise(float_flag_inexact, status);
5568 }
5569 return 0;
5570 }
5571 shiftCount = 0x403E - aExp;
5572 savedASig = aSig;
5573 aSig >>= shiftCount;
5574 z = aSig;
5575 if ( aSign ) z = - z;
5576 if ( ( z < 0 ) ^ aSign ) {
5577 invalid:
5578 float_raise(float_flag_invalid, status);
5579 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5580 }
5581 if ( ( aSig<<shiftCount ) != savedASig ) {
5582 float_raise(float_flag_inexact, status);
5583 }
5584 return z;
5585
5586 }
5587
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the 64-bit two's complement integer format. The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic---which means in particular that the conversion
5593 | is rounded according to the current rounding mode. If `a' is a NaN,
5594 | the largest positive integer is returned. Otherwise, if the conversion
5595 | overflows, the largest integer with the same sign as `a' is returned.
5596 *----------------------------------------------------------------------------*/
5597
int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;    /* INT64_MIN bit pattern */
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* |a| >= 2^64, Inf, or NaN: overflow; NaN and positive
             * overflow saturate high, negative overflow saturates low. */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        /* aExp == 0x403E: the significand is already integer-aligned. */
        aSigExtra = 0;
    }
    else {
        /* Shift out the fraction, jamming lost bits into aSigExtra for
         * correct rounding. */
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}
5628
5629 /*----------------------------------------------------------------------------
5630 | Returns the result of converting the extended double-precision floating-
5631 | point value `a' to the 64-bit two's complement integer format. The
5632 | conversion is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic, except that the conversion is always rounded
5634 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5635 | Otherwise, if the conversion overflows, the largest integer with the same
5636 | sign as `a' is returned.
5637 *----------------------------------------------------------------------------*/
5638
int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;    /* INT64_MIN bit pattern */
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* |a| >= 2^63: the only representable value is exactly -2^63
         * (sign/exponent word 0xC03E with no fraction bits below the
         * integer bit); everything else overflows or is Inf/NaN. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                /* Positive overflow or NaN saturates high. */
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless a is exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        /* Bits were discarded by the truncation. */
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}
5678
5679 /*----------------------------------------------------------------------------
5680 | Returns the result of converting the extended double-precision floating-
5681 | point value `a' to the single-precision floating-point format. The
5682 | conversion is performed according to the IEC/IEEE Standard for Binary
5683 | Floating-Point Arithmetic.
5684 *----------------------------------------------------------------------------*/
5685
5686 float32 floatx80_to_float32(floatx80 a, float_status *status)
5687 {
5688 bool aSign;
5689 int32_t aExp;
5690 uint64_t aSig;
5691
5692 if (floatx80_invalid_encoding(a)) {
5693 float_raise(float_flag_invalid, status);
5694 return float32_default_nan(status);
5695 }
5696 aSig = extractFloatx80Frac( a );
5697 aExp = extractFloatx80Exp( a );
5698 aSign = extractFloatx80Sign( a );
5699 if ( aExp == 0x7FFF ) {
5700 if ( (uint64_t) ( aSig<<1 ) ) {
5701 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5702 status);
5703 return float32_silence_nan(res, status);
5704 }
5705 return packFloat32( aSign, 0xFF, 0 );
5706 }
5707 shift64RightJamming( aSig, 33, &aSig );
5708 if ( aExp || aSig ) aExp -= 0x3F81;
5709 return roundAndPackFloat32(aSign, aExp, aSig, status);
5710
5711 }
5712
5713 /*----------------------------------------------------------------------------
5714 | Returns the result of converting the extended double-precision floating-
5715 | point value `a' to the double-precision floating-point format. The
5716 | conversion is performed according to the IEC/IEEE Standard for Binary
5717 | Floating-Point Arithmetic.
5718 *----------------------------------------------------------------------------*/
5719
5720 float64 floatx80_to_float64(floatx80 a, float_status *status)
5721 {
5722 bool aSign;
5723 int32_t aExp;
5724 uint64_t aSig, zSig;
5725
5726 if (floatx80_invalid_encoding(a)) {
5727 float_raise(float_flag_invalid, status);
5728 return float64_default_nan(status);
5729 }
5730 aSig = extractFloatx80Frac( a );
5731 aExp = extractFloatx80Exp( a );
5732 aSign = extractFloatx80Sign( a );
5733 if ( aExp == 0x7FFF ) {
5734 if ( (uint64_t) ( aSig<<1 ) ) {
5735 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5736 status);
5737 return float64_silence_nan(res, status);
5738 }
5739 return packFloat64( aSign, 0x7FF, 0 );
5740 }
5741 shift64RightJamming( aSig, 1, &zSig );
5742 if ( aExp || aSig ) aExp -= 0x3C01;
5743 return roundAndPackFloat64(aSign, aExp, zSig, status);
5744
5745 }
5746
5747 /*----------------------------------------------------------------------------
5748 | Returns the result of converting the extended double-precision floating-
5749 | point value `a' to the quadruple-precision floating-point format. The
5750 | conversion is performed according to the IEC/IEEE Standard for Binary
5751 | Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5753
5754 float128 floatx80_to_float128(floatx80 a, float_status *status)
5755 {
5756 bool aSign;
5757 int aExp;
5758 uint64_t aSig, zSig0, zSig1;
5759
5760 if (floatx80_invalid_encoding(a)) {
5761 float_raise(float_flag_invalid, status);
5762 return float128_default_nan(status);
5763 }
5764 aSig = extractFloatx80Frac( a );
5765 aExp = extractFloatx80Exp( a );
5766 aSign = extractFloatx80Sign( a );
5767 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5768 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5769 status);
5770 return float128_silence_nan(res, status);
5771 }
5772 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5773 return packFloat128( aSign, aExp, zSig0, zSig1 );
5774
5775 }
5776
5777 /*----------------------------------------------------------------------------
5778 | Rounds the extended double-precision floating-point value `a'
5779 | to the precision provided by floatx80_rounding_precision and returns the
5780 | result as an extended double-precision floating-point value.
5781 | The operation is performed according to the IEC/IEEE Standard for Binary
5782 | Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5784
5785 floatx80 floatx80_round(floatx80 a, float_status *status)
5786 {
5787 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5788 extractFloatx80Sign(a),
5789 extractFloatx80Exp(a),
5790 extractFloatx80Frac(a), 0, status);
5791 }
5792
5793 /*----------------------------------------------------------------------------
5794 | Rounds the extended double-precision floating-point value `a' to an integer,
5795 | and returns the result as an extended quadruple-precision floating-point
5796 | value. The operation is performed according to the IEC/IEEE Standard for
5797 | Binary Floating-Point Arithmetic.
5798 *----------------------------------------------------------------------------*/
5799
5800 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5801 {
5802 bool aSign;
5803 int32_t aExp;
5804 uint64_t lastBitMask, roundBitsMask;
5805 floatx80 z;
5806
5807 if (floatx80_invalid_encoding(a)) {
5808 float_raise(float_flag_invalid, status);
5809 return floatx80_default_nan(status);
5810 }
5811 aExp = extractFloatx80Exp( a );
5812 if ( 0x403E <= aExp ) {
5813 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5814 return propagateFloatx80NaN(a, a, status);
5815 }
5816 return a;
5817 }
5818 if ( aExp < 0x3FFF ) {
5819 if ( ( aExp == 0 )
5820 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5821 return a;
5822 }
5823 float_raise(float_flag_inexact, status);
5824 aSign = extractFloatx80Sign( a );
5825 switch (status->float_rounding_mode) {
5826 case float_round_nearest_even:
5827 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5828 ) {
5829 return
5830 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5831 }
5832 break;
5833 case float_round_ties_away:
5834 if (aExp == 0x3FFE) {
5835 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5836 }
5837 break;
5838 case float_round_down:
5839 return
5840 aSign ?
5841 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5842 : packFloatx80( 0, 0, 0 );
5843 case float_round_up:
5844 return
5845 aSign ? packFloatx80( 1, 0, 0 )
5846 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5847
5848 case float_round_to_zero:
5849 break;
5850 default:
5851 g_assert_not_reached();
5852 }
5853 return packFloatx80( aSign, 0, 0 );
5854 }
5855 lastBitMask = 1;
5856 lastBitMask <<= 0x403E - aExp;
5857 roundBitsMask = lastBitMask - 1;
5858 z = a;
5859 switch (status->float_rounding_mode) {
5860 case float_round_nearest_even:
5861 z.low += lastBitMask>>1;
5862 if ((z.low & roundBitsMask) == 0) {
5863 z.low &= ~lastBitMask;
5864 }
5865 break;
5866 case float_round_ties_away:
5867 z.low += lastBitMask >> 1;
5868 break;
5869 case float_round_to_zero:
5870 break;
5871 case float_round_up:
5872 if (!extractFloatx80Sign(z)) {
5873 z.low += roundBitsMask;
5874 }
5875 break;
5876 case float_round_down:
5877 if (extractFloatx80Sign(z)) {
5878 z.low += roundBitsMask;
5879 }
5880 break;
5881 default:
5882 abort();
5883 }
5884 z.low &= ~ roundBitsMask;
5885 if ( z.low == 0 ) {
5886 ++z.high;
5887 z.low = UINT64_C(0x8000000000000000);
5888 }
5889 if (z.low != a.low) {
5890 float_raise(float_flag_inexact, status);
5891 }
5892 return z;
5893
5894 }
5895
5896 /*----------------------------------------------------------------------------
5897 | Returns the result of adding the absolute values of the extended double-
5898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5899 | negated before being returned. `zSign' is ignored if the result is a NaN.
5900 | The addition is performed according to the IEC/IEEE Standard for Binary
5901 | Floating-Point Arithmetic.
5902 *----------------------------------------------------------------------------*/
5903
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's,
         * collecting the shifted-out bits in zSig1 with sticky jamming. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;    /* Inf + finite = Inf */
        }
        if ( bExp == 0 ) --expDiff;    /* subnormal: exponent 0 means 1 */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror of the case above. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: add the significands directly. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;    /* Inf + Inf (same sign) = Inf */
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* If bit 63 (the explicit integer bit) survived, the 64-bit sum did
     * not carry out and can be rounded as-is. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of bit 63: shift right one place (jamming the lost bit
     * into zSig1), restore the integer bit, and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
5975
5976 /*----------------------------------------------------------------------------
5977 | Returns the result of subtracting the absolute values of the extended
5978 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5979 | difference is negated before being returned. `zSign' is ignored if the
5980 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5981 | Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5983
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Subnormals: exponent 0 really means 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the resulting zero is negative only when
     * rounding toward minus infinity. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - Inf: an infinity with the opposite sign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;    /* |b| > |a|: the difference changes sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;    /* Inf - finite = Inf */
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6044
6045 /*----------------------------------------------------------------------------
6046 | Returns the result of adding the extended double-precision floating-point
6047 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6048 | Standard for Binary Floating-Point Arithmetic.
6049 *----------------------------------------------------------------------------*/
6050
6051 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6052 {
6053 bool aSign, bSign;
6054
6055 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6056 float_raise(float_flag_invalid, status);
6057 return floatx80_default_nan(status);
6058 }
6059 aSign = extractFloatx80Sign( a );
6060 bSign = extractFloatx80Sign( b );
6061 if ( aSign == bSign ) {
6062 return addFloatx80Sigs(a, b, aSign, status);
6063 }
6064 else {
6065 return subFloatx80Sigs(a, b, aSign, status);
6066 }
6067
6068 }
6069
6070 /*----------------------------------------------------------------------------
6071 | Returns the result of subtracting the extended double-precision floating-
6072 | point values `a' and `b'. The operation is performed according to the
6073 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6074 *----------------------------------------------------------------------------*/
6075
6076 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6077 {
6078 bool aSign, bSign;
6079
6080 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6081 float_raise(float_flag_invalid, status);
6082 return floatx80_default_nan(status);
6083 }
6084 aSign = extractFloatx80Sign( a );
6085 bSign = extractFloatx80Sign( b );
6086 if ( aSign == bSign ) {
6087 return subFloatx80Sigs(a, b, aSign, status);
6088 }
6089 else {
6090 return addFloatx80Sigs(a, b, aSign, status);
6091 }
6092
6093 }
6094
6095 /*----------------------------------------------------------------------------
6096 | Returns the result of multiplying the extended double-precision floating-
6097 | point values `a' and `b'. The operation is performed according to the
6098 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6099 *----------------------------------------------------------------------------*/
6100
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Invalid (unnormal/pseudo) encodings are rejected up front. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN: NaNs propagate; Inf * 0 is invalid. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        /* b is Inf or NaN; 0 * Inf is invalid. */
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Normalize subnormal inputs (or return signed zero). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* 64x64 -> 128-bit product of the significands; renormalize by one
       bit if the top bit of the product is clear. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6156
6157 /*----------------------------------------------------------------------------
6158 | Returns the result of dividing the extended double-precision floating-point
6159 | value `a' by the corresponding value `b'. The operation is performed
6160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6161 *----------------------------------------------------------------------------*/
6162
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Invalid (unnormal/pseudo) encodings are rejected up front. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN; Inf / Inf is invalid. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        /* finite / Inf -> signed zero (unless b is NaN). */
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; otherwise finite / 0 signals divbyzero
               and returns signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-shift so the quotient estimate below cannot overflow. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate then correct while the partial
       remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; refine only when close enough to a rounding
       boundary to matter, and jam any sticky remainder into the lsb. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6243
6244 /*----------------------------------------------------------------------------
6245 | Returns the remainder of the extended double-precision floating-point value
6246 | `a' with respect to the corresponding value `b'. The operation is performed
6247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6248 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6249 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6250 | the absolute value of the integer quotient.
6251 *----------------------------------------------------------------------------*/
6252
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Invalid (unnormal/pseudo) encodings are rejected up front. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* aExpOrig keeps the pre-normalization exponent so pseudo-denormal
       inputs can be detected later. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* Inf % anything is invalid; NaNs propagate. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite % Inf == the finite operand, unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2 (or truncating mod): the remainder is a itself. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    /* Peel off up to 62 quotient bits per iteration; the estimate is
       biased low by 2 so the remainder stays non-negative. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    /* Final partial block of expDiff quotient bits, with exact
       correction of the under-estimate. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: round the quotient to nearest-even by possibly
           taking the complementary remainder and flipping the sign. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6373
6374 /*----------------------------------------------------------------------------
6375 | Returns the remainder of the extended double-precision floating-point value
6376 | `a' with respect to the corresponding value `b'. The operation is performed
6377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6378 *----------------------------------------------------------------------------*/
6379
6380 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6381 {
6382 uint64_t quotient;
6383 return floatx80_modrem(a, b, false, &quotient, status);
6384 }
6385
6386 /*----------------------------------------------------------------------------
6387 | Returns the remainder of the extended double-precision floating-point value
6388 | `a' with respect to the corresponding value `b', with the quotient truncated
6389 | toward zero.
6390 *----------------------------------------------------------------------------*/
6391
6392 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6393 {
6394 uint64_t quotient;
6395 return floatx80_modrem(a, b, true, &quotient, status);
6396 }
6397
6398 /*----------------------------------------------------------------------------
6399 | Returns the square root of the extended double-precision floating-point
6400 | value `a'. The operation is performed according to the IEC/IEEE Standard
6401 | for Binary Floating-Point Arithmetic.
6402 *----------------------------------------------------------------------------*/
6403
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Invalid (unnormal/pseudo) encodings are rejected up front. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid; NaNs propagate. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative non-zero value is invalid; -0 passes through. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* 32-bit seed estimate, refined to 64 bits of root, then corrected
       until the remainder is non-negative. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low-order root bits; only refined when near a rounding boundary.
       Sticky bit is jammed in if the final remainder is non-zero. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6468
6469 /*----------------------------------------------------------------------------
6470 | Returns the result of converting the quadruple-precision floating-point
6471 | value `a' to the 32-bit two's complement integer format. The conversion
6472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6473 | Arithmetic---which means in particular that the conversion is rounded
6474 | according to the current rounding mode. If `a' is a NaN, the largest
6475 | positive integer is returned. Otherwise, if the conversion overflows, the
6476 | largest integer with the same sign as `a' is returned.
6477 *----------------------------------------------------------------------------*/
6478
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN converts as if positive, so the largest positive int results. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    aSig0 |= ( aSig1 != 0 );    /* fold low word into the sticky bit */
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    /* roundAndPackInt32 applies the current rounding mode and handles
       overflow/inexact flags. */
    return roundAndPackInt32(aSign, aSig0, status);

}
6497
6498 /*----------------------------------------------------------------------------
6499 | Returns the result of converting the quadruple-precision floating-point
6500 | value `a' to the 32-bit two's complement integer format. The conversion
6501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6502 | Arithmetic, except that the conversion is always rounded toward zero. If
6503 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6504 | conversion overflows, the largest integer with the same sign as `a' is
6505 | returned.
6506 *----------------------------------------------------------------------------*/
6507
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    aSig0 |= ( aSig1 != 0 );    /* fold low word into the sticky bit */
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow; NaN saturates positive. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0, inexact if non-zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* If shifting back does not reproduce the significand, bits were
       discarded: the conversion was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6547
6548 /*----------------------------------------------------------------------------
6549 | Returns the result of converting the quadruple-precision floating-point
6550 | value `a' to the 64-bit two's complement integer format. The conversion
6551 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6552 | Arithmetic---which means in particular that the conversion is rounded
6553 | according to the current rounding mode. If `a' is a NaN, the largest
6554 | positive integer is returned. Otherwise, if the conversion overflows, the
6555 | largest integer with the same sign as `a' is returned.
6556 *----------------------------------------------------------------------------*/
6557
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude >= 2^63: invalid. Saturate positive for NaN or
               positive input; only exactly -2^63 style negatives get
               INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift fraction bits into aSig1 with jamming for rounding. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    /* roundAndPackInt64 applies the current rounding mode and flags. */
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6590
6591 /*----------------------------------------------------------------------------
6592 | Returns the result of converting the quadruple-precision floating-point
6593 | value `a' to the 64-bit two's complement integer format. The conversion
6594 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6595 | Arithmetic, except that the conversion is always rounded toward zero.
6596 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6597 | the conversion overflows, the largest integer with the same sign as `a' is
6598 | returned.
6599 *----------------------------------------------------------------------------*/
6600
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Special case: values truncating to exactly INT64_MIN are
               representable; only inexact (not invalid) may be raised. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Bits shifted out of aSig1 make the conversion inexact. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to 0, inexact if non-zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6653
6654 /*----------------------------------------------------------------------------
6655 | Returns the result of converting the quadruple-precision floating-point value
6656 | `a' to the 64-bit unsigned integer format. The conversion is
6657 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6658 | Arithmetic---which means in particular that the conversion is rounded
6659 | according to the current rounding mode. If `a' is a NaN, the largest
6660 | positive integer is returned. If the conversion overflows, the
6661 | largest unsigned integer is returned. If 'a' is negative, the value is
6662 | rounded and zero is returned; negative values that do not round to zero
6663 | will raise the inexact exception.
6664 *----------------------------------------------------------------------------*/
6665
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values with magnitude >= 1 cannot round to a non-negative
       result: invalid. NaN saturates to UINT64_MAX, negatives to 0.
       Negative values in (-1, 0) fall through to the rounding path. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Magnitude >= 2^64: invalid, saturate. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift fraction bits into aSig1 with jamming for rounding. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6700
6701 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6702 {
6703 uint64_t v;
6704 signed char current_rounding_mode = status->float_rounding_mode;
6705
6706 set_float_rounding_mode(float_round_to_zero, status);
6707 v = float128_to_uint64(a, status);
6708 set_float_rounding_mode(current_rounding_mode, status);
6709
6710 return v;
6711 }
6712
6713 /*----------------------------------------------------------------------------
6714 | Returns the result of converting the quadruple-precision floating-point
6715 | value `a' to the 32-bit unsigned integer format. The conversion
6716 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6717 | Arithmetic except that the conversion is always rounded toward zero.
6718 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6719 | if the conversion overflows, the largest unsigned integer is returned.
6720 | If 'a' is negative, the value is rounded and zero is returned; negative
6721 | values that do not round to zero will raise the inexact exception.
6722 *----------------------------------------------------------------------------*/
6723
6724 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6725 {
6726 uint64_t v;
6727 uint32_t res;
6728 int old_exc_flags = get_float_exception_flags(status);
6729
6730 v = float128_to_uint64_round_to_zero(a, status);
6731 if (v > 0xffffffff) {
6732 res = 0xffffffff;
6733 } else {
6734 return v;
6735 }
6736 set_float_exception_flags(old_exc_flags, status);
6737 float_raise(float_flag_invalid, status);
6738 return res;
6739 }
6740
6741 /*----------------------------------------------------------------------------
6742 | Returns the result of converting the quadruple-precision floating-point value
6743 | `a' to the 32-bit unsigned integer format. The conversion is
6744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6745 | Arithmetic---which means in particular that the conversion is rounded
6746 | according to the current rounding mode. If `a' is a NaN, the largest
6747 | positive integer is returned. If the conversion overflows, the
6748 | largest unsigned integer is returned. If 'a' is negative, the value is
6749 | rounded and zero is returned; negative values that do not round to zero
6750 | will raise the inexact exception.
6751 *----------------------------------------------------------------------------*/
6752
6753 uint32_t float128_to_uint32(float128 a, float_status *status)
6754 {
6755 uint64_t v;
6756 uint32_t res;
6757 int old_exc_flags = get_float_exception_flags(status);
6758
6759 v = float128_to_uint64(a, status);
6760 if (v > 0xffffffff) {
6761 res = 0xffffffff;
6762 } else {
6763 return v;
6764 }
6765 set_float_exception_flags(old_exc_flags, status);
6766 float_raise(float_flag_invalid, status);
6767 return res;
6768 }
6769
6770 /*----------------------------------------------------------------------------
6771 | Returns the result of converting the quadruple-precision floating-point
6772 | value `a' to the single-precision floating-point format. The conversion
6773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6774 | Arithmetic.
6775 *----------------------------------------------------------------------------*/
6776
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN goes through the common-NaN path; Inf maps to float32 Inf. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    aSig0 |= ( aSig1 != 0 );    /* fold low word into the sticky bit */
    /* Narrow the significand to float32 width, keeping a sticky bit. */
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        zSig |= 0x40000000;     /* re-insert the implicit bit */
        aExp -= 0x3F81;         /* rebias 0x3FFF -> 0x7E for float32 */
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6804
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the double-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6809 | Arithmetic.
6810 *----------------------------------------------------------------------------*/
6811
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN goes through the common-NaN path; Inf maps to float64 Inf. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Left-align the 112-bit fraction and fold the low word into a
       sticky bit for rounding. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);  /* re-insert implicit bit */
        aExp -= 0x3C01;                         /* rebias for float64 */
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6837
6838 /*----------------------------------------------------------------------------
6839 | Returns the result of converting the quadruple-precision floating-point
6840 | value `a' to the extended double-precision floating-point format. The
6841 | conversion is performed according to the IEC/IEEE Standard for Binary
6842 | Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6844
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Convert via the common-NaN path, then quiet the result. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        aSig0 |= UINT64_C(0x0001000000000000);   /* implicit bit */
    }
    /* Left-align to the explicit-integer-bit floatx80 layout and round
       at full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6875
6876 /*----------------------------------------------------------------------------
6877 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6878 | returns the result as a quadruple-precision floating-point value. The
6879 | operation is performed according to the IEC/IEEE Standard for Binary
6880 | Floating-Point Arithmetic.
6881 *----------------------------------------------------------------------------*/
6882
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* |a| >= 2^48: any fraction bits to discard lie within a.low. */
        if ( 0x406F <= aExp ) {
            /* |a| >= 2^112: already integral, or Inf/NaN. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest bit kept in the result (0 means
           that bit is bit 0 of a.high); roundBitsMask covers everything
           below it. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                /* Exact tie: clear the kept bit to round to even. */
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: last kept bit is the lsb of z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +/-1 depending on rounding mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round to +/-1 only when |a| > 1/2; a tie at exactly
                   1/2 rounds to the even value 0 (fall through). */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                    aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                    aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: discarded bits span all of a.low plus the
           low bits of a.high selected by roundBitsMask. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (including a.low): round to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );   /* fold a.low into sticky */
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Raise inexact whenever rounding changed the value. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
7034
7035 /*----------------------------------------------------------------------------
7036 | Returns the result of adding the absolute values of the quadruple-precision
7037 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7038 | before being returned. `zSign' is ignored if the result is a NaN.
7039 | The addition is performed according to the IEC/IEEE Standard for Binary
7040 | Floating-Point Arithmetic.
7041 *----------------------------------------------------------------------------*/
7042
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;           /* Inf + finite = Inf */
        }
        if ( bExp == 0 ) {
            --expDiff;          /* subnormal exponent is effectively 1 */
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);   /* implicit integer bit */
        }
        /* zSig2 collects the shifted-out bits for rounding. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;           /* Inf + Inf (same sign) = Inf */
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both operands subnormal or zero; the sum is exact. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of the two implicit bits: result is in [2,4). */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* If no carry out of the integer bit, the sum is already normalized. */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7125
7126 /*----------------------------------------------------------------------------
7127 | Returns the result of subtracting the absolute values of the quadruple-
7128 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7129 | difference is negated before being returned. `zSign' is ignored if the
7130 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7131 | Standard for Binary Floating-Point Arithmetic.
7132 *----------------------------------------------------------------------------*/
7133
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left 14 bits for extra precision;
       compensated by the "zExp - 14" passed to the normalize below. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents: compare significands to find the larger operand. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;   /* treat subnormal exponents as their effective value */
        bExp = 1;
    }
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit integer bit: bit 48+14 = bit 62 after the pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;     /* |b| > |a|: the difference takes the opposite sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Undo the 14-bit pre-shift via the exponent. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7213
7214 /*----------------------------------------------------------------------------
7215 | Returns the result of adding the quadruple-precision floating-point values
7216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7217 | for Binary Floating-Point Arithmetic.
7218 *----------------------------------------------------------------------------*/
7219
7220 float128 float128_add(float128 a, float128 b, float_status *status)
7221 {
7222 bool aSign, bSign;
7223
7224 aSign = extractFloat128Sign( a );
7225 bSign = extractFloat128Sign( b );
7226 if ( aSign == bSign ) {
7227 return addFloat128Sigs(a, b, aSign, status);
7228 }
7229 else {
7230 return subFloat128Sigs(a, b, aSign, status);
7231 }
7232
7233 }
7234
7235 /*----------------------------------------------------------------------------
7236 | Returns the result of subtracting the quadruple-precision floating-point
7237 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7238 | Standard for Binary Floating-Point Arithmetic.
7239 *----------------------------------------------------------------------------*/
7240
7241 float128 float128_sub(float128 a, float128 b, float_status *status)
7242 {
7243 bool aSign, bSign;
7244
7245 aSign = extractFloat128Sign( a );
7246 bSign = extractFloat128Sign( b );
7247 if ( aSign == bSign ) {
7248 return subFloat128Sigs(a, b, aSign, status);
7249 }
7250 else {
7251 return addFloat128Sigs(a, b, aSign, status);
7252 }
7253
7254 }
7255
7256 /*----------------------------------------------------------------------------
7257 | Returns the result of multiplying the quadruple-precision floating-point
7258 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7259 | Standard for Binary Floating-Point Arithmetic.
7260 *----------------------------------------------------------------------------*/
7261
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf * 0 is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);      /* a's implicit integer bit */
    /*
     * Shift b's fraction left 16 so that b's implicit integer bit moves
     * to bit 128; its contribution to the product (aSig * 2^128) is then
     * simply aSig added into the top 128 bits of the 256-bit product.
     */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );    /* jam the lowest 64 bits into the sticky word */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product landed in [2,4): renormalize by one bit. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7318
7319 /*----------------------------------------------------------------------------
7320 | Returns the result of dividing the quadruple-precision floating-point value
7321 | `a' by the corresponding value `b'. The operation is performed according to
7322 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7323 *----------------------------------------------------------------------------*/
7324
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            goto invalid;       /* Inf / Inf */
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );  /* finite / Inf = 0 */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0/0 is invalid; anything else / 0 raises divide-by-zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Keep aSig < bSig so the quotient estimate stays below 1. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits; the estimate never errs low by more than
       it can be corrected here, so only downward fixes are needed. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64 quotient bits; refine only when the result is close
       enough to a rounding boundary to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7405
7406 /*----------------------------------------------------------------------------
7407 | Returns the remainder of the quadruple-precision floating-point value `a'
7408 | with respect to the corresponding value `b'. The operation is performed
7409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7410 *----------------------------------------------------------------------------*/
7411
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;       /* rem(Inf, x) is invalid */
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;           /* rem(x, Inf) = x */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the IEEE remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    /* Peel off 61 quotient bits per iteration until the operands'
       exponents are within one word of each other. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        /* Under-estimate q so the partial remainder stays non-negative. */
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial reduction with the leftover quotient bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Step one divisor at a time until the remainder goes negative, then
     * pick whichever of the last two remainders is nearer zero, breaking
     * an exact tie toward the even quotient (round-to-nearest remainder).
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = aSig + alternateASig, i.e. twice the midpoint. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7512
7513 /*----------------------------------------------------------------------------
7514 | Returns the square root of the quadruple-precision floating-point value `a'.
7515 | The operation is performed according to the IEC/IEEE Standard for Binary
7516 | Floating-Point Arithmetic.
7517 *----------------------------------------------------------------------------*/
7518
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;    /* sqrt(+Inf) = +Inf */
        goto invalid;               /* sqrt(-Inf) */
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;  /* sqrt(-0) = -0 */
 invalid:
        /* Square root of a negative number. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;   /* halve the unbiased exponent */
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit seed for the root, refined to 64 bits just below. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Make the exponent even by shifting the significand as needed. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct zSig0 downward until aSig - zSig0^2 is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Second 64 bits of the root; refine only when close enough to a
       rounding boundary to matter. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7580
/*
 * Compare `a' and `b' as extended-precision values.  When `is_quiet' is
 * false this is a signaling compare: any NaN operand raises the invalid
 * flag; when true, only signaling NaNs do.
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Exponent all-ones with nonzero fraction (past the explicit
       integer bit) is a NaN. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 and -0 compare equal */
            return float_relation_equal;
        } else {
            /* Opposite signs: the positive operand is the greater. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison. */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: compare magnitudes, inverting for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7628
7629 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7630 {
7631 return floatx80_compare_internal(a, b, 0, status);
7632 }
7633
7634 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7635 float_status *status)
7636 {
7637 return floatx80_compare_internal(a, b, 1, status);
7638 }
7639
/*
 * Compare `a' and `b' as quadruple-precision values.  When `is_quiet' is
 * false this is a signaling compare: any NaN operand raises the invalid
 * flag; when true, only signaling NaNs do.
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Exponent all-ones with a nonzero fraction is a NaN. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case: +0 and -0 compare equal */
            return float_relation_equal;
        } else {
            /* Opposite signs: the positive operand is the greater. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: compare magnitudes, inverting for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7674
7675 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7676 {
7677 return float128_compare_internal(a, b, 0, status);
7678 }
7679
7680 FloatRelation float128_compare_quiet(float128 a, float128 b,
7681 float_status *status)
7682 {
7683 return float128_compare_internal(a, b, 1, status);
7684 }
7685
7686 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7687 {
7688 bool aSign;
7689 int32_t aExp;
7690 uint64_t aSig;
7691
7692 if (floatx80_invalid_encoding(a)) {
7693 float_raise(float_flag_invalid, status);
7694 return floatx80_default_nan(status);
7695 }
7696 aSig = extractFloatx80Frac( a );
7697 aExp = extractFloatx80Exp( a );
7698 aSign = extractFloatx80Sign( a );
7699
7700 if ( aExp == 0x7FFF ) {
7701 if ( aSig<<1 ) {
7702 return propagateFloatx80NaN(a, a, status);
7703 }
7704 return a;
7705 }
7706
7707 if (aExp == 0) {
7708 if (aSig == 0) {
7709 return a;
7710 }
7711 aExp++;
7712 }
7713
7714 if (n > 0x10000) {
7715 n = 0x10000;
7716 } else if (n < -0x10000) {
7717 n = -0x10000;
7718 }
7719
7720 aExp += n;
7721 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7722 aSign, aExp, aSig, 0, status);
7723 }
7724
7725 float128 float128_scalbn(float128 a, int n, float_status *status)
7726 {
7727 bool aSign;
7728 int32_t aExp;
7729 uint64_t aSig0, aSig1;
7730
7731 aSig1 = extractFloat128Frac1( a );
7732 aSig0 = extractFloat128Frac0( a );
7733 aExp = extractFloat128Exp( a );
7734 aSign = extractFloat128Sign( a );
7735 if ( aExp == 0x7FFF ) {
7736 if ( aSig0 | aSig1 ) {
7737 return propagateFloat128NaN(a, a, status);
7738 }
7739 return a;
7740 }
7741 if (aExp != 0) {
7742 aSig0 |= UINT64_C(0x0001000000000000);
7743 } else if (aSig0 == 0 && aSig1 == 0) {
7744 return a;
7745 } else {
7746 aExp++;
7747 }
7748
7749 if (n > 0x10000) {
7750 n = 0x10000;
7751 } else if (n < -0x10000) {
7752 n = -0x10000;
7753 }
7754
7755 aExp += n - 1;
7756 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7757 , status);
7758
7759 }
7760
/*
 * Runs automatically before main() (GCC constructor attribute): probe
 * whether the host's fma() produces a correct result, and if not, set
 * force_soft_fma so the soft implementation is used instead.
 */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        /* Hardfloat disabled at build time: nothing to probe. */
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}