/*
 * fpu/softfloat.c (QEMU), as of commit
 * "softfloat: Move sf_canonicalize to softfloat-parts.c.inc"
 */
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Flush a denormal input operand to a same-signed zero and raise the
 * input-denormal flag.  The "__nocheck" variant does not consult
 * s->flush_inputs_to_zero; the GEN_INPUT_FLUSH{1,2,3} wrappers below
 * perform that check before calling here.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/* Flush one input operand, but only if flush_inputs_to_zero is enabled. */
#define GEN_INPUT_FLUSH1(name, soft_t)                                  \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/* As GEN_INPUT_FLUSH1, but for two input operands at once. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/* As GEN_INPUT_FLUSH1, but for three input operands (e.g. muladd). */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
185 /*
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
189 */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205
206 /*
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211 */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF 0
216 #endif
217
218 /*
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
221 * already set.
222 */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226 IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
256 typedef union {
257 float32 s;
258 float h;
259 } union_float32;
260
261 typedef union {
262 float64 s;
263 double h;
264 } union_float64;
265
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float (*hard_f32_op2_fn)(float a, float b);
272 typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic two-operand single-precision wrapper: try the host FPU first,
 * and fall back to the softfloat implementation ("soft") whenever
 * exception detection via the host could be unreliable.  "pre" and
 * "post" are caller-supplied predicates gating the fast path.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Host result overflowed to infinity; record the flag softfloat
         * would have raised. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result magnitude at or below the smallest normal: underflow
         * detection/rounding may differ from the guest's, so redo in
         * softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/*
 * Generic two-operand double-precision wrapper; mirrors float32_gen2:
 * use the host FPU when safe, otherwise defer to "soft".
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Host result overflowed to infinity. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result at or below the smallest normal: redo in softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Turn a FloatClass value into a single bit for use in class-mask tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Composite masks used by multi-operand classification. */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan/snan being the two largest enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;     /* classification of the value */
    bool sign;          /* sign bit */
    int32_t exp;        /* unbiased exponent */
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

/* As FloatParts64, but with a 128-bit fraction in two separate words. */
typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;   /* most significant fraction word */
    uint64_t frac_lo;   /* least significant fraction word */
} FloatParts128;
535
/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
/*
 * "(-F - 1) & 63" evaluates to 63 - F: a format with F fraction bits
 * leaves 63 - F low bits below the fraction in the decomposed word,
 * which are the bits removed (and rounded on) when repacking.
 */
#define FLOAT_PARAMS(E, F)                              \
    .exp_size       = E,                                \
    .exp_bias       = ((1 << E) - 1) >> 1,              \
    .exp_max        = (1 << E) - 1,                     \
    .frac_size      = F,                                \
    .frac_shift     = (-F - 1) & 63,                    \
    .frac_lsb       = 1ull << ((-F - 1) & 63),          \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),          \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,    \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

/* IEEE binary16 (half precision). */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* ARM Alternative Half Precision: same layout, arm_althp behaviour. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: 8-bit exponent, 7-bit fraction. */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

/* IEEE binary32 (single precision). */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* IEEE binary64 (double precision). */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* IEEE binary128 (quadruple precision). */
static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
602
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606 const int f_size = fmt->frac_size;
607 const int e_size = fmt->exp_size;
608
609 *r = (FloatParts64) {
610 .cls = float_class_unclassified,
611 .sign = extract64(raw, f_size + e_size, 1),
612 .exp = extract64(raw, f_size, e_size),
613 .frac = extract64(raw, 0, f_size)
614 };
615 }
616
/* Per-format raw-unpack wrappers; no canonicalization is performed. */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
636
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639 const int f_size = float128_params.frac_size - 64;
640 const int e_size = float128_params.exp_size;
641
642 *p = (FloatParts128) {
643 .cls = float_class_unclassified,
644 .sign = extract64(f.high, f_size + e_size, 1),
645 .exp = extract64(f.high, f_size, e_size),
646 .frac_hi = extract64(f.high, 0, f_size),
647 .frac_lo = f.low,
648 };
649 }
650
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654 const int f_size = fmt->frac_size;
655 const int e_size = fmt->exp_size;
656 uint64_t ret;
657
658 ret = (uint64_t)p->sign << (f_size + e_size);
659 ret = deposit64(ret, f_size, e_size, p->exp);
660 ret = deposit64(ret, 0, f_size, p->frac);
661 return ret;
662 }
663
/* Per-format raw-pack wrappers; parts must already be uncanonicalized. */
static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}
683
684 static float128 float128_pack_raw(const FloatParts128 *p)
685 {
686 const int f_size = float128_params.frac_size - 64;
687 const int e_size = float128_params.exp_size;
688 uint64_t hi;
689
690 hi = (uint64_t)p->sign << (f_size + e_size);
691 hi = deposit64(hi, f_size, e_size, p->exp);
692 hi = deposit64(hi, 0, f_size, p->frac_hi);
693 return make_float128(hi, p->frac_lo);
694 }
695
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705
/*
 * Size dispatch: select the parts128_* implementation when the pointer
 * argument P is a FloatParts128 *, and parts64_* otherwise.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)  PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* Same dispatch trick, for the per-size fraction helpers below. */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
750
751 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
752 {
753 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
754 }
755
756 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
757 {
758 uint64_t ta = a->frac_hi, tb = b->frac_hi;
759 if (ta == tb) {
760 ta = a->frac_lo, tb = b->frac_lo;
761 if (ta == tb) {
762 return 0;
763 }
764 }
765 return ta < tb ? -1 : 1;
766 }
767
768 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
769
/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}
774
775 static void frac128_clear(FloatParts128 *a)
776 {
777 a->frac_hi = a->frac_lo = 0;
778 }
779
780 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
781
782 static bool frac64_eqz(FloatParts64 *a)
783 {
784 return a->frac == 0;
785 }
786
787 static bool frac128_eqz(FloatParts128 *a)
788 {
789 return (a->frac_hi | a->frac_lo) == 0;
790 }
791
792 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
793
794 static int frac64_normalize(FloatParts64 *a)
795 {
796 if (a->frac) {
797 int shift = clz64(a->frac);
798 a->frac <<= shift;
799 return shift;
800 }
801 return 64;
802 }
803
/*
 * Left-justify the 128-bit fraction and return the shift count applied;
 * a zero fraction yields 128.  When only the low word is non-zero it is
 * promoted to the high word (shift count >= 64).
 */
static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        if (shl) {
            int shr = 64 - shl;
            /* Carry the top bits of the low word into the high word. */
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}
822
823 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
824
/*
 * Shift the fraction left/right by C bits.  NOTE(review): the 64-bit
 * variants rely on C being within [0, 63] — a wider shift on a uint64_t
 * is undefined behaviour in C; callers appear to guarantee this.
 */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)
848
849
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Pick the rounding increment, and whether overflow saturates to
         * the largest normal (overflow_norm) or produces infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Rounding carried out of the top bit; renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest representable normal. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Biased exponent <= 0: the result is subnormal (or zero). */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Shift the NaN payload into its packed position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
1003
1004
1005 #define partsN(NAME) parts64_##NAME
1006 #define FloatPartsN FloatParts64
1007
1008 #include "softfloat-parts.c.inc"
1009
1010 #undef partsN
1011 #undef FloatPartsN
1012 #define partsN(NAME) parts128_##NAME
1013 #define FloatPartsN FloatParts128
1014
1015 #include "softfloat-parts.c.inc"
1016
1017 #undef partsN
1018 #undef FloatPartsN
1019
1020 /*
1021 * Pack/unpack routines with a specific FloatFmt.
1022 */
1023
/*
 * Unpack the raw half-precision value @f into @p and canonicalize it.
 * @params selects the half format (callers pass &float16_params for
 * IEEE or &float16_params_ahp for the ARM alternative encoding).
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}
1030
/* Unpack an IEEE half-precision value into canonical FloatParts64 form. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}
1036
/* Unpack a bfloat16 value into canonical FloatParts64 form. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}
1043
/*
 * Round *@p for the half format described by @params and pack it into
 * a raw float16.  Note *@p is updated in place with the rounded parts.
 */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    *p = round_canonical(*p, s, params);
    return float16_pack_raw(p);
}
1051
/* Round and pack *@p as an IEEE half-precision value. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}
1057
/* Round and pack *@p as a bfloat16 value. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    *p = round_canonical(*p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}
1064
/* Unpack a single-precision value into canonical FloatParts64 form. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}
1071
/* Round and pack *@p as a single-precision value. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float32_params);
    return float32_pack_raw(p);
}
1078
/* Unpack a double-precision value into canonical FloatParts64 form. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}
1085
/* Round and pack *@p as a double-precision value. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float64_params);
    return float64_pack_raw(p);
}
1092
1093 /*
1094 * Returns the result of adding or subtracting the values of the
1095 * floating-point values `a' and `b'. The operation is performed
1096 * according to the IEC/IEEE Standard for Binary Floating-Point
1097 * Arithmetic.
1098 */
1099
static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    /* Folding @subtract into b's sign turns a - b into a + (-b). */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Order operands so the larger magnitude ends up in `a'. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* Result takes the sign of the larger operand. */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* Exact zero: positive except in round-down mode. */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the fraction msb is set again. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is -b; in this branch a_sign ^ 1 == b_sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align exponents; result keeps the larger one. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the fraction: shift back in, bump exp. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
1182
1183 /*
1184 * Returns the result of adding or subtracting the floating-point
1185 * values `a' and `b'. The operation is performed according to the
1186 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 */
1188
1189 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1190 {
1191 FloatParts64 pa, pb, pr;
1192
1193 float16_unpack_canonical(&pa, a, status);
1194 float16_unpack_canonical(&pb, b, status);
1195 pr = addsub_floats(pa, pb, false, status);
1196
1197 return float16_round_pack_canonical(&pr, status);
1198 }
1199
1200 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1201 {
1202 FloatParts64 pa, pb, pr;
1203
1204 float16_unpack_canonical(&pa, a, status);
1205 float16_unpack_canonical(&pb, b, status);
1206 pr = addsub_floats(pa, pb, true, status);
1207
1208 return float16_round_pack_canonical(&pr, status);
1209 }
1210
1211 static float32 QEMU_SOFTFLOAT_ATTR
1212 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1213 {
1214 FloatParts64 pa, pb, pr;
1215
1216 float32_unpack_canonical(&pa, a, status);
1217 float32_unpack_canonical(&pb, b, status);
1218 pr = addsub_floats(pa, pb, subtract, status);
1219
1220 return float32_round_pack_canonical(&pr, status);
1221 }
1222
/* Softfloat single-precision addition (subtract = false). */
static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}
1227
/* Softfloat single-precision subtraction (subtract = true). */
static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}
1232
1233 static float64 QEMU_SOFTFLOAT_ATTR
1234 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1235 {
1236 FloatParts64 pa, pb, pr;
1237
1238 float64_unpack_canonical(&pa, a, status);
1239 float64_unpack_canonical(&pb, b, status);
1240 pr = addsub_floats(pa, pb, subtract, status);
1241
1242 return float64_round_pack_canonical(&pr, status);
1243 }
1244
/* Softfloat double-precision addition (subtract = false). */
static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}
1249
/* Softfloat double-precision subtraction (subtract = true). */
static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}
1254
/* Host-FPU single-precision addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}
1259
/* Host-FPU single-precision subtraction for the hardfloat fast path. */
static float hard_f32_sub(float a, float b)
{
    return a - b;
}
1264
/* Host-FPU double-precision addition for the hardfloat fast path. */
static double hard_f64_add(double a, double b)
{
    return a + b;
}
1269
/* Host-FPU double-precision subtraction for the hardfloat fast path. */
static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1274
1275 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1276 {
1277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1278 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1279 }
1280 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1281 }
1282
1283 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1284 {
1285 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1286 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1287 } else {
1288 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1289 }
1290 }
1291
/*
 * Single-precision add/sub dispatcher: float32_gen2 selects between the
 * host-FPU @hard and the softfloat @soft implementation using the
 * f32_is_zon2 precondition and f32_addsubmul_post result check.
 */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}
1298
/*
 * Double-precision add/sub dispatcher: float64_gen2 selects between the
 * host-FPU @hard and the softfloat @soft implementation using the
 * f64_is_zon2 precondition and f64_addsubmul_post result check.
 */
static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
1305
/* Public single-precision addition entry point. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}
1311
/* Public single-precision subtraction entry point. */
float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}
1317
/* Public double-precision addition entry point. */
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}
1323
/* Public double-precision subtraction entry point. */
float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1329
1330 /*
1331 * Returns the result of adding or subtracting the bfloat16
1332 * values `a' and `b'.
1333 */
1334 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1335 {
1336 FloatParts64 pa, pb, pr;
1337
1338 bfloat16_unpack_canonical(&pa, a, status);
1339 bfloat16_unpack_canonical(&pb, b, status);
1340 pr = addsub_floats(pa, pb, false, status);
1341
1342 return bfloat16_round_pack_canonical(&pr, status);
1343 }
1344
1345 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1346 {
1347 FloatParts64 pa, pb, pr;
1348
1349 bfloat16_unpack_canonical(&pa, a, status);
1350 bfloat16_unpack_canonical(&pb, b, status);
1351 pr = addsub_floats(pa, pb, true, status);
1352
1353 return bfloat16_round_pack_canonical(&pr, status);
1354 }
1355
1356 /*
1357 * Returns the result of multiplying the floating-point values `a' and
1358 * `b'. The operation is performed according to the IEC/IEEE Standard
1359 * for Binary Floating-Point Arithmetic.
1360 */
1361
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    /* Sign of the product is the XOR of the operand signs. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 128-bit product of the two 64-bit fractions. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        /* Product of fractions in [1,2) is in [1,4); renormalize msb. */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low half into the sticky lsb. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1406
1407 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1408 {
1409 FloatParts64 pa, pb, pr;
1410
1411 float16_unpack_canonical(&pa, a, status);
1412 float16_unpack_canonical(&pb, b, status);
1413 pr = mul_floats(pa, pb, status);
1414
1415 return float16_round_pack_canonical(&pr, status);
1416 }
1417
1418 static float32 QEMU_SOFTFLOAT_ATTR
1419 soft_f32_mul(float32 a, float32 b, float_status *status)
1420 {
1421 FloatParts64 pa, pb, pr;
1422
1423 float32_unpack_canonical(&pa, a, status);
1424 float32_unpack_canonical(&pb, b, status);
1425 pr = mul_floats(pa, pb, status);
1426
1427 return float32_round_pack_canonical(&pr, status);
1428 }
1429
1430 static float64 QEMU_SOFTFLOAT_ATTR
1431 soft_f64_mul(float64 a, float64 b, float_status *status)
1432 {
1433 FloatParts64 pa, pb, pr;
1434
1435 float64_unpack_canonical(&pa, a, status);
1436 float64_unpack_canonical(&pb, b, status);
1437 pr = mul_floats(pa, pb, status);
1438
1439 return float64_round_pack_canonical(&pr, status);
1440 }
1441
/* Host-FPU single-precision multiplication for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}
1446
/* Host-FPU double-precision multiplication for the hardfloat fast path. */
static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1451
/* Public single-precision multiplication entry point (hard/soft dispatch). */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}
1458
/* Public double-precision multiplication entry point (hard/soft dispatch). */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1465
1466 /*
1467 * Returns the result of multiplying the bfloat16
1468 * values `a' and `b'.
1469 */
1470
1471 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1472 {
1473 FloatParts64 pa, pb, pr;
1474
1475 bfloat16_unpack_canonical(&pa, a, status);
1476 bfloat16_unpack_canonical(&pb, b, status);
1477 pr = mul_floats(pa, pb, status);
1478
1479 return bfloat16_round_pack_canonical(&pr, status);
1480 }
1481
1482 /*
1483 * Returns the result of multiplying the floating-point values `a' and
1484 * `b' then adding 'c', with no intermediate rounding step after the
1485 * multiplication. The operation is performed according to the
1486 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1487 * The flags argument allows the caller to select negation of the
1488 * addend, the intermediate product, or the final result. (The
1489 * difference between this and having the caller do a separate
1490 * negation is that negating externally will flip the sign bit on
1491 * NaNs.)
1492 */
1493
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let several operand classes be tested at once. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    if (inf_zero) {
        /* Inf * 0 + c with no NaN inputs: always invalid. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf is invalid. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Exact zero of mixed signs: + except in round-down. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two 64-bit fractions. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend dominates: shift the product down onto it. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product dominates: add the shifted addend in 128 bits. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero sign follows rounding mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold remaining low bits into the sticky lsb. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1667
1668 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1669 int flags, float_status *status)
1670 {
1671 FloatParts64 pa, pb, pc, pr;
1672
1673 float16_unpack_canonical(&pa, a, status);
1674 float16_unpack_canonical(&pb, b, status);
1675 float16_unpack_canonical(&pc, c, status);
1676 pr = muladd_floats(pa, pb, pc, flags, status);
1677
1678 return float16_round_pack_canonical(&pr, status);
1679 }
1680
1681 static float32 QEMU_SOFTFLOAT_ATTR
1682 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1683 float_status *status)
1684 {
1685 FloatParts64 pa, pb, pc, pr;
1686
1687 float32_unpack_canonical(&pa, a, status);
1688 float32_unpack_canonical(&pb, b, status);
1689 float32_unpack_canonical(&pc, c, status);
1690 pr = muladd_floats(pa, pb, pc, flags, status);
1691
1692 return float32_round_pack_canonical(&pr, status);
1693 }
1694
1695 static float64 QEMU_SOFTFLOAT_ATTR
1696 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1697 float_status *status)
1698 {
1699 FloatParts64 pa, pb, pc, pr;
1700
1701 float64_unpack_canonical(&pa, a, status);
1702 float64_unpack_canonical(&pb, b, status);
1703 float64_unpack_canonical(&pc, c, status);
1704 pr = muladd_floats(pa, pb, pc, flags, status);
1705
1706 return float64_round_pack_canonical(&pr, status);
1707 }
1708
1709 static bool force_soft_fma;
1710
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The status flags must permit use of the host FPU at all. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no host-FPU equivalent here. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path requires all three inputs to be zero or normal. */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    /* Testing hook to force the softfloat implementation. */
    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the (signed) zero product, then add the addend. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Keep originals in case we must fall back after negation. */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Tiny result: restore the inputs and take the soft path. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1781
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The status flags must permit use of the host FPU at all. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no host-FPU equivalent here. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path requires all three inputs to be zero or normal. */
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    /* Testing hook to force the softfloat implementation. */
    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build the (signed) zero product, then add the addend. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Keep originals in case we must fall back after negation. */
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the threshold is FLT_MIN, not DBL_MIN, so
             * far more double results than strictly necessary are sent
             * to the soft path.  This is conservative (never wrong,
             * only slower) — confirm it is intentional. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1852
1853 /*
1854 * Returns the result of multiplying the bfloat16 values `a'
1855 * and `b' then adding 'c', with no intermediate rounding step after the
1856 * multiplication.
1857 */
1858
1859 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1860 int flags, float_status *status)
1861 {
1862 FloatParts64 pa, pb, pc, pr;
1863
1864 bfloat16_unpack_canonical(&pa, a, status);
1865 bfloat16_unpack_canonical(&pb, b, status);
1866 bfloat16_unpack_canonical(&pc, c, status);
1867 pr = muladd_floats(pa, pb, pc, flags, status);
1868
1869 return bfloat16_round_pack_canonical(&pr, status);
1870 }
1871
1872 /*
1873 * Returns the result of dividing the floating-point value `a' by the
1874 * corresponding value `b'. The operation is performed according to
1875 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1876 */
1877
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    /* Sign of the quotient is the XOR of the operand signs. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}
1943
1944 float16 float16_div(float16 a, float16 b, float_status *status)
1945 {
1946 FloatParts64 pa, pb, pr;
1947
1948 float16_unpack_canonical(&pa, a, status);
1949 float16_unpack_canonical(&pb, b, status);
1950 pr = div_floats(pa, pb, status);
1951
1952 return float16_round_pack_canonical(&pr, status);
1953 }
1954
1955 static float32 QEMU_SOFTFLOAT_ATTR
1956 soft_f32_div(float32 a, float32 b, float_status *status)
1957 {
1958 FloatParts64 pa, pb, pr;
1959
1960 float32_unpack_canonical(&pa, a, status);
1961 float32_unpack_canonical(&pb, b, status);
1962 pr = div_floats(pa, pb, status);
1963
1964 return float32_round_pack_canonical(&pr, status);
1965 }
1966
1967 static float64 QEMU_SOFTFLOAT_ATTR
1968 soft_f64_div(float64 a, float64 b, float_status *status)
1969 {
1970 FloatParts64 pa, pb, pr;
1971
1972 float64_unpack_canonical(&pa, a, status);
1973 float64_unpack_canonical(&pb, b, status);
1974 pr = div_floats(pa, pb, status);
1975
1976 return float64_round_pack_canonical(&pr, status);
1977 }
1978
/* Host-FPU single-precision division for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}
1983
/* Host-FPU double-precision division for the hardfloat fast path. */
static double hard_f64_div(double a, double b)
{
    return a / b;
}
1988
1989 static bool f32_div_pre(union_float32 a, union_float32 b)
1990 {
1991 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1992 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1993 fpclassify(b.h) == FP_NORMAL;
1994 }
1995 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1996 }
1997
1998 static bool f64_div_pre(union_float64 a, union_float64 b)
1999 {
2000 if (QEMU_HARDFLOAT_2F64_USE_FP) {
2001 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
2002 fpclassify(b.h) == FP_NORMAL;
2003 }
2004 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
2005 }
2006
2007 static bool f32_div_post(union_float32 a, union_float32 b)
2008 {
2009 if (QEMU_HARDFLOAT_2F32_USE_FP) {
2010 return fpclassify(a.h) != FP_ZERO;
2011 }
2012 return !float32_is_zero(a.s);
2013 }
2014
2015 static bool f64_div_post(union_float64 a, union_float64 b)
2016 {
2017 if (QEMU_HARDFLOAT_2F64_USE_FP) {
2018 return fpclassify(a.h) != FP_ZERO;
2019 }
2020 return !float64_is_zero(a.s);
2021 }
2022
/* Public single-precision division entry point (hard/soft dispatch). */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}
2029
/* Public double-precision division entry point (hard/soft dispatch). */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
2036
2037 /*
2038 * Returns the result of dividing the bfloat16
2039 * value `a' by the corresponding value `b'.
2040 */
2041
2042 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2043 {
2044 FloatParts64 pa, pb, pr;
2045
2046 bfloat16_unpack_canonical(&pa, a, status);
2047 bfloat16_unpack_canonical(&pb, b, status);
2048 pr = div_floats(pa, pb, status);
2049
2050 return bfloat16_round_pack_canonical(&pr, status);
2051 }
2052
2053 /*
2054 * Float to Float conversions
2055 *
2056 * Returns the result of converting one float format to another. The
2057 * conversion is performed according to the IEC/IEEE Standard for
2058 * Binary Floating-Point Arithmetic.
2059 *
2060 * The float_to_float helper only needs to take care of raising
2061 * invalid exceptions and handling the conversion on NaNs.
2062 */
2063
2064 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
2065 float_status *s)
2066 {
2067 if (dstf->arm_althp) {
2068 switch (a.cls) {
2069 case float_class_qnan:
2070 case float_class_snan:
2071 /* There is no NaN in the destination format. Raise Invalid
2072 * and return a zero with the sign of the input NaN.
2073 */
2074 float_raise(float_flag_invalid, s);
2075 a.cls = float_class_zero;
2076 a.frac = 0;
2077 a.exp = 0;
2078 break;
2079
2080 case float_class_inf:
2081 /* There is no Inf in the destination format. Raise Invalid
2082 * and return the maximum normal with the correct sign.
2083 */
2084 float_raise(float_flag_invalid, s);
2085 a.cls = float_class_normal;
2086 a.exp = dstf->exp_max;
2087 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
2088 break;
2089
2090 default:
2091 break;
2092 }
2093 } else if (is_nan(a.cls)) {
2094 parts_return_nan(&a, s);
2095 }
2096 return a;
2097 }
2098
2099 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2100 {
2101 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2102 FloatParts64 pa, pr;
2103
2104 float16a_unpack_canonical(&pa, a, s, fmt16);
2105 pr = float_to_float(pa, &float32_params, s);
2106 return float32_round_pack_canonical(&pr, s);
2107 }
2108
2109 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2110 {
2111 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2112 FloatParts64 pa, pr;
2113
2114 float16a_unpack_canonical(&pa, a, s, fmt16);
2115 pr = float_to_float(pa, &float64_params, s);
2116 return float64_round_pack_canonical(&pr, s);
2117 }
2118
2119 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2120 {
2121 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2122 FloatParts64 pa, pr;
2123
2124 float32_unpack_canonical(&pa, a, s);
2125 pr = float_to_float(pa, fmt16, s);
2126 return float16a_round_pack_canonical(&pr, s, fmt16);
2127 }
2128
2129 static float64 QEMU_SOFTFLOAT_ATTR
2130 soft_float32_to_float64(float32 a, float_status *s)
2131 {
2132 FloatParts64 pa, pr;
2133
2134 float32_unpack_canonical(&pa, a, s);
2135 pr = float_to_float(pa, &float64_params, s);
2136 return float64_round_pack_canonical(&pr, s);
2137 }
2138
2139 float64 float32_to_float64(float32 a, float_status *s)
2140 {
2141 if (likely(float32_is_normal(a))) {
2142 /* Widening conversion can never produce inexact results. */
2143 union_float32 uf;
2144 union_float64 ud;
2145 uf.s = a;
2146 ud.h = uf.h;
2147 return ud.s;
2148 } else if (float32_is_zero(a)) {
2149 return float64_set_sign(float64_zero, float32_is_neg(a));
2150 } else {
2151 return soft_float32_to_float64(a, s);
2152 }
2153 }
2154
2155 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2156 {
2157 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2158 FloatParts64 pa, pr;
2159
2160 float64_unpack_canonical(&pa, a, s);
2161 pr = float_to_float(pa, fmt16, s);
2162 return float16a_round_pack_canonical(&pr, s, fmt16);
2163 }
2164
2165 float32 float64_to_float32(float64 a, float_status *s)
2166 {
2167 FloatParts64 pa, pr;
2168
2169 float64_unpack_canonical(&pa, a, s);
2170 pr = float_to_float(pa, &float32_params, s);
2171 return float32_round_pack_canonical(&pr, s);
2172 }
2173
2174 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2175 {
2176 FloatParts64 pa, pr;
2177
2178 bfloat16_unpack_canonical(&pa, a, s);
2179 pr = float_to_float(pa, &float32_params, s);
2180 return float32_round_pack_canonical(&pr, s);
2181 }
2182
2183 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2184 {
2185 FloatParts64 pa, pr;
2186
2187 bfloat16_unpack_canonical(&pa, a, s);
2188 pr = float_to_float(pa, &float64_params, s);
2189 return float64_round_pack_canonical(&pr, s);
2190 }
2191
2192 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2193 {
2194 FloatParts64 pa, pr;
2195
2196 float32_unpack_canonical(&pa, a, s);
2197 pr = float_to_float(pa, &bfloat16_params, s);
2198 return bfloat16_round_pack_canonical(&pr, s);
2199 }
2200
2201 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2202 {
2203 FloatParts64 pa, pr;
2204
2205 float64_unpack_canonical(&pa, a, s);
2206 pr = float_to_float(pa, &bfloat16_params, s);
2207 return bfloat16_round_pack_canonical(&pr, s);
2208 }
2209
2210 /*
2211 * Rounds the floating-point value `a' to an integer, and returns the
2212 * result as a floating-point value. The operation is performed
2213 * according to the IEC/IEEE Standard for Binary Floating-Point
2214 * Arithmetic.
2215 */
2216
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        /* NaN in, NaN out (possibly quieted/defaulted by the target). */
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so a.exp cannot overflow int. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /*
             * All fractional: the magnitude is in (0, 1), so the result
             * is either 0 or 1 depending on the rounding mode.
             */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* Rounds to 1 only when strictly above one half. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* Exactly one half rounds away from zero, to 1. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                /* Toward +inf: positive fractions become 1. */
                one = !a.sign;
                break;
            case float_round_down:
                /* Toward -inf: negative fractions become -1. */
                one = a.sign;
                break;
            case float_round_to_odd:
                /* Any nonzero fraction rounds to the odd value 1. */
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * Partly fractional.  frac_lsb is the bit of a.frac whose
             * weight is one; everything below rnd_mask is fractional.
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Suppress the increment on an exact tie to an even value. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Increment only when the integer LSB is currently even. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                /* Carry out of the fraction: renormalize by one bit. */
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the now-rounded-away fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2318
2319 float16 float16_round_to_int(float16 a, float_status *s)
2320 {
2321 FloatParts64 pa, pr;
2322
2323 float16_unpack_canonical(&pa, a, s);
2324 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2325 return float16_round_pack_canonical(&pr, s);
2326 }
2327
2328 float32 float32_round_to_int(float32 a, float_status *s)
2329 {
2330 FloatParts64 pa, pr;
2331
2332 float32_unpack_canonical(&pa, a, s);
2333 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2334 return float32_round_pack_canonical(&pr, s);
2335 }
2336
2337 float64 float64_round_to_int(float64 a, float_status *s)
2338 {
2339 FloatParts64 pa, pr;
2340
2341 float64_unpack_canonical(&pa, a, s);
2342 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2343 return float64_round_pack_canonical(&pr, s);
2344 }
2345
2346 /*
2347 * Rounds the bfloat16 value `a' to an integer, and returns the
2348 * result as a bfloat16 value.
2349 */
2350
2351 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2352 {
2353 FloatParts64 pa, pr;
2354
2355 bfloat16_unpack_canonical(&pa, a, s);
2356 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2357 return bfloat16_round_pack_canonical(&pr, s);
2358 }
2359
2360 /*
2361 * Returns the result of converting the floating-point value `a' to
2362 * the two's complement integer format. The conversion is performed
2363 * according to the IEC/IEEE Standard for Binary Floating-Point
2364 * Arithmetic---which means in particular that the conversion is
2365 * rounded according to the current rounding mode. If `a' is a NaN,
2366 * the largest positive integer is returned. Otherwise, if the
2367 * conversion overflows, the largest integer with the same sign as `a'
2368 * is returned.
2369 */
2370
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /*
     * Snapshot the flags so that any side effects of rounding (inexact)
     * can be discarded on the paths that instead raise invalid.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest positive integer, with invalid. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        /* Infinity saturates to the bound matching its sign, with invalid. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the fraction down so its binary point sits at bit 0. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude exceeds 64 bits: force saturation below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /*
             * -(uint64_t)min yields |min| in unsigned arithmetic, which
             * is well-defined even when min is INT64_MIN.
             */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2414
2415 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2416 float_status *s)
2417 {
2418 FloatParts64 p;
2419
2420 float16_unpack_canonical(&p, a, s);
2421 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2422 }
2423
2424 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2425 float_status *s)
2426 {
2427 FloatParts64 p;
2428
2429 float16_unpack_canonical(&p, a, s);
2430 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2431 }
2432
2433 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2434 float_status *s)
2435 {
2436 FloatParts64 p;
2437
2438 float16_unpack_canonical(&p, a, s);
2439 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2440 }
2441
2442 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2443 float_status *s)
2444 {
2445 FloatParts64 p;
2446
2447 float16_unpack_canonical(&p, a, s);
2448 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2449 }
2450
2451 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2452 float_status *s)
2453 {
2454 FloatParts64 p;
2455
2456 float32_unpack_canonical(&p, a, s);
2457 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2458 }
2459
2460 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2461 float_status *s)
2462 {
2463 FloatParts64 p;
2464
2465 float32_unpack_canonical(&p, a, s);
2466 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2467 }
2468
2469 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2470 float_status *s)
2471 {
2472 FloatParts64 p;
2473
2474 float32_unpack_canonical(&p, a, s);
2475 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2476 }
2477
2478 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2479 float_status *s)
2480 {
2481 FloatParts64 p;
2482
2483 float64_unpack_canonical(&p, a, s);
2484 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2485 }
2486
2487 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2488 float_status *s)
2489 {
2490 FloatParts64 p;
2491
2492 float64_unpack_canonical(&p, a, s);
2493 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2494 }
2495
2496 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2497 float_status *s)
2498 {
2499 FloatParts64 p;
2500
2501 float64_unpack_canonical(&p, a, s);
2502 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2503 }
2504
2505 int8_t float16_to_int8(float16 a, float_status *s)
2506 {
2507 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2508 }
2509
2510 int16_t float16_to_int16(float16 a, float_status *s)
2511 {
2512 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2513 }
2514
2515 int32_t float16_to_int32(float16 a, float_status *s)
2516 {
2517 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2518 }
2519
2520 int64_t float16_to_int64(float16 a, float_status *s)
2521 {
2522 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2523 }
2524
2525 int16_t float32_to_int16(float32 a, float_status *s)
2526 {
2527 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2528 }
2529
2530 int32_t float32_to_int32(float32 a, float_status *s)
2531 {
2532 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2533 }
2534
2535 int64_t float32_to_int64(float32 a, float_status *s)
2536 {
2537 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2538 }
2539
2540 int16_t float64_to_int16(float64 a, float_status *s)
2541 {
2542 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2543 }
2544
2545 int32_t float64_to_int32(float64 a, float_status *s)
2546 {
2547 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2548 }
2549
2550 int64_t float64_to_int64(float64 a, float_status *s)
2551 {
2552 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2553 }
2554
2555 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2556 {
2557 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2558 }
2559
2560 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2561 {
2562 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2563 }
2564
2565 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2566 {
2567 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2568 }
2569
2570 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2571 {
2572 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2573 }
2574
2575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2576 {
2577 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2578 }
2579
2580 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2581 {
2582 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2583 }
2584
2585 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2586 {
2587 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2588 }
2589
2590 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2591 {
2592 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2593 }
2594
2595 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2596 {
2597 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2598 }
2599
2600 /*
2601 * Returns the result of converting the floating-point value `a' to
2602 * the two's complement integer format.
2603 */
2604
2605 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2606 float_status *s)
2607 {
2608 FloatParts64 p;
2609
2610 bfloat16_unpack_canonical(&p, a, s);
2611 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2612 }
2613
2614 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2615 float_status *s)
2616 {
2617 FloatParts64 p;
2618
2619 bfloat16_unpack_canonical(&p, a, s);
2620 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2621 }
2622
2623 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2624 float_status *s)
2625 {
2626 FloatParts64 p;
2627
2628 bfloat16_unpack_canonical(&p, a, s);
2629 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2630 }
2631
2632 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2633 {
2634 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2635 }
2636
2637 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2638 {
2639 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2640 }
2641
2642 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2643 {
2644 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2645 }
2646
2647 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2648 {
2649 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2650 }
2651
2652 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2653 {
2654 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2655 }
2656
2657 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2658 {
2659 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2660 }
2661
2662 /*
2663 * Returns the result of converting the floating-point value `a' to
2664 * the unsigned integer format. The conversion is performed according
2665 * to the IEC/IEEE Standard for Binary Floating-Point
2666 * Arithmetic---which means in particular that the conversion is
2667 * rounded according to the current rounding mode. If `a' is a NaN,
2668 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
2673 */
2674
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /*
     * Snapshot the flags so that any side effects of rounding (inexact)
     * can be discarded on the paths that instead raise invalid.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest unsigned integer, with invalid. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        /*
         * Includes negative inputs that rounded to zero: no invalid,
         * and any inexact raised by the rounding step is retained.
         */
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative value that did not round to zero: invalid, clamp. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            /* Shift the fraction down so its binary point sits at bit 0. */
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2719
2720 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2721 float_status *s)
2722 {
2723 FloatParts64 p;
2724
2725 float16_unpack_canonical(&p, a, s);
2726 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2727 }
2728
2729 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2730 float_status *s)
2731 {
2732 FloatParts64 p;
2733
2734 float16_unpack_canonical(&p, a, s);
2735 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2736 }
2737
2738 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2739 float_status *s)
2740 {
2741 FloatParts64 p;
2742
2743 float16_unpack_canonical(&p, a, s);
2744 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2745 }
2746
2747 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2748 float_status *s)
2749 {
2750 FloatParts64 p;
2751
2752 float16_unpack_canonical(&p, a, s);
2753 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2754 }
2755
2756 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2757 float_status *s)
2758 {
2759 FloatParts64 p;
2760
2761 float32_unpack_canonical(&p, a, s);
2762 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2763 }
2764
2765 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2766 float_status *s)
2767 {
2768 FloatParts64 p;
2769
2770 float32_unpack_canonical(&p, a, s);
2771 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2772 }
2773
2774 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2775 float_status *s)
2776 {
2777 FloatParts64 p;
2778
2779 float32_unpack_canonical(&p, a, s);
2780 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2781 }
2782
2783 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2784 float_status *s)
2785 {
2786 FloatParts64 p;
2787
2788 float64_unpack_canonical(&p, a, s);
2789 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2790 }
2791
2792 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2793 float_status *s)
2794 {
2795 FloatParts64 p;
2796
2797 float64_unpack_canonical(&p, a, s);
2798 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2799 }
2800
2801 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2802 float_status *s)
2803 {
2804 FloatParts64 p;
2805
2806 float64_unpack_canonical(&p, a, s);
2807 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2808 }
2809
2810 uint8_t float16_to_uint8(float16 a, float_status *s)
2811 {
2812 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2813 }
2814
2815 uint16_t float16_to_uint16(float16 a, float_status *s)
2816 {
2817 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2818 }
2819
2820 uint32_t float16_to_uint32(float16 a, float_status *s)
2821 {
2822 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2823 }
2824
2825 uint64_t float16_to_uint64(float16 a, float_status *s)
2826 {
2827 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2828 }
2829
2830 uint16_t float32_to_uint16(float32 a, float_status *s)
2831 {
2832 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2833 }
2834
2835 uint32_t float32_to_uint32(float32 a, float_status *s)
2836 {
2837 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2838 }
2839
2840 uint64_t float32_to_uint64(float32 a, float_status *s)
2841 {
2842 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2843 }
2844
2845 uint16_t float64_to_uint16(float64 a, float_status *s)
2846 {
2847 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2848 }
2849
2850 uint32_t float64_to_uint32(float64 a, float_status *s)
2851 {
2852 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2853 }
2854
2855 uint64_t float64_to_uint64(float64 a, float_status *s)
2856 {
2857 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2858 }
2859
2860 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2861 {
2862 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2863 }
2864
2865 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2866 {
2867 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2868 }
2869
2870 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2871 {
2872 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2873 }
2874
2875 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2876 {
2877 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2878 }
2879
2880 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2881 {
2882 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2883 }
2884
2885 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2886 {
2887 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2888 }
2889
2890 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2891 {
2892 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2893 }
2894
2895 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2896 {
2897 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2898 }
2899
2900 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2901 {
2902 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2903 }
2904
2905 /*
2906 * Returns the result of converting the bfloat16 value `a' to
2907 * the unsigned integer format.
2908 */
2909
2910 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2911 int scale, float_status *s)
2912 {
2913 FloatParts64 p;
2914
2915 bfloat16_unpack_canonical(&p, a, s);
2916 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2917 }
2918
2919 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2920 int scale, float_status *s)
2921 {
2922 FloatParts64 p;
2923
2924 bfloat16_unpack_canonical(&p, a, s);
2925 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2926 }
2927
2928 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2929 int scale, float_status *s)
2930 {
2931 FloatParts64 p;
2932
2933 bfloat16_unpack_canonical(&p, a, s);
2934 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2935 }
2936
2937 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2938 {
2939 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2940 }
2941
2942 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2943 {
2944 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2945 }
2946
2947 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2948 {
2949 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2950 }
2951
2952 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2953 {
2954 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2955 }
2956
2957 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2958 {
2959 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2960 }
2961
2962 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2963 {
2964 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2965 }
2966
2967 /*
2968 * Integer to float conversions
2969 *
2970 * Returns the result of converting the two's complement integer `a'
2971 * to the floating-point format. The conversion is performed according
2972 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2973 */
2974
2975 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2976 {
2977 FloatParts64 r = { .sign = false };
2978
2979 if (a == 0) {
2980 r.cls = float_class_zero;
2981 } else {
2982 uint64_t f = a;
2983 int shift;
2984
2985 r.cls = float_class_normal;
2986 if (a < 0) {
2987 f = -f;
2988 r.sign = true;
2989 }
2990 shift = clz64(f);
2991 scale = MIN(MAX(scale, -0x10000), 0x10000);
2992
2993 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2994 r.frac = f << shift;
2995 }
2996
2997 return r;
2998 }
2999
3000 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
3001 {
3002 FloatParts64 pa = int_to_float(a, scale, status);
3003 return float16_round_pack_canonical(&pa, status);
3004 }
3005
3006 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
3007 {
3008 return int64_to_float16_scalbn(a, scale, status);
3009 }
3010
3011 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
3012 {
3013 return int64_to_float16_scalbn(a, scale, status);
3014 }
3015
3016 float16 int64_to_float16(int64_t a, float_status *status)
3017 {
3018 return int64_to_float16_scalbn(a, 0, status);
3019 }
3020
3021 float16 int32_to_float16(int32_t a, float_status *status)
3022 {
3023 return int64_to_float16_scalbn(a, 0, status);
3024 }
3025
3026 float16 int16_to_float16(int16_t a, float_status *status)
3027 {
3028 return int64_to_float16_scalbn(a, 0, status);
3029 }
3030
3031 float16 int8_to_float16(int8_t a, float_status *status)
3032 {
3033 return int64_to_float16_scalbn(a, 0, status);
3034 }
3035
3036 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3037 {
3038 FloatParts64 pa = int_to_float(a, scale, status);
3039 return float32_round_pack_canonical(&pa, status);
3040 }
3041
3042 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3043 {
3044 return int64_to_float32_scalbn(a, scale, status);
3045 }
3046
3047 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3048 {
3049 return int64_to_float32_scalbn(a, scale, status);
3050 }
3051
3052 float32 int64_to_float32(int64_t a, float_status *status)
3053 {
3054 return int64_to_float32_scalbn(a, 0, status);
3055 }
3056
3057 float32 int32_to_float32(int32_t a, float_status *status)
3058 {
3059 return int64_to_float32_scalbn(a, 0, status);
3060 }
3061
3062 float32 int16_to_float32(int16_t a, float_status *status)
3063 {
3064 return int64_to_float32_scalbn(a, 0, status);
3065 }
3066
3067 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3068 {
3069 FloatParts64 pa = int_to_float(a, scale, status);
3070 return float64_round_pack_canonical(&pa, status);
3071 }
3072
3073 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3074 {
3075 return int64_to_float64_scalbn(a, scale, status);
3076 }
3077
3078 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3079 {
3080 return int64_to_float64_scalbn(a, scale, status);
3081 }
3082
3083 float64 int64_to_float64(int64_t a, float_status *status)
3084 {
3085 return int64_to_float64_scalbn(a, 0, status);
3086 }
3087
3088 float64 int32_to_float64(int32_t a, float_status *status)
3089 {
3090 return int64_to_float64_scalbn(a, 0, status);
3091 }
3092
3093 float64 int16_to_float64(int16_t a, float_status *status)
3094 {
3095 return int64_to_float64_scalbn(a, 0, status);
3096 }
3097
3098 /*
3099 * Returns the result of converting the two's complement integer `a'
3100 * to the bfloat16 format.
3101 */
3102
3103 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3104 {
3105 FloatParts64 pa = int_to_float(a, scale, status);
3106 return bfloat16_round_pack_canonical(&pa, status);
3107 }
3108
3109 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3110 {
3111 return int64_to_bfloat16_scalbn(a, scale, status);
3112 }
3113
3114 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3115 {
3116 return int64_to_bfloat16_scalbn(a, scale, status);
3117 }
3118
3119 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3120 {
3121 return int64_to_bfloat16_scalbn(a, 0, status);
3122 }
3123
3124 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3125 {
3126 return int64_to_bfloat16_scalbn(a, 0, status);
3127 }
3128
3129 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3130 {
3131 return int64_to_bfloat16_scalbn(a, 0, status);
3132 }
3133
3134 /*
3135 * Unsigned Integer to float conversions
3136 *
3137 * Returns the result of converting the unsigned integer `a' to the
3138 * floating-point format. The conversion is performed according to the
3139 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 */
3141
3142 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3143 {
3144 FloatParts64 r = { .sign = false };
3145 int shift;
3146
3147 if (a == 0) {
3148 r.cls = float_class_zero;
3149 } else {
3150 scale = MIN(MAX(scale, -0x10000), 0x10000);
3151 shift = clz64(a);
3152 r.cls = float_class_normal;
3153 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3154 r.frac = a << shift;
3155 }
3156
3157 return r;
3158 }
3159
3160 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3161 {
3162 FloatParts64 pa = uint_to_float(a, scale, status);
3163 return float16_round_pack_canonical(&pa, status);
3164 }
3165
3166 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3167 {
3168 return uint64_to_float16_scalbn(a, scale, status);
3169 }
3170
3171 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3172 {
3173 return uint64_to_float16_scalbn(a, scale, status);
3174 }
3175
3176 float16 uint64_to_float16(uint64_t a, float_status *status)
3177 {
3178 return uint64_to_float16_scalbn(a, 0, status);
3179 }
3180
3181 float16 uint32_to_float16(uint32_t a, float_status *status)
3182 {
3183 return uint64_to_float16_scalbn(a, 0, status);
3184 }
3185
3186 float16 uint16_to_float16(uint16_t a, float_status *status)
3187 {
3188 return uint64_to_float16_scalbn(a, 0, status);
3189 }
3190
3191 float16 uint8_to_float16(uint8_t a, float_status *status)
3192 {
3193 return uint64_to_float16_scalbn(a, 0, status);
3194 }
3195
/*
 * Convert the unsigned integer `a', scaled by 2**scale, to float32,
 * honouring the rounding mode in `status'.
 */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

/* Narrower unsigned inputs widen losslessly to uint64_t first. */
float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

/* Unscaled (scale == 0) conversion entry points. */
float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
3226
/*
 * Convert the unsigned integer `a', scaled by 2**scale, to float64,
 * honouring the rounding mode in `status'.
 */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

/* Narrower unsigned inputs widen losslessly to uint64_t first. */
float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

/* Unscaled (scale == 0) conversion entry points. */
float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3257
3258 /*
3259 * Returns the result of converting the unsigned integer `a' to the
3260 * bfloat16 format.
3261 */
3262
/*
 * Convert the unsigned integer `a', scaled by 2**scale, to bfloat16,
 * honouring the rounding mode in `status'.
 */
bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

/* Narrower unsigned inputs widen losslessly to uint64_t first. */
bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

/* Unscaled (scale == 0) conversion entry points. */
bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3293
3294 /* Float Min/Max */
3295 /* min() and max() functions. These can't be implemented as
3296 * 'compare and pick one input' because that would mishandle
3297 * NaNs and +0 vs -0.
3298 *
3299 * minnum() and maxnum() functions. These are similar to the min()
3300 * and max() functions but if one of the arguments is a QNaN and
3301 * the other is numerical then the numerical argument is returned.
3302 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3304 * and maxNum() operations. min() and max() are the typical min/max
3305 * semantics provided by many CPUs which predate that specification.
3306 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3309 */
/*
 * Shared worker for all min/max flavours.  `ismin' selects min vs max;
 * `ieee' selects the IEEE 754-2008 minNum/maxNum treatment of quiet
 * NaNs; `ismag' compares magnitudes first (minNumMag/maxNumMag),
 * falling back to signed comparison only on equal magnitudes.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both operands NaN, or non-IEEE semantics: propagate a NaN. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map each class onto a comparable exponent: infinities sort
         * above all finite values, zeroes below.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare ignores the signs unless magnitudes tie. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* For negative values the magnitude ordering is inverted. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the negative operand is the smaller,
             * which also orders -0 below +0.
             */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3380
3381 #define MINMAX(sz, name, ismin, isiee, ismag) \
3382 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3383 float_status *s) \
3384 { \
3385 FloatParts64 pa, pb, pr; \
3386 float ## sz ## _unpack_canonical(&pa, a, s); \
3387 float ## sz ## _unpack_canonical(&pb, b, s); \
3388 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3389 return float ## sz ## _round_pack_canonical(&pr, s); \
3390 }
3391
3392 MINMAX(16, min, true, false, false)
3393 MINMAX(16, minnum, true, true, false)
3394 MINMAX(16, minnummag, true, true, true)
3395 MINMAX(16, max, false, false, false)
3396 MINMAX(16, maxnum, false, true, false)
3397 MINMAX(16, maxnummag, false, true, true)
3398
3399 MINMAX(32, min, true, false, false)
3400 MINMAX(32, minnum, true, true, false)
3401 MINMAX(32, minnummag, true, true, true)
3402 MINMAX(32, max, false, false, false)
3403 MINMAX(32, maxnum, false, true, false)
3404 MINMAX(32, maxnummag, false, true, true)
3405
3406 MINMAX(64, min, true, false, false)
3407 MINMAX(64, minnum, true, true, false)
3408 MINMAX(64, minnummag, true, true, true)
3409 MINMAX(64, max, false, false, false)
3410 MINMAX(64, maxnum, false, true, false)
3411 MINMAX(64, maxnummag, false, true, true)
3412
3413 #undef MINMAX
3414
3415 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3416 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3417 { \
3418 FloatParts64 pa, pb, pr; \
3419 bfloat16_unpack_canonical(&pa, a, s); \
3420 bfloat16_unpack_canonical(&pb, b, s); \
3421 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3422 return bfloat16_round_pack_canonical(&pr, s); \
3423 }
3424
3425 BF16_MINMAX(min, true, false, false)
3426 BF16_MINMAX(minnum, true, true, false)
3427 BF16_MINMAX(minnummag, true, true, true)
3428 BF16_MINMAX(max, false, false, false)
3429 BF16_MINMAX(maxnum, false, true, false)
3430 BF16_MINMAX(maxnummag, false, true, true)
3431
3432 #undef BF16_MINMAX
3433
3434 /* Floating point compare */
/*
 * Compare two canonicalized values.  With `is_quiet' set only
 * signaling NaNs raise the invalid flag; otherwise any NaN does.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            /* Zeroes compare equal regardless of sign: +0 == -0. */
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both finite and non-zero from here on. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponent then fraction, inverting the
     * result when both values are negative.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3491
/*
 * Instantiate a soft comparison helper for the given float width;
 * `attr' supplies the inlining attribute for the definition.
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3507
/* Signaling comparison: any NaN operand raises the invalid flag. */
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
3517
/*
 * Hardfloat fast path for float32 comparison: use the host's C99
 * comparison macros when permitted; unordered operands fall through
 * to the soft implementation so NaN flag handling stays in one place.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3546
/* Signaling comparison: any NaN operand raises the invalid flag. */
FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
3556
/*
 * Hardfloat fast path for float64 comparison: use the host's C99
 * comparison macros when permitted; unordered operands fall through
 * to the soft implementation so NaN flag handling stays in one place.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3585
/* Signaling comparison: any NaN operand raises the invalid flag. */
FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
3595
/* bfloat16 comparison: no hardfloat fast path, always soft. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}
3605
/* Signaling comparison: any NaN operand raises the invalid flag. */
FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
3615
3616 /* Multiply A by 2 raised to the power N. */
3617 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3618 {
3619 if (unlikely(is_nan(a.cls))) {
3620 parts_return_nan(&a, s);
3621 }
3622 if (a.cls == float_class_normal) {
3623 /* The largest float type (even though not supported by FloatParts64)
3624 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3625 * still allows rounding to infinity, without allowing overflow
3626 * within the int32_t that backs FloatParts64.exp.
3627 */
3628 n = MIN(MAX(n, -0x10000), 0x10000);
3629 a.exp += n;
3630 }
3631 return a;
3632 }
3633
/* Public scalbn entry points: unpack, scale by 2**n, round, repack. */
float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}
3669
3670 /*
3671 * Square Root
3672 *
3673 * The old softfloat code did an approximation step before zeroing in
3674 * on the final result. However for simpleness we just compute the
3675 * square root by iterating down from the implicit bit to enough extra
3676 * bits to ensure we get a correctly rounded result.
3677 *
3678 * This does mean however the calculation is slower than before,
3679 * especially for 64 bit floats.
3680 */
3681
/*
 * Bit-by-bit square root of a decomposed value.  `p' supplies the
 * target format's frac_shift, so iteration can stop once enough
 * bits for correct rounding of that format have been produced.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        /* Propagate (and quieten) NaN inputs. */
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative non-zero value is an invalid operation. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3741
/* float16 sqrt: always soft, no hardfloat fast path. */
float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

/* Soft fallbacks used by the hardfloat sqrt wrappers below. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}
3770
/*
 * Hardfloat fast path for float32 sqrt: use the host sqrtf() when
 * the input is a non-negative zero or normal, otherwise fall back
 * to the soft implementation (which also handles flag raising).
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3797
/*
 * Hardfloat fast path for float64 sqrt: use the host sqrt() when
 * the input is a non-negative zero or normal, otherwise fall back
 * to the soft implementation (which also handles flag raising).
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU representation. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3824
/* bfloat16 sqrt: always soft, no hardfloat fast path. */
bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}
3833
3834 /*----------------------------------------------------------------------------
3835 | The pattern for a default generated NaN.
3836 *----------------------------------------------------------------------------*/
3837
float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* parts_default_nan leaves the fraction in canonical position;
     * shift it down into the format's storage position before packing.
     */
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    /* 128-bit fractions use the two-word shift helper. */
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}
3882
3883 /*----------------------------------------------------------------------------
3884 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3885 *----------------------------------------------------------------------------*/
3886
/*
 * Each variant shifts the raw fraction up into canonical position,
 * quietens the NaN there, then shifts back before repacking.
 */
float16 float16_silence_nan(float16 a, float_status *status)
{
    FloatParts64 p;

    float16_unpack_raw(&p, a);
    p.frac <<= float16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_silence_nan(float32 a, float_status *status)
{
    FloatParts64 p;

    float32_unpack_raw(&p, a);
    p.frac <<= float32_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_silence_nan(float64 a, float_status *status)
{
    FloatParts64 p;

    float64_unpack_raw(&p, a);
    p.frac <<= float64_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
{
    FloatParts64 p;

    bfloat16_unpack_raw(&p, a);
    p.frac <<= bfloat16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

float128 float128_silence_nan(float128 a, float_status *status)
{
    FloatParts128 p;

    float128_unpack_raw(&p, a);
    /* 128-bit fractions use the two-word shift helpers. */
    frac_shl(&p, float128_params.frac_shift);
    parts_silence_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}
3941
3942 /*----------------------------------------------------------------------------
3943 | If `a' is denormal and we are in flush-to-zero mode then set the
3944 | input-denormal exception and return zero. Otherwise just return the value.
3945 *----------------------------------------------------------------------------*/
3946
3947 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3948 {
3949 if (p.exp == 0 && p.frac != 0) {
3950 float_raise(float_flag_input_denormal, status);
3951 return true;
3952 }
3953
3954 return false;
3955 }
3956
/* Replace a denormal input by a zero of the same sign when the
 * status asks for inputs to be flushed; otherwise pass through.
 */
float16 float16_squash_input_denormal(float16 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float16_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            return float16_set_sign(float16_zero, p.sign);
        }
    }
    return a;
}

float32 float32_squash_input_denormal(float32 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float32_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            return float32_set_sign(float32_zero, p.sign);
        }
    }
    return a;
}

float64 float64_squash_input_denormal(float64 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        float64_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            return float64_set_sign(float64_zero, p.sign);
        }
    }
    return a;
}

bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        FloatParts64 p;

        bfloat16_unpack_raw(&p, a);
        if (parts_squash_denormal(p, status)) {
            return bfloat16_set_sign(bfloat16_zero, p.sign);
        }
    }
    return a;
}
4008
4009 /*----------------------------------------------------------------------------
4010 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
4011 | and 7, and returns the properly rounded 32-bit integer corresponding to the
4012 | input. If `zSign' is 1, the input is negated before being converted to an
4013 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
4014 | is simply rounded to an integer, with the inexact exception raised if the
4015 | input cannot be represented exactly as an integer. However, if the fixed-
4016 | point input is too large, the invalid exception is raised and the largest
4017 | positive or negative integer is returned.
4018 *----------------------------------------------------------------------------*/
4019
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The binary point lies between bits 6 and 7, so 0x40 is the
     * half-way point and 0x7f rounds away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Increment only if the result's bit 0 would otherwise be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly half-way: clear bit 0 to round to even. */
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of range if high bits survive the shift, or the sign of the
     * result disagrees with the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4067
4068 /*----------------------------------------------------------------------------
4069 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4070 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4071 | and returns the properly rounded 64-bit integer corresponding to the input.
4072 | If `zSign' is 1, the input is negated before being converted to an integer.
4073 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4074 | the inexact exception raised if the input cannot be represented exactly as
4075 | an integer. However, if the fixed-point input is too large, the invalid
4076 | exception is raised and the largest positive or negative integer is
4077 | returned.
4078 *----------------------------------------------------------------------------*/
4079
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the bits below the binary point; its top bit marks
     * the half-way point.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        /* Increment only if the result's bit 0 would otherwise be even. */
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* Exactly half-way: clear bit 0 to round to even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
4129
4130 /*----------------------------------------------------------------------------
4131 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4132 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4133 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4134 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4135 | with the inexact exception raised if the input cannot be represented exactly
4136 | as an integer. However, if the fixed-point input is too large, the invalid
4137 | exception is raised and the largest unsigned integer is returned.
4138 *----------------------------------------------------------------------------*/
4139
/* NOTE(review): the return type is int64_t even though the result is
 * logically unsigned; callers appear to rely on the bit pattern
 * surviving the implicit conversion -- confirm before changing.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the bits below the binary point; its top bit marks
     * the half-way point.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Wrapped past UINT64_MAX: result does not fit. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exactly half-way: clear bit 0 to round to even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4189
4190 /*----------------------------------------------------------------------------
4191 | Normalizes the subnormal single-precision floating-point value represented
4192 | by the denormalized significand `aSig'. The normalized exponent and
4193 | significand are stored at the locations pointed to by `zExpPtr' and
4194 | `zSigPtr', respectively.
4195 *----------------------------------------------------------------------------*/
4196
4197 static void
4198 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4199 {
4200 int8_t shiftCount;
4201
4202 shiftCount = clz32(aSig) - 8;
4203 *zSigPtr = aSig<<shiftCount;
4204 *zExpPtr = 1 - shiftCount;
4205
4206 }
4207
4208 /*----------------------------------------------------------------------------
4209 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4210 | and significand `zSig', and returns the proper single-precision floating-
4211 | point value corresponding to the abstract input. Ordinarily, the abstract
4212 | value is simply rounded and packed into the single-precision format, with
4213 | the inexact exception raised if the abstract input cannot be represented
4214 | exactly. However, if the abstract value is too large, the overflow and
4215 | inexact exceptions are raised and an infinity or maximal finite value is
4216 | returned. If the abstract value is too small, the input value is rounded to
4217 | a subnormal number, and the underflow and inexact exceptions are raised if
4218 | the abstract input cannot be represented exactly as a subnormal single-
4219 | precision floating-point number.
4220 | The input significand `zSig' has its binary point between bits 30
4221 | and 29, which is 7 bits to the left of the usual location. This shifted
4222 | significand must be normalized or smaller. If `zSig' is not normalized,
4223 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4224 | and it must not require rounding. In the usual case that `zSig' is
4225 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4226 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4227 | Binary Floating-Point Arithmetic.
4228 *----------------------------------------------------------------------------*/
4229
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The binary point is 7 bits above the usual position, so 0x40 is
     * the half-way point and 0x7f rounds away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned cast makes this one test catch both overflow
     * (zExp too large) and underflow (zExp negative, wrapped).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd and truncating modes overflow to the
             * maximal finite value rather than infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly half-way: clear bit 0 to round to even. */
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4306
4307 /*----------------------------------------------------------------------------
4308 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4309 | and significand `zSig', and returns the proper single-precision floating-
4310 | point value corresponding to the abstract input. This routine is just like
4311 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4312 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4313 | floating-point exponent.
4314 *----------------------------------------------------------------------------*/
4315
4316 static float32
4317 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4318 float_status *status)
4319 {
4320 int8_t shiftCount;
4321
4322 shiftCount = clz32(zSig) - 1;
4323 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4324 status);
4325
4326 }
4327
4328 /*----------------------------------------------------------------------------
4329 | Normalizes the subnormal double-precision floating-point value represented
4330 | by the denormalized significand `aSig'. The normalized exponent and
4331 | significand are stored at the locations pointed to by `zExpPtr' and
4332 | `zSigPtr', respectively.
4333 *----------------------------------------------------------------------------*/
4334
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* A normal double significand carries its leading 1 at bit 52, i.e.
     * exactly 11 leading zero bits; shift until that holds and derive the
     * exponent from how far we had to go. */
    int8_t shift = clz64(aSig) - 11;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4345
4346 /*----------------------------------------------------------------------------
4347 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4348 | double-precision floating-point value, returning the result. After being
4349 | shifted into the proper positions, the three fields are simply added
4350 | together to form the result. This means that any integer portion of `zSig'
4351 | will be added into the exponent. Since a properly normalized significand
4352 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4353 | than the desired result exponent whenever `zSig' is a complete, normalized
4354 | significand.
4355 *----------------------------------------------------------------------------*/
4356
4357 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4358 {
4359
4360 return make_float64(
4361 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4362
4363 }
4364
4365 /*----------------------------------------------------------------------------
4366 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4367 | and significand `zSig', and returns the proper double-precision floating-
4368 | point value corresponding to the abstract input. Ordinarily, the abstract
4369 | value is simply rounded and packed into the double-precision format, with
4370 | the inexact exception raised if the abstract input cannot be represented
4371 | exactly. However, if the abstract value is too large, the overflow and
4372 | inexact exceptions are raised and an infinity or maximal finite value is
4373 | returned. If the abstract value is too small, the input value is rounded to
4374 | a subnormal number, and the underflow and inexact exceptions are raised if
4375 | the abstract input cannot be represented exactly as a subnormal double-
4376 | precision floating-point number.
4377 | The input significand `zSig' has its binary point between bits 62
4378 | and 61, which is 10 bits to the left of the usual location. This shifted
4379 | significand must be normalized or smaller. If `zSig' is not normalized,
4380 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4381 | and it must not require rounding. In the usual case that `zSig' is
4382 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4383 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4384 | Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The bottom 10 bits of zSig will be discarded; pick the value added
     * before truncation: 0x200 is a half-ulp (nearest / ties-away, with
     * the ties-to-even fix-up applied after the shift), 0x3ff rounds away
     * from zero in the selected direction, 0 truncates. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when that would make the kept low bit odd. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare catches both overflow (large zExp) and
     * underflow (negative zExp wraps to a huge uint16_t value). */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: produce infinity, or the largest finite value when
             * the effective rounding for this sign is toward zero
             * (round-to-odd, or a zero increment). */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess before rounding: always tiny here.  After rounding:
             * not tiny only if the increment carries the significand up
             * into the smallest normal (bit 63 set). */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: an exact tie (roundBits == 0x200) was rounded up
     * above; clear the low bit to land on the even neighbour instead. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4462
4463 /*----------------------------------------------------------------------------
4464 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4465 | and significand `zSig', and returns the proper double-precision floating-
4466 | point value corresponding to the abstract input. This routine is just like
4467 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4468 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4469 | floating-point exponent.
4470 *----------------------------------------------------------------------------*/
4471
4472 static float64
4473 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4474 float_status *status)
4475 {
4476 int8_t shiftCount;
4477
4478 shiftCount = clz64(zSig) - 1;
4479 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4480 status);
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Normalizes the subnormal extended double-precision floating-point value
4486 | represented by the denormalized significand `aSig'. The normalized exponent
4487 | and significand are stored at the locations pointed to by `zExpPtr' and
4488 | `zSigPtr', respectively.
4489 *----------------------------------------------------------------------------*/
4490
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* The extended format's integer bit is explicit and must end up in
     * bit 63; the exponent records how far the value was shifted. */
    int8_t shift = clz64(aSig);

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4500
4501 /*----------------------------------------------------------------------------
4502 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4503 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4504 | and returns the proper extended double-precision floating-point value
4505 | corresponding to the abstract input. Ordinarily, the abstract value is
4506 | rounded and packed into the extended double-precision format, with the
4507 | inexact exception raised if the abstract input cannot be represented
4508 | exactly. However, if the abstract value is too large, the overflow and
4509 | inexact exceptions are raised and an infinity or maximal finite value is
4510 | returned. If the abstract value is too small, the input value is rounded to
4511 | a subnormal number, and the underflow and inexact exceptions are raised if
4512 | the abstract input cannot be represented exactly as a subnormal extended
4513 | double-precision floating-point number.
4514 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4515 | number of bits as single or double precision, respectively. Otherwise, the
4516 | result is rounded to the full precision of the extended double-precision
4517 | format.
4518 | The input significand must be normalized or smaller. If the input
4519 | significand is not normalized, `zExp' must be 0; in that case, the result
4520 | returned is a subnormal number, and it must not require rounding. The
4521 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4522 | Floating-Point Arithmetic.
4523 *----------------------------------------------------------------------------*/
4524
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision modes round within the 64-bit significand at the
     * double- (bit 11) or single-precision (bit 40) position. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low significand word into the sticky bit of the high one. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare catches zExp too large and zExp <= 0 at once. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny before rounding, or after rounding unless the increment
             * carries out of bit 63 (wrapping the sum below zSig0). */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 promotes the result to normal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to also clear the kept low
             * bit when the discarded bits were exactly half an ulp. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    /* Ties-to-even fix-up, as above. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the guard/round/sticky bits.
     * "increment" asks whether the 64-bit significand rounds up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign yield the largest
             * finite value; the others yield infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Tiny before rounding, or after unless an all-ones zSig0
             * would be incremented up to the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the rounding decision: the shift above changed
             * which bits are discarded (now in zSig1). */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie: force the low bit even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Carry into bit 63 promotes the result to normal. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize to 1.0 * 2^(zExp+1). */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Exact tie: force the low bit even. */
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4712
4713 /*----------------------------------------------------------------------------
4714 | Takes an abstract floating-point value having sign `zSign', exponent
4715 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4716 | and returns the proper extended double-precision floating-point value
4717 | corresponding to the abstract input. This routine is just like
4718 | `roundAndPackFloatx80' except that the input significand does not have to be
4719 | normalized.
4720 *----------------------------------------------------------------------------*/
4721
4722 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4723 bool zSign, int32_t zExp,
4724 uint64_t zSig0, uint64_t zSig1,
4725 float_status *status)
4726 {
4727 int8_t shiftCount;
4728
4729 if ( zSig0 == 0 ) {
4730 zSig0 = zSig1;
4731 zSig1 = 0;
4732 zExp -= 64;
4733 }
4734 shiftCount = clz64(zSig0);
4735 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4736 zExp -= shiftCount;
4737 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4738 zSig0, zSig1, status);
4739
4740 }
4741
4742 /*----------------------------------------------------------------------------
4743 | Returns the least-significant 64 fraction bits of the quadruple-precision
4744 | floating-point value `a'.
4745 *----------------------------------------------------------------------------*/
4746
4747 static inline uint64_t extractFloat128Frac1( float128 a )
4748 {
4749
4750 return a.low;
4751
4752 }
4753
4754 /*----------------------------------------------------------------------------
4755 | Returns the most-significant 48 fraction bits of the quadruple-precision
4756 | floating-point value `a'.
4757 *----------------------------------------------------------------------------*/
4758
4759 static inline uint64_t extractFloat128Frac0( float128 a )
4760 {
4761
4762 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4763
4764 }
4765
4766 /*----------------------------------------------------------------------------
4767 | Returns the exponent bits of the quadruple-precision floating-point value
4768 | `a'.
4769 *----------------------------------------------------------------------------*/
4770
4771 static inline int32_t extractFloat128Exp( float128 a )
4772 {
4773
4774 return ( a.high>>48 ) & 0x7FFF;
4775
4776 }
4777
4778 /*----------------------------------------------------------------------------
4779 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4780 *----------------------------------------------------------------------------*/
4781
4782 static inline bool extractFloat128Sign(float128 a)
4783 {
4784 return a.high >> 63;
4785 }
4786
4787 /*----------------------------------------------------------------------------
4788 | Normalizes the subnormal quadruple-precision floating-point value
4789 | represented by the denormalized significand formed by the concatenation of
4790 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4791 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4792 | significand are stored at the location pointed to by `zSig0Ptr', and the
4793 | least significant 64 bits of the normalized significand are stored at the
4794 | location pointed to by `zSig1Ptr'.
4795 *----------------------------------------------------------------------------*/
4796
static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* The value lives entirely in the low word.  Aim its leading 1 at
         * bit 48 of the high word; a negative count means aSig1 has more
         * than 49 significant bits and must be split across both words. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            /* shiftCount & 63 turns the negative count into the matching
             * left shift for the bits that stay in the low word. */
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        /* Exponent accounts for the 64-bit word promotion as well. */
        *zExpPtr = - shiftCount - 63;
    }
    else {
        /* High word non-zero: shift the 128-bit pair left until the
         * leading 1 sits at bit 48 of the high word. */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
4827
4828 /*----------------------------------------------------------------------------
4829 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4830 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4831 | floating-point value, returning the result. After being shifted into the
4832 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4833 | added together to form the most significant 32 bits of the result. This
4834 | means that any integer portion of `zSig0' will be added into the exponent.
4835 | Since a properly normalized significand will have an integer portion equal
4836 | to 1, the `zExp' input should be 1 less than the desired result exponent
4837 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4838 | significand.
4839 *----------------------------------------------------------------------------*/
4840
4841 static inline float128
4842 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4843 {
4844 float128 z;
4845
4846 z.low = zSig1;
4847 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4848 return z;
4849 }
4850
4851 /*----------------------------------------------------------------------------
4852 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4853 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4854 | and `zSig2', and returns the proper quadruple-precision floating-point value
4855 | corresponding to the abstract input. Ordinarily, the abstract value is
4856 | simply rounded and packed into the quadruple-precision format, with the
4857 | inexact exception raised if the abstract input cannot be represented
4858 | exactly. However, if the abstract value is too large, the overflow and
4859 | inexact exceptions are raised and an infinity or maximal finite value is
4860 | returned. If the abstract value is too small, the input value is rounded to
4861 | a subnormal number, and the underflow and inexact exceptions are raised if
4862 | the abstract input cannot be represented exactly as a subnormal quadruple-
4863 | precision floating-point number.
4864 | The input significand must be normalized or smaller. If the input
4865 | significand is not normalized, `zExp' must be 0; in that case, the result
4866 | returned is a subnormal number, and it must not require rounding. In the
4867 | usual case that the input significand is normalized, `zExp' must be 1 less
4868 | than the ``true'' floating-point exponent. The handling of underflow and
4869 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4870 *----------------------------------------------------------------------------*/
4871
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the guard/round/sticky bits; "increment" asks whether
     * the 128-bit significand zSig0:zSig1 rounds up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when that would make the kept low bit odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches zExp too large and zExp < 0 at once. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign (including
             * round-to-odd) yield the largest finite value; the others
             * yield infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny before rounding, or after rounding unless the maximal
             * subnormal significand would be incremented up to the
             * smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the rounding decision: the shift above changed
             * which bits are discarded (now in zSig2). */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (zSig2 == 0x8000...): force the low bit even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4983
4984 /*----------------------------------------------------------------------------
4985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4986 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4987 | returns the proper quadruple-precision floating-point value corresponding
4988 | to the abstract input. This routine is just like `roundAndPackFloat128'
4989 | except that the input significand has fewer bits and does not have to be
4990 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4991 | point exponent.
4992 *----------------------------------------------------------------------------*/
4993
static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    /* When the high word is empty the value lives entirely in zSig1:
     * promote it and drop the exponent by a full 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Aim the leading 1 at bit 48 of zSig0.  A negative count means the
     * input carried more than 49 high bits, so shift right instead,
     * jamming the lost bits into the new sticky word zSig2. */
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
5019
5020
5021 /*----------------------------------------------------------------------------
5022 | Returns the result of converting the 32-bit two's complement integer `a'
5023 | to the extended double-precision floating-point format. The conversion
5024 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5025 | Arithmetic.
5026 *----------------------------------------------------------------------------*/
5027
5028 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5029 {
5030 bool zSign;
5031 uint32_t absA;
5032 int8_t shiftCount;
5033 uint64_t zSig;
5034
5035 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5036 zSign = ( a < 0 );
5037 absA = zSign ? - a : a;
5038 shiftCount = clz32(absA) + 32;
5039 zSig = absA;
5040 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5041
5042 }
5043
5044 /*----------------------------------------------------------------------------
5045 | Returns the result of converting the 32-bit two's complement integer `a' to
5046 | the quadruple-precision floating-point format. The conversion is performed
5047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5048 *----------------------------------------------------------------------------*/
5049
5050 float128 int32_to_float128(int32_t a, float_status *status)
5051 {
5052 bool zSign;
5053 uint32_t absA;
5054 int8_t shiftCount;
5055 uint64_t zSig0;
5056
5057 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5058 zSign = ( a < 0 );
5059 absA = zSign ? - a : a;
5060 shiftCount = clz32(absA) + 17;
5061 zSig0 = absA;
5062 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5063
5064 }
5065
5066 /*----------------------------------------------------------------------------
5067 | Returns the result of converting the 64-bit two's complement integer `a'
5068 | to the extended double-precision floating-point format. The conversion
5069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5070 | Arithmetic.
5071 *----------------------------------------------------------------------------*/
5072
5073 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5074 {
5075 bool zSign;
5076 uint64_t absA;
5077 int8_t shiftCount;
5078
5079 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5080 zSign = ( a < 0 );
5081 absA = zSign ? - a : a;
5082 shiftCount = clz64(absA);
5083 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5084
5085 }
5086
5087 /*----------------------------------------------------------------------------
5088 | Returns the result of converting the 64-bit two's complement integer `a' to
5089 | the quadruple-precision floating-point format. The conversion is performed
5090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5091 *----------------------------------------------------------------------------*/
5092
5093 float128 int64_to_float128(int64_t a, float_status *status)
5094 {
5095 bool zSign;
5096 uint64_t absA;
5097 int8_t shiftCount;
5098 int32_t zExp;
5099 uint64_t zSig0, zSig1;
5100
5101 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5102 zSign = ( a < 0 );
5103 absA = zSign ? - a : a;
5104 shiftCount = clz64(absA) + 49;
5105 zExp = 0x406E - shiftCount;
5106 if ( 64 <= shiftCount ) {
5107 zSig1 = 0;
5108 zSig0 = absA;
5109 shiftCount -= 64;
5110 }
5111 else {
5112 zSig1 = absA;
5113 zSig0 = 0;
5114 }
5115 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5116 return packFloat128( zSign, zExp, zSig0, zSig1 );
5117
5118 }
5119
5120 /*----------------------------------------------------------------------------
5121 | Returns the result of converting the 64-bit unsigned integer `a'
5122 | to the quadruple-precision floating-point format. The conversion is performed
5123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5124 *----------------------------------------------------------------------------*/
5125
5126 float128 uint64_to_float128(uint64_t a, float_status *status)
5127 {
5128 if (a == 0) {
5129 return float128_zero;
5130 }
5131 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5132 }
5133
5134 /*----------------------------------------------------------------------------
5135 | Returns the result of converting the single-precision floating-point value
5136 | `a' to the extended double-precision floating-point format. The conversion
5137 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5138 | Arithmetic.
5139 *----------------------------------------------------------------------------*/
5140
floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    /* Flush input denormals to zero first if the status flags require it. */
    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: carry the payload across, then quiet it for x80. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        /* Subnormal: renormalize so the hidden bit can be made explicit. */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the hidden bit explicit (x80 has no implicit integer bit),
     * rebias 0x7F -> 0x3FFF (delta 0x3F80), and left-justify the 24-bit
     * significand in the 64-bit fraction field. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}
5169
5170 /*----------------------------------------------------------------------------
5171 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
5173 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5174 | Arithmetic.
5175 *----------------------------------------------------------------------------*/
5176
float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    /* Flush input denormals to zero first if the status flags require it. */
    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert the payload to quad format. */
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        /* Infinity. */
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* Normalization left the leading bit explicit in aSig; after the
         * <<25 below it lands in the exponent field when packed, so the
         * exponent is pre-decremented to compensate. */
        --aExp;
    }
    /* Rebias 0x7F -> 0x3FFF (delta 0x3F80) and left-align the 23-bit
     * fraction at the top of the 112-bit quad fraction. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}
5201
5202 /*----------------------------------------------------------------------------
5203 | Returns the remainder of the single-precision floating-point value `a'
5204 | with respect to the corresponding value `b'. The operation is performed
5205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5206 *----------------------------------------------------------------------------*/
5207
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special operands: NaN in, NaN out; Inf rem x and x rem 0 are invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* a rem Inf == a for finite a. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the hidden bits explicit; all work below is fixed-point. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 32-bit division step suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large gap: iterate 62 quotient bits at a time using the
         * conservative (never too large) division estimate. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;   /* under-estimate: keep aSig64 >= 0 */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial step for the remaining 0..62 bits. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient, then pick whichever of the two bracketing
     * remainders is nearer (ties to the even quotient), per IEEE remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5303
5304
5305
5306 /*----------------------------------------------------------------------------
5307 | Returns the binary exponential of the single-precision floating-point value
5308 | `a'. The operation is performed according to the IEC/IEEE Standard for
5309 | Binary Floating-Point Arithmetic.
5310 |
5311 | Uses the following identities:
5312 |
5313 | 1. -------------------------------------------------------------------------
5314 | x x*ln(2)
5315 | 2 = e
5316 |
5317 | 2. -------------------------------------------------------------------------
5318 | 2 3 4 5 n
5319 | x x x x x x x
5320 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5321 | 1! 2! 3! 4! 5! n!
5322 *----------------------------------------------------------------------------*/
5323
/* Maclaurin-series coefficients 1/n! for n = 1..15 (see identity 2 above),
 * stored as raw IEEE-754 double bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5342
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = 0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) is exactly 1; subnormals fall through to the series. */
        if (aSig == 0) return float32_one;
    }

    /* The polynomial approximation below is never exact for the remaining
     * inputs, so the flag is raised unconditionally here. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* Identity 1 above: 2^a = e^(a*ln2); evaluate e^x by its Maclaurin
     * series (identity 2) in double precision for extra headroom. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        /* r += x^(i+1) / (i+1)!   (coefficients table holds 1/n!) */
        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5387
5388 /*----------------------------------------------------------------------------
5389 | Returns the binary log of the single-precision floating-point value `a'.
5390 | The operation is performed according to the IEC/IEEE Standard for Binary
5391 | Floating-Point Arithmetic.
5392 *----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are produced one at a time below. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    /* Classic bit-by-bit log: squaring the significand doubles its log2;
     * each doubling that carries past 2.0 contributes a 1 bit. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of converting the double-precision floating-point value
5440 | `a' to the extended double-precision floating-point format. The conversion
5441 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5442 | Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444
floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    /* Flush input denormals to zero first if the status flags require it. */
    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: carry the payload across, then quiet it for x80. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the hidden bit explicit, rebias 0x3FF -> 0x3FFF (delta 0x3C00),
     * and left-justify the 53-bit significand in the 64-bit fraction. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}
5474
5475 /*----------------------------------------------------------------------------
5476 | Returns the result of converting the double-precision floating-point value
5477 | `a' to the quadruple-precision floating-point format. The conversion is
5478 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5479 | Arithmetic.
5480 *----------------------------------------------------------------------------*/
5481
float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    /* Flush input denormals to zero first if the status flags require it. */
    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload to quad format. */
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        /* Infinity. */
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* Normalization left the leading bit explicit in aSig; it lands in
         * the exponent field when packed, so pre-decrement to compensate. */
        --aExp;
    }
    /* Align the 52-bit fraction at the top of the 112-bit quad fraction
     * and rebias 0x3FF -> 0x3FFF (delta 0x3C00). */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}
5507
5508
5509 /*----------------------------------------------------------------------------
5510 | Returns the remainder of the double-precision floating-point value `a'
5511 | with respect to the corresponding value `b'. The operation is performed
5512 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5513 *----------------------------------------------------------------------------*/
5514
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* Special operands: NaN in, NaN out; Inf rem x and x rem 0 are invalid. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* a rem Inf == a for finite a. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the hidden bits explicit and left-justify; fixed-point below. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Consume the exponent gap 62 quotient bits at a time using the
     * conservative (never too large) division estimate. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;   /* under-estimate: keep aSig >= 0 */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    /* Final partial step for the remaining 0..62 bits. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the true quotient, then pick whichever of the two bracketing
     * remainders is nearer (ties to the even quotient), per IEEE remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5596
5597 /*----------------------------------------------------------------------------
5598 | Returns the binary log of the double-precision floating-point value `a'.
5599 | The operation is performed according to the IEC/IEEE Standard for Binary
5600 | Floating-Point Arithmetic.
5601 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are produced one at a time below. */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    /* Classic bit-by-bit log: squaring the significand doubles its log2;
     * each doubling that carries past 2.0 contributes a 1 bit. */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5645
5646 /*----------------------------------------------------------------------------
5647 | Returns the result of converting the extended double-precision floating-
5648 | point value `a' to the 32-bit two's complement integer format. The
5649 | conversion is performed according to the IEC/IEEE Standard for Binary
5650 | Floating-Point Arithmetic---which means in particular that the conversion
5651 | is rounded according to the current rounding mode. If `a' is a NaN, the
5652 | largest positive integer is returned. Otherwise, if the conversion
5653 | overflows, the largest integer with the same sign as `a' is returned.
5654 *----------------------------------------------------------------------------*/
5655
5656 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5657 {
5658 bool aSign;
5659 int32_t aExp, shiftCount;
5660 uint64_t aSig;
5661
5662 if (floatx80_invalid_encoding(a)) {
5663 float_raise(float_flag_invalid, status);
5664 return 1 << 31;
5665 }
5666 aSig = extractFloatx80Frac( a );
5667 aExp = extractFloatx80Exp( a );
5668 aSign = extractFloatx80Sign( a );
5669 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5670 shiftCount = 0x4037 - aExp;
5671 if ( shiftCount <= 0 ) shiftCount = 1;
5672 shift64RightJamming( aSig, shiftCount, &aSig );
5673 return roundAndPackInt32(aSign, aSig, status);
5674
5675 }
5676
5677 /*----------------------------------------------------------------------------
5678 | Returns the result of converting the extended double-precision floating-
5679 | point value `a' to the 32-bit two's complement integer format. The
5680 | conversion is performed according to the IEC/IEEE Standard for Binary
5681 | Floating-Point Arithmetic, except that the conversion is always rounded
5682 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5683 | Otherwise, if the conversion overflows, the largest integer with the same
5684 | sign as `a' is returned.
5685 *----------------------------------------------------------------------------*/
5686
5687 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5688 {
5689 bool aSign;
5690 int32_t aExp, shiftCount;
5691 uint64_t aSig, savedASig;
5692 int32_t z;
5693
5694 if (floatx80_invalid_encoding(a)) {
5695 float_raise(float_flag_invalid, status);
5696 return 1 << 31;
5697 }
5698 aSig = extractFloatx80Frac( a );
5699 aExp = extractFloatx80Exp( a );
5700 aSign = extractFloatx80Sign( a );
5701 if ( 0x401E < aExp ) {
5702 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5703 goto invalid;
5704 }
5705 else if ( aExp < 0x3FFF ) {
5706 if (aExp || aSig) {
5707 float_raise(float_flag_inexact, status);
5708 }
5709 return 0;
5710 }
5711 shiftCount = 0x403E - aExp;
5712 savedASig = aSig;
5713 aSig >>= shiftCount;
5714 z = aSig;
5715 if ( aSign ) z = - z;
5716 if ( ( z < 0 ) ^ aSign ) {
5717 invalid:
5718 float_raise(float_flag_invalid, status);
5719 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5720 }
5721 if ( ( aSig<<shiftCount ) != savedASig ) {
5722 float_raise(float_flag_inexact, status);
5723 }
5724 return z;
5725
5726 }
5727
5728 /*----------------------------------------------------------------------------
5729 | Returns the result of converting the extended double-precision floating-
5730 | point value `a' to the 64-bit two's complement integer format. The
5731 | conversion is performed according to the IEC/IEEE Standard for Binary
5732 | Floating-Point Arithmetic---which means in particular that the conversion
5733 | is rounded according to the current rounding mode. If `a' is a NaN,
5734 | the largest positive integer is returned. Otherwise, if the conversion
5735 | overflows, the largest integer with the same sign as `a' is returned.
5736 *----------------------------------------------------------------------------*/
5737
5738 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5739 {
5740 bool aSign;
5741 int32_t aExp, shiftCount;
5742 uint64_t aSig, aSigExtra;
5743
5744 if (floatx80_invalid_encoding(a)) {
5745 float_raise(float_flag_invalid, status);
5746 return 1ULL << 63;
5747 }
5748 aSig = extractFloatx80Frac( a );
5749 aExp = extractFloatx80Exp( a );
5750 aSign = extractFloatx80Sign( a );
5751 shiftCount = 0x403E - aExp;
5752 if ( shiftCount <= 0 ) {
5753 if ( shiftCount ) {
5754 float_raise(float_flag_invalid, status);
5755 if (!aSign || floatx80_is_any_nan(a)) {
5756 return INT64_MAX;
5757 }
5758 return INT64_MIN;
5759 }
5760 aSigExtra = 0;
5761 }
5762 else {
5763 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5764 }
5765 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5766
5767 }
5768
5769 /*----------------------------------------------------------------------------
5770 | Returns the result of converting the extended double-precision floating-
5771 | point value `a' to the 64-bit two's complement integer format. The
5772 | conversion is performed according to the IEC/IEEE Standard for Binary
5773 | Floating-Point Arithmetic, except that the conversion is always rounded
5774 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5775 | Otherwise, if the conversion overflows, the largest integer with the same
5776 | sign as `a' is returned.
5777 *----------------------------------------------------------------------------*/
5778
5779 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5780 {
5781 bool aSign;
5782 int32_t aExp, shiftCount;
5783 uint64_t aSig;
5784 int64_t z;
5785
5786 if (floatx80_invalid_encoding(a)) {
5787 float_raise(float_flag_invalid, status);
5788 return 1ULL << 63;
5789 }
5790 aSig = extractFloatx80Frac( a );
5791 aExp = extractFloatx80Exp( a );
5792 aSign = extractFloatx80Sign( a );
5793 shiftCount = aExp - 0x403E;
5794 if ( 0 <= shiftCount ) {
5795 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5796 if ( ( a.high != 0xC03E ) || aSig ) {
5797 float_raise(float_flag_invalid, status);
5798 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5799 return INT64_MAX;
5800 }
5801 }
5802 return INT64_MIN;
5803 }
5804 else if ( aExp < 0x3FFF ) {
5805 if (aExp | aSig) {
5806 float_raise(float_flag_inexact, status);
5807 }
5808 return 0;
5809 }
5810 z = aSig>>( - shiftCount );
5811 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5812 float_raise(float_flag_inexact, status);
5813 }
5814 if ( aSign ) z = - z;
5815 return z;
5816
5817 }
5818
5819 /*----------------------------------------------------------------------------
5820 | Returns the result of converting the extended double-precision floating-
5821 | point value `a' to the single-precision floating-point format. The
5822 | conversion is performed according to the IEC/IEEE Standard for Binary
5823 | Floating-Point Arithmetic.
5824 *----------------------------------------------------------------------------*/
5825
float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* <<1 drops the explicit integer bit: any remaining bits mean NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit fraction to 31 bits, jamming lost bits into the
     * LSB, then rebias 0x3FFF -> 0x7F (delta 0x3F81, accounting for the
     * explicit integer bit) and round to single precision. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}
5852
5853 /*----------------------------------------------------------------------------
5854 | Returns the result of converting the extended double-precision floating-
5855 | point value `a' to the double-precision floating-point format. The
5856 | conversion is performed according to the IEC/IEEE Standard for Binary
5857 | Floating-Point Arithmetic.
5858 *----------------------------------------------------------------------------*/
5859
float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* <<1 drops the explicit integer bit: any remaining bits mean NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Narrow the 64-bit fraction by one bit (jamming it), then rebias
     * 0x3FFF -> 0x3FF (delta 0x3C01, accounting for the explicit integer
     * bit) and round to double precision. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}
5886
5887 /*----------------------------------------------------------------------------
5888 | Returns the result of converting the extended double-precision floating-
5889 | point value `a' to the quadruple-precision floating-point format. The
5890 | conversion is performed according to the IEC/IEEE Standard for Binary
5891 | Floating-Point Arithmetic.
5892 *----------------------------------------------------------------------------*/
5893
float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* NaN: convert the payload, then quiet it for quad format. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Widening is exact: drop the explicit integer bit (aSig<<1) and
     * align the remaining 63 fraction bits at the top of the 112-bit quad
     * fraction.  Both formats share the 0x3FFF bias, so aExp is reused. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}
5916
5917 /*----------------------------------------------------------------------------
5918 | Rounds the extended double-precision floating-point value `a'
5919 | to the precision provided by floatx80_rounding_precision and returns the
5920 | result as an extended double-precision floating-point value.
5921 | The operation is performed according to the IEC/IEEE Standard for Binary
5922 | Floating-Point Arithmetic.
5923 *----------------------------------------------------------------------------*/
5924
5925 floatx80 floatx80_round(floatx80 a, float_status *status)
5926 {
5927 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5928 extractFloatx80Sign(a),
5929 extractFloatx80Exp(a),
5930 extractFloatx80Frac(a), 0, status);
5931 }
5932
5933 /*----------------------------------------------------------------------------
5934 | Rounds the extended double-precision floating-point value `a' to an integer,
5935 | and returns the result as an extended quadruple-precision floating-point
5936 | value. The operation is performed according to the IEC/IEEE Standard for
5937 | Binary Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939
5940 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5941 {
5942 bool aSign;
5943 int32_t aExp;
5944 uint64_t lastBitMask, roundBitsMask;
5945 floatx80 z;
5946
5947 if (floatx80_invalid_encoding(a)) {
5948 float_raise(float_flag_invalid, status);
5949 return floatx80_default_nan(status);
5950 }
5951 aExp = extractFloatx80Exp( a );
5952 if ( 0x403E <= aExp ) {
5953 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5954 return propagateFloatx80NaN(a, a, status);
5955 }
5956 return a;
5957 }
5958 if ( aExp < 0x3FFF ) {
5959 if ( ( aExp == 0 )
5960 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5961 return a;
5962 }
5963 float_raise(float_flag_inexact, status);
5964 aSign = extractFloatx80Sign( a );
5965 switch (status->float_rounding_mode) {
5966 case float_round_nearest_even:
5967 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5968 ) {
5969 return
5970 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5971 }
5972 break;
5973 case float_round_ties_away:
5974 if (aExp == 0x3FFE) {
5975 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5976 }
5977 break;
5978 case float_round_down:
5979 return
5980 aSign ?
5981 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5982 : packFloatx80( 0, 0, 0 );
5983 case float_round_up:
5984 return
5985 aSign ? packFloatx80( 1, 0, 0 )
5986 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5987
5988 case float_round_to_zero:
5989 break;
5990 default:
5991 g_assert_not_reached();
5992 }
5993 return packFloatx80( aSign, 0, 0 );
5994 }
5995 lastBitMask = 1;
5996 lastBitMask <<= 0x403E - aExp;
5997 roundBitsMask = lastBitMask - 1;
5998 z = a;
5999 switch (status->float_rounding_mode) {
6000 case float_round_nearest_even:
6001 z.low += lastBitMask>>1;
6002 if ((z.low & roundBitsMask) == 0) {
6003 z.low &= ~lastBitMask;
6004 }
6005 break;
6006 case float_round_ties_away:
6007 z.low += lastBitMask >> 1;
6008 break;
6009 case float_round_to_zero:
6010 break;
6011 case float_round_up:
6012 if (!extractFloatx80Sign(z)) {
6013 z.low += roundBitsMask;
6014 }
6015 break;
6016 case float_round_down:
6017 if (extractFloatx80Sign(z)) {
6018 z.low += roundBitsMask;
6019 }
6020 break;
6021 default:
6022 abort();
6023 }
6024 z.low &= ~ roundBitsMask;
6025 if ( z.low == 0 ) {
6026 ++z.high;
6027 z.low = UINT64_C(0x8000000000000000);
6028 }
6029 if (z.low != a.low) {
6030 float_raise(float_flag_inexact, status);
6031 }
6032 return z;
6033
6034 }
6035
6036 /*----------------------------------------------------------------------------
6037 | Returns the result of adding the absolute values of the extended double-
6038 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
6039 | negated before being returned. `zSign' is ignored if the result is a NaN.
6040 | The addition is performed according to the IEC/IEEE Standard for Binary
6041 | Floating-Point Arithmetic.
6042 *----------------------------------------------------------------------------*/
6043
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a dominates: align b's significand down, jamming lost bits. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* b subnormal: its exponent encodes one less shift than it says. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b dominates: mirror of the case above. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* MSB set means no carry out of bit 63: already normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the sum: shift right one (jamming the lost bit),
     * restore the explicit integer bit, and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6115
6116 /*----------------------------------------------------------------------------
6117 | Returns the result of subtracting the absolute values of the extended
6118 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6119 | difference is negated before being returned. `zSign' is ignored if the
6120 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6121 | Standard for Binary Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6123
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    /* Subnormals behave as if their exponent were 1. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact zero result: it is -0 only under round-toward-negative. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    /* The difference may have lost leading bits; renormalize then round. */
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
6184
6185 /*----------------------------------------------------------------------------
6186 | Returns the result of adding the extended double-precision floating-point
6187 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6188 | Standard for Binary Floating-Point Arithmetic.
6189 *----------------------------------------------------------------------------*/
6190
6191 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6192 {
6193 bool aSign, bSign;
6194
6195 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6196 float_raise(float_flag_invalid, status);
6197 return floatx80_default_nan(status);
6198 }
6199 aSign = extractFloatx80Sign( a );
6200 bSign = extractFloatx80Sign( b );
6201 if ( aSign == bSign ) {
6202 return addFloatx80Sigs(a, b, aSign, status);
6203 }
6204 else {
6205 return subFloatx80Sigs(a, b, aSign, status);
6206 }
6207
6208 }
6209
6210 /*----------------------------------------------------------------------------
6211 | Returns the result of subtracting the extended double-precision floating-
6212 | point values `a' and `b'. The operation is performed according to the
6213 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6214 *----------------------------------------------------------------------------*/
6215
6216 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6217 {
6218 bool aSign, bSign;
6219
6220 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6221 float_raise(float_flag_invalid, status);
6222 return floatx80_default_nan(status);
6223 }
6224 aSign = extractFloatx80Sign( a );
6225 bSign = extractFloatx80Sign( b );
6226 if ( aSign == bSign ) {
6227 return subFloatx80Sigs(a, b, aSign, status);
6228 }
6229 else {
6230 return addFloatx80Sigs(a, b, aSign, status);
6231 }
6232
6233 }
6234
6235 /*----------------------------------------------------------------------------
6236 | Returns the result of multiplying the extended double-precision floating-
6237 | point values `a' and `b'. The operation is performed according to the
6238 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6239 *----------------------------------------------------------------------------*/
6240
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid; Inf * finite-nonzero is Inf. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Both significands are normalized (top bit set), so the 128-bit
     * product lies in [2^126, 2^128); shift left by one if its top bit
     * is clear so the result significand is normalized. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6296
6297 /*----------------------------------------------------------------------------
6298 | Returns the result of dividing the extended double-precision floating-point
6299 | value `a' by the corresponding value `b'. The operation is performed
6300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6301 *----------------------------------------------------------------------------*/
6302
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf is a signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; finite-nonzero / 0 raises divbyzero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-shift the dividend so the quotient significand is < 2. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* Long division: estimate the high 64 quotient bits, multiply back,
     * and correct the (possibly negative) remainder downward. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; refine only when near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Jam any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6383
6384 /*----------------------------------------------------------------------------
6385 | Returns the remainder of the extended double-precision floating-point value
6386 | `a' with respect to the corresponding value `b'. The operation is performed
6387 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6388 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6389 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6390 | the absolute value of the integer quotient.
6391 *----------------------------------------------------------------------------*/
6392
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* Inf rem anything (and NaN operands) cannot yield a remainder. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite rem Inf is the dividend unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder by zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: quotient is 0 (mod) so return `a' as-is;
         * otherwise align one bit for a possible final subtraction. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Produce quotient digits 62 bits at a time, with the estimate
     * deliberately low by up to 2 so the remainder stays non-negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial digit of expDiff bits, then fully correct the
         * under-estimated quotient by repeated subtraction. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: pick the representation nearest to zero
         * (ties go to even quotient), flipping the sign if needed. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6513
6514 /*----------------------------------------------------------------------------
6515 | Returns the remainder of the extended double-precision floating-point value
6516 | `a' with respect to the corresponding value `b'. The operation is performed
6517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6518 *----------------------------------------------------------------------------*/
6519
6520 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6521 {
6522 uint64_t quotient;
6523 return floatx80_modrem(a, b, false, &quotient, status);
6524 }
6525
6526 /*----------------------------------------------------------------------------
6527 | Returns the remainder of the extended double-precision floating-point value
6528 | `a' with respect to the corresponding value `b', with the quotient truncated
6529 | toward zero.
6530 *----------------------------------------------------------------------------*/
6531
6532 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6533 {
6534 uint64_t quotient;
6535 return floatx80_modrem(a, b, true, &quotient, status);
6536 }
6537
6538 /*----------------------------------------------------------------------------
6539 | Returns the square root of the extended double-precision floating-point
6540 | value `a'. The operation is performed according to the IEC/IEEE Standard
6541 | for Binary Floating-Point Arithmetic.
6542 *----------------------------------------------------------------------------*/
6543
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+Inf) is +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number is invalid, except for -0. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the (unbiased) input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Seed with a 32-bit root estimate, then refine to a 64-bit root
     * via one division step; position the radicand so the exponent
     * parity is absorbed into the shift. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word downward until the remainder
     * a - zSig0^2 is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word; refine only when near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Jam any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6608
6609 /*----------------------------------------------------------------------------
6610 | Returns the result of converting the quadruple-precision floating-point
6611 | value `a' to the 32-bit two's complement integer format. The conversion
6612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6613 | Arithmetic---which means in particular that the conversion is rounded
6614 | according to the current rounding mode. If `a' is a NaN, the largest
6615 | positive integer is returned. Otherwise, if the conversion overflows, the
6616 | largest integer with the same sign as `a' is returned.
6617 *----------------------------------------------------------------------------*/
6618
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaNs convert as if positive so the largest integer is returned. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for nonzero exponents. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}
6637
6638 /*----------------------------------------------------------------------------
6639 | Returns the result of converting the quadruple-precision floating-point
6640 | value `a' to the 32-bit two's complement integer format. The conversion
6641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6642 | Arithmetic, except that the conversion is always rounded toward zero. If
6643 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6644 | conversion overflows, the largest integer with the same sign as `a' is
6645 | returned.
6646 *----------------------------------------------------------------------------*/
6647
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude is at least 2^31: overflow (NaNs count as positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude below 1 truncates to 0, inexact if nonzero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    /* Keep the pre-shift value so discarded bits can be detected. */
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value did not fit. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6687
6688 /*----------------------------------------------------------------------------
6689 | Returns the result of converting the quadruple-precision floating-point
6690 | value `a' to the 64-bit two's complement integer format. The conversion
6691 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6692 | Arithmetic---which means in particular that the conversion is rounded
6693 | according to the current rounding mode. If `a' is a NaN, the largest
6694 | positive integer is returned. Otherwise, if the conversion overflows, the
6695 | largest integer with the same sign as `a' is returned.
6696 *----------------------------------------------------------------------------*/
6697
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude is at least 2^63: overflow.  Only exactly
             * -2^63 (negative, no extra significand bits) maps to
             * INT64_MIN; NaNs saturate positive. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift fraction bits out to the right, jamming into sticky. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6730
6731 /*----------------------------------------------------------------------------
6732 | Returns the result of converting the quadruple-precision floating-point
6733 | value `a' to the 64-bit two's complement integer format. The conversion
6734 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6735 | Arithmetic, except that the conversion is always rounded toward zero.
6736 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6737 | the conversion overflows, the largest integer with the same sign as `a' is
6738 | returned.
6739 *----------------------------------------------------------------------------*/
6740
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Values in [-2^63, -2^63 + epsilon) truncate to INT64_MIN
             * (possibly inexact); anything else this large is invalid. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Assemble the integer from both significand words. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1 truncates to 0, inexact if nonzero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6793
6794 /*----------------------------------------------------------------------------
6795 | Returns the result of converting the quadruple-precision floating-point value
6796 | `a' to the 64-bit unsigned integer format. The conversion is
6797 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6798 | Arithmetic---which means in particular that the conversion is rounded
6799 | according to the current rounding mode. If `a' is a NaN, the largest
6800 | positive integer is returned. If the conversion overflows, the
6801 | largest unsigned integer is returned. If 'a' is negative, the value is
6802 | rounded and zero is returned; negative values that do not round to zero
6803 | will raise the inexact exception.
6804 *----------------------------------------------------------------------------*/
6805
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    if (aSign && (aExp > 0x3FFE)) {
        /* Negative with magnitude >= 1 (or a negative NaN) cannot
         * convert; smaller negatives fall through to rounding below. */
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Magnitude is at least 2^64: overflow. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6840
6841 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6842 {
6843 uint64_t v;
6844 signed char current_rounding_mode = status->float_rounding_mode;
6845
6846 set_float_rounding_mode(float_round_to_zero, status);
6847 v = float128_to_uint64(a, status);
6848 set_float_rounding_mode(current_rounding_mode, status);
6849
6850 return v;
6851 }
6852
6853 /*----------------------------------------------------------------------------
6854 | Returns the result of converting the quadruple-precision floating-point
6855 | value `a' to the 32-bit unsigned integer format. The conversion
6856 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6857 | Arithmetic except that the conversion is always rounded toward zero.
6858 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6859 | if the conversion overflows, the largest unsigned integer is returned.
6860 | If 'a' is negative, the value is rounded and zero is returned; negative
6861 | values that do not round to zero will raise the inexact exception.
6862 *----------------------------------------------------------------------------*/
6863
6864 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6865 {
6866 uint64_t v;
6867 uint32_t res;
6868 int old_exc_flags = get_float_exception_flags(status);
6869
6870 v = float128_to_uint64_round_to_zero(a, status);
6871 if (v > 0xffffffff) {
6872 res = 0xffffffff;
6873 } else {
6874 return v;
6875 }
6876 set_float_exception_flags(old_exc_flags, status);
6877 float_raise(float_flag_invalid, status);
6878 return res;
6879 }
6880
6881 /*----------------------------------------------------------------------------
6882 | Returns the result of converting the quadruple-precision floating-point value
6883 | `a' to the 32-bit unsigned integer format. The conversion is
6884 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6885 | Arithmetic---which means in particular that the conversion is rounded
6886 | according to the current rounding mode. If `a' is a NaN, the largest
6887 | positive integer is returned. If the conversion overflows, the
6888 | largest unsigned integer is returned. If 'a' is negative, the value is
6889 | rounded and zero is returned; negative values that do not round to zero
6890 | will raise the inexact exception.
6891 *----------------------------------------------------------------------------*/
6892
6893 uint32_t float128_to_uint32(float128 a, float_status *status)
6894 {
6895 uint64_t v;
6896 uint32_t res;
6897 int old_exc_flags = get_float_exception_flags(status);
6898
6899 v = float128_to_uint64(a, status);
6900 if (v > 0xffffffff) {
6901 res = 0xffffffff;
6902 } else {
6903 return v;
6904 }
6905 set_float_exception_flags(old_exc_flags, status);
6906 float_raise(float_flag_invalid, status);
6907 return res;
6908 }
6909
6910 /*----------------------------------------------------------------------------
6911 | Returns the result of converting the quadruple-precision floating-point
6912 | value `a' to the single-precision floating-point format. The conversion
6913 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6914 | Arithmetic.
6915 *----------------------------------------------------------------------------*/
6916
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low fraction word into a sticky bit, then shift the
     * significand down to single-precision width (jamming lost bits). */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Make the integer bit explicit and rebias the exponent. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6944
6945 /*----------------------------------------------------------------------------
6946 | Returns the result of converting the quadruple-precision floating-point
6947 | value `a' to the double-precision floating-point format. The conversion
6948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6949 | Arithmetic.
6950 *----------------------------------------------------------------------------*/
6951
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the significand for double precision and fold the shifted
     * low word into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Make the integer bit explicit and rebias the exponent. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6977
6978 /*----------------------------------------------------------------------------
6979 | Returns the result of converting the quadruple-precision floating-point
6980 | value `a' to the extended double-precision floating-point format. The
6981 | conversion is performed according to the IEC/IEEE Standard for Binary
6982 | Floating-Point Arithmetic.
6983 *----------------------------------------------------------------------------*/
6984
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert through the canonical form, then quiet it,
               since floatx80 results of conversions are always quiet. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        /* Zero passes through; a subnormal is normalized so the
           shift below produces a left-aligned significand. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Restore the implicit integer bit of a normal number. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Left-align to the floatx80 explicit-integer-bit format and round
       to the full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
7015
7016 /*----------------------------------------------------------------------------
7017 | Rounds the quadruple-precision floating-point value `a' to an integer, and
7018 | returns the result as a quadruple-precision floating-point value. The
7019 | operation is performed according to the IEC/IEEE Standard for Binary
7020 | Floating-Point Arithmetic.
7021 *----------------------------------------------------------------------------*/
7022
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Fraction bits (if any) live in the low word z.low. */
        if ( 0x406F <= aExp ) {
            /* Already integral (or Inf); only a NaN needs handling. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the integer LSB within z.low; when the
           shift amount reaches 63 the mask wraps to 0, meaning the
           integer LSB is bit 0 of z.high (see float_round_to_odd). */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp, then clear the LSB on an exact tie. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Half-ulp is bit 63 of z.low; carry into z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as nearest-even but ties do not clear the LSB. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Round away from zero only for positive values. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Round away from zero only for negative values. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits that were rounded away. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +/-1, chosen by rounding mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round up only for values strictly above one half. */
                if ( ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* One half or more rounds away from zero. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                /* Any non-zero fraction rounds to the odd value 1. */
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| and fraction bits fit entirely in z.high; z.low is
           folded in only as a sticky indicator. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no round bits, a.low clear): force even LSB. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the operand means the result was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
7174
7175 /*----------------------------------------------------------------------------
7176 | Returns the result of adding the absolute values of the quadruple-precision
7177 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7178 | before being returned. `zSign' is ignored if the result is a NaN.
7179 | The addition is performed according to the IEC/IEEE Standard for Binary
7180 | Floating-Point Arithmetic.
7181 *----------------------------------------------------------------------------*/
7182
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b subnormal: effective exponent is 1, so shift one less. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal (or zero): the raw sum is already the
               correctly packed result; no rounding is needed. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of two normals always carries out: account for both
           implicit bits and renormalize with a one-bit right shift. */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7265
7266 /*----------------------------------------------------------------------------
7267 | Returns the result of subtracting the absolute values of the quadruple-
7268 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7269 | difference is negated before being returned. `zSign' is ignored if the
7270 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7271 | Standard for Binary Floating-Point Arithmetic.
7272 *----------------------------------------------------------------------------*/
7273
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left for extra working precision;
       the final normalize step compensates via "zExp - 14". */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat subnormals as having exponent 1. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to decide subtraction order and result sign. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* "- 14" undoes the working-precision pre-shift above. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7353
7354 /*----------------------------------------------------------------------------
7355 | Returns the result of adding the quadruple-precision floating-point values
7356 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7357 | for Binary Floating-Point Arithmetic.
7358 *----------------------------------------------------------------------------*/
7359
7360 float128 float128_add(float128 a, float128 b, float_status *status)
7361 {
7362 bool aSign, bSign;
7363
7364 aSign = extractFloat128Sign( a );
7365 bSign = extractFloat128Sign( b );
7366 if ( aSign == bSign ) {
7367 return addFloat128Sigs(a, b, aSign, status);
7368 }
7369 else {
7370 return subFloat128Sigs(a, b, aSign, status);
7371 }
7372
7373 }
7374
7375 /*----------------------------------------------------------------------------
7376 | Returns the result of subtracting the quadruple-precision floating-point
7377 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7378 | Standard for Binary Floating-Point Arithmetic.
7379 *----------------------------------------------------------------------------*/
7380
7381 float128 float128_sub(float128 a, float128 b, float_status *status)
7382 {
7383 bool aSign, bSign;
7384
7385 aSign = extractFloat128Sign( a );
7386 bSign = extractFloat128Sign( b );
7387 if ( aSign == bSign ) {
7388 return subFloat128Sigs(a, b, aSign, status);
7389 }
7390 else {
7391 return addFloat128Sigs(a, b, aSign, status);
7392 }
7393
7394 }
7395
7396 /*----------------------------------------------------------------------------
7397 | Returns the result of multiplying the quadruple-precision floating-point
7398 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7399 | Standard for Binary Floating-Point Arithmetic.
7400 *----------------------------------------------------------------------------*/
7401
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf/NaN; NaN operands propagate first. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf * 0 is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Pre-shift b so the 256-bit product is conveniently aligned. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* Add a's significand to account for b's implicit integer bit. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product carried past the integer bit: renormalize. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7458
7459 /*----------------------------------------------------------------------------
7460 | Returns the result of dividing the quadruple-precision floating-point value
7461 | `a' by the corresponding value `b'. The operation is performed according to
7462 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7463 *----------------------------------------------------------------------------*/
7464
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf -> signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 is invalid; x / 0 raises divide-by-zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Left-align both significands with the implicit bit restored. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure the dividend is strictly below the divisor so the
       quotient estimate below stays in range. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Estimate the high quotient word, then correct it downward
       until the partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second quotient word; refine only when it is close enough to a
       rounding boundary for the estimate error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Sticky bit: any non-zero remainder means inexact. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7545
7546 /*----------------------------------------------------------------------------
7547 | Returns the remainder of the quadruple-precision floating-point value `a'
7548 | with respect to the corresponding value `b'. The operation is performed
7549 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7550 *----------------------------------------------------------------------------*/
7551
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* rem(Inf, x) and NaN operands are invalid/propagated. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(x, Inf) == x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(x, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce the exponent gap 61 bits at a time, keeping the partial
       quotient estimate conservatively low (q - 4) so the partial
       remainder never goes negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step with the residual exponent difference. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, keeping the last
       non-negative value; then pick between the two candidates to get
       the IEEE round-to-nearest remainder (ties to even quotient). */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7652
7653 /*----------------------------------------------------------------------------
7654 | Returns the square root of the quadruple-precision floating-point value `a'.
7655 | The operation is performed according to the IEC/IEEE Standard for Binary
7656 | Floating-Point Arithmetic.
7657 *----------------------------------------------------------------------------*/
7658
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) == +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) == -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased operand exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial estimate of the high root word, then refine via a
       Newton-style divide; alignment depends on exponent parity. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct zSig0 downward until rem = aSig - zSig0^2 >= 0. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Second root word; refine only near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Sticky bit from any non-zero remainder. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7720
/*
 * Compare two extended-precision values. `is_quiet` selects the quiet
 * comparison: quiet NaN operands then do not raise invalid (signaling
 * NaNs and invalid encodings always do). Returns a FloatRelation.
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Either operand a NaN (max exponent, non-zero fraction excluding
       the explicit integer bit)? */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Differing signs and not both zero: positive one is bigger. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude order, flipped when both are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7768
7769 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7770 {
7771 return floatx80_compare_internal(a, b, 0, status);
7772 }
7773
7774 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7775 float_status *status)
7776 {
7777 return floatx80_compare_internal(a, b, 1, status);
7778 }
7779
/*
 * Compare two quad-precision values. `is_quiet` selects the quiet
 * comparison: quiet NaN operands then do not raise invalid (signaling
 * NaNs always do). Returns a FloatRelation.
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Either operand a NaN (max exponent, non-zero fraction)? */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Differing signs and not both zero: positive one is bigger. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude order, flipped when both are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7814
7815 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7816 {
7817 return float128_compare_internal(a, b, 0, status);
7818 }
7819
7820 FloatRelation float128_compare_quiet(float128 a, float128 b,
7821 float_status *status)
7822 {
7823 return float128_compare_internal(a, b, 1, status);
7824 }
7825
7826 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7827 {
7828 bool aSign;
7829 int32_t aExp;
7830 uint64_t aSig;
7831
7832 if (floatx80_invalid_encoding(a)) {
7833 float_raise(float_flag_invalid, status);
7834 return floatx80_default_nan(status);
7835 }
7836 aSig = extractFloatx80Frac( a );
7837 aExp = extractFloatx80Exp( a );
7838 aSign = extractFloatx80Sign( a );
7839
7840 if ( aExp == 0x7FFF ) {
7841 if ( aSig<<1 ) {
7842 return propagateFloatx80NaN(a, a, status);
7843 }
7844 return a;
7845 }
7846
7847 if (aExp == 0) {
7848 if (aSig == 0) {
7849 return a;
7850 }
7851 aExp++;
7852 }
7853
7854 if (n > 0x10000) {
7855 n = 0x10000;
7856 } else if (n < -0x10000) {
7857 n = -0x10000;
7858 }
7859
7860 aExp += n;
7861 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7862 aSign, aExp, aSig, 0, status);
7863 }
7864
7865 float128 float128_scalbn(float128 a, int n, float_status *status)
7866 {
7867 bool aSign;
7868 int32_t aExp;
7869 uint64_t aSig0, aSig1;
7870
7871 aSig1 = extractFloat128Frac1( a );
7872 aSig0 = extractFloat128Frac0( a );
7873 aExp = extractFloat128Exp( a );
7874 aSign = extractFloat128Sign( a );
7875 if ( aExp == 0x7FFF ) {
7876 if ( aSig0 | aSig1 ) {
7877 return propagateFloat128NaN(a, a, status);
7878 }
7879 return a;
7880 }
7881 if (aExp != 0) {
7882 aSig0 |= UINT64_C(0x0001000000000000);
7883 } else if (aSig0 == 0 && aSig1 == 0) {
7884 return a;
7885 } else {
7886 aExp++;
7887 }
7888
7889 if (n > 0x10000) {
7890 n = 0x10000;
7891 } else if (n < -0x10000) {
7892 n = -0x10000;
7893 }
7894
7895 aExp += n - 1;
7896 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7897 , status);
7898
7899 }
7900
7901 static void __attribute__((constructor)) softfloat_init(void)
7902 {
7903 union_float64 ua, ub, uc, ur;
7904
7905 if (QEMU_NO_HARDFLOAT) {
7906 return;
7907 }
7908 /*
7909 * Test that the host's FMA is not obviously broken. For example,
7910 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7911 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7912 */
7913 ua.s = 0x0020000000000001ULL;
7914 ub.s = 0x3ca0000000000000ULL;
7915 uc.s = 0x0020000000000000ULL;
7916 ur.h = fma(ua.h, ub.h, uc.h);
7917 if (ur.s != 0x0020000000000001ULL) {
7918 force_soft_fma = true;
7919 }
7920 }