]> git.proxmox.com Git - mirror_qemu.git/blob - fpu/softfloat.c
softfloat: Move the binary point to the msb
[mirror_qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
131 { \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
136 } \
137 }
138
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
145 { \
146 if (likely(!s->flush_inputs_to_zero)) { \
147 return; \
148 } \
149 soft_t ## _input_flush__nocheck(a, s); \
150 }
151
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
158 { \
159 if (likely(!s->flush_inputs_to_zero)) { \
160 return; \
161 } \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
164 }
165
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172 { \
173 if (likely(!s->flush_inputs_to_zero)) { \
174 return; \
175 } \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
179 }
180
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184
/*
 * Select, for each combination of input count and float width, whether the
 * generated hardfloat functions classify operands with fpclassify() (1) or
 * with the float32/64_* primitives (0).
 *
 * Single precision always uses the primitives; double precision uses
 * fpclassify() on x86_64 hosts only.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) over
 * float{32,64}_is_infinity (0) in the f32/f64 infinity checks.
 * On x86_64/aarch64, isinf() can yield a ~6% speedup.  On power64, however,
 * isinf() reduces fp-bench performance by up to 50%, so every other host
 * keeps the softfloat primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 0
#else
# define QEMU_HARDFLOAT_USE_ISINF 1
#endif
217
/*
 * Some targets clear the FP flags before most FP operations.  This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.  Hardfloat is likewise disabled under -ffast-math.
 *
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline when hardfloat is
 * enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
  IEEE implementation
#endif

#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
256 typedef union {
257 float32 s;
258 float h;
259 } union_float32;
260
261 typedef union {
262 float64 s;
263 double h;
264 } union_float64;
265
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float (*hard_f32_op2_fn)(float a, float b);
272 typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344 hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345 f32_check_fn pre, f32_check_fn post)
346 {
347 union_float32 ua, ub, ur;
348
349 ua.s = xa;
350 ub.s = xb;
351
352 if (unlikely(!can_use_fpu(s))) {
353 goto soft;
354 }
355
356 float32_input_flush2(&ua.s, &ub.s, s);
357 if (unlikely(!pre(ua, ub))) {
358 goto soft;
359 }
360
361 ur.h = hard(ua.h, ub.h);
362 if (unlikely(f32_is_inf(ur))) {
363 s->float_exception_flags |= float_flag_overflow;
364 } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365 goto soft;
366 }
367 return ur.s;
368
369 soft:
370 return soft(ua.s, ub.s, s);
371 }
372
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375 hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376 f64_check_fn pre, f64_check_fn post)
377 {
378 union_float64 ua, ub, ur;
379
380 ua.s = xa;
381 ub.s = xb;
382
383 if (unlikely(!can_use_fpu(s))) {
384 goto soft;
385 }
386
387 float64_input_flush2(&ua.s, &ub.s, s);
388 if (unlikely(!pre(ua, ub))) {
389 goto soft;
390 }
391
392 ur.h = hard(ua.h, ub.h);
393 if (unlikely(f64_is_inf(ur))) {
394 s->float_exception_flags |= float_flag_overflow;
395 } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396 goto soft;
397 }
398 return ur.s;
399
400 soft:
401 return soft(ua.s, ub.s, s);
402 }
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
/*
 * Classify a floating point number.  The NaN classes are ordered last:
 * float_class_qnan and everything after it is a NaN, so
 * cls >= float_class_qnan matches any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
471
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
474 {
475 return unlikely(c >= float_class_qnan);
476 }
477
478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
479 {
480 return c == float_class_snan;
481 }
482
483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
484 {
485 return c == float_class_qnan;
486 }
487
488 /*
489 * Structure holding all of the decomposed parts of a float. The
490 * exponent is unbiased and the fraction is normalized. All
491 * calculations are done with a 64 bit fraction and then rounded as
492 * appropriate for the final format.
493 *
494 * Thanks to the packed FloatClass a decent compiler should be able to
495 * fit the whole structure into registers and avoid using the stack
496 * for parameter passing.
497 */
498
499 typedef struct {
500 uint64_t frac;
501 int32_t exp;
502 FloatClass cls;
503 bool sign;
504 } FloatParts;
505
/* Bit position of the binary point within a decomposed 64-bit fraction. */
#define DECOMPOSED_BINARY_POINT    63
/* The (normally implicit) leading one of a decomposed, normalized fraction. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 * Both arguments are parenthesized at every use so that compound
 * expressions (e.g. "20 + 3") expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                               \
    .exp_size       = (E),                                               \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                             \
    .exp_max        = (1 << (E)) - 1,                                    \
    .frac_size      = (F),                                               \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                     \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),           \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),     \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,     \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1

/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Half-precision layout with the ARM Alternative Half Precision modifier. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: 8 exponent bits, 7 fraction bits. */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
567
568 /* Unpack a float to parts, but do not canonicalize. */
569 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
570 {
571 const int sign_pos = fmt.frac_size + fmt.exp_size;
572
573 return (FloatParts) {
574 .cls = float_class_unclassified,
575 .sign = extract64(raw, sign_pos, 1),
576 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
577 .frac = extract64(raw, 0, fmt.frac_size),
578 };
579 }
580
581 static inline FloatParts float16_unpack_raw(float16 f)
582 {
583 return unpack_raw(float16_params, f);
584 }
585
586 static inline FloatParts bfloat16_unpack_raw(bfloat16 f)
587 {
588 return unpack_raw(bfloat16_params, f);
589 }
590
591 static inline FloatParts float32_unpack_raw(float32 f)
592 {
593 return unpack_raw(float32_params, f);
594 }
595
596 static inline FloatParts float64_unpack_raw(float64 f)
597 {
598 return unpack_raw(float64_params, f);
599 }
600
601 /* Pack a float from parts, but do not canonicalize. */
602 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
603 {
604 const int sign_pos = fmt.frac_size + fmt.exp_size;
605 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
606 return deposit64(ret, sign_pos, 1, p.sign);
607 }
608
609 static inline float16 float16_pack_raw(FloatParts p)
610 {
611 return make_float16(pack_raw(float16_params, p));
612 }
613
614 static inline bfloat16 bfloat16_pack_raw(FloatParts p)
615 {
616 return pack_raw(bfloat16_params, p);
617 }
618
619 static inline float32 float32_pack_raw(FloatParts p)
620 {
621 return make_float32(pack_raw(float32_params, p));
622 }
623
624 static inline float64 float64_pack_raw(FloatParts p)
625 {
626 return make_float64(pack_raw(float64_params, p));
627 }
628
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine: (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output. These details are target-
635 | specific.
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.c.inc"
638
639 /* Canonicalize EXP and FRAC, setting CLS. */
640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
641 float_status *status)
642 {
643 if (part.exp == parm->exp_max && !parm->arm_althp) {
644 if (part.frac == 0) {
645 part.cls = float_class_inf;
646 } else {
647 part.frac <<= parm->frac_shift;
648 part.cls = (parts_is_snan_frac(part.frac, status)
649 ? float_class_snan : float_class_qnan);
650 }
651 } else if (part.exp == 0) {
652 if (likely(part.frac == 0)) {
653 part.cls = float_class_zero;
654 } else if (status->flush_inputs_to_zero) {
655 float_raise(float_flag_input_denormal, status);
656 part.cls = float_class_zero;
657 part.frac = 0;
658 } else {
659 int shift = clz64(part.frac);
660 part.cls = float_class_normal;
661 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
662 part.frac <<= shift;
663 }
664 } else {
665 part.cls = float_class_normal;
666 part.exp -= parm->exp_bias;
667 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
668 }
669 return part;
670 }
671
672 /* Round and uncanonicalize a floating-point number by parts. There
673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
674 * fraction; these bits will be removed. The exponent will be biased
675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
676 */
677
678 static FloatParts round_canonical(FloatParts p, float_status *s,
679 const FloatFmt *parm)
680 {
681 const uint64_t frac_lsb = parm->frac_lsb;
682 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
683 const uint64_t round_mask = parm->round_mask;
684 const uint64_t roundeven_mask = parm->roundeven_mask;
685 const int exp_max = parm->exp_max;
686 const int frac_shift = parm->frac_shift;
687 uint64_t frac, inc;
688 int exp, flags = 0;
689 bool overflow_norm;
690
691 frac = p.frac;
692 exp = p.exp;
693
694 switch (p.cls) {
695 case float_class_normal:
696 switch (s->float_rounding_mode) {
697 case float_round_nearest_even:
698 overflow_norm = false;
699 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
700 break;
701 case float_round_ties_away:
702 overflow_norm = false;
703 inc = frac_lsbm1;
704 break;
705 case float_round_to_zero:
706 overflow_norm = true;
707 inc = 0;
708 break;
709 case float_round_up:
710 inc = p.sign ? 0 : round_mask;
711 overflow_norm = p.sign;
712 break;
713 case float_round_down:
714 inc = p.sign ? round_mask : 0;
715 overflow_norm = !p.sign;
716 break;
717 case float_round_to_odd:
718 overflow_norm = true;
719 inc = frac & frac_lsb ? 0 : round_mask;
720 break;
721 default:
722 g_assert_not_reached();
723 }
724
725 exp += parm->exp_bias;
726 if (likely(exp > 0)) {
727 if (frac & round_mask) {
728 flags |= float_flag_inexact;
729 if (uadd64_overflow(frac, inc, &frac)) {
730 frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
731 exp++;
732 }
733 }
734 frac >>= frac_shift;
735
736 if (parm->arm_althp) {
737 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
738 if (unlikely(exp > exp_max)) {
739 /* Overflow. Return the maximum normal. */
740 flags = float_flag_invalid;
741 exp = exp_max;
742 frac = -1;
743 }
744 } else if (unlikely(exp >= exp_max)) {
745 flags |= float_flag_overflow | float_flag_inexact;
746 if (overflow_norm) {
747 exp = exp_max - 1;
748 frac = -1;
749 } else {
750 p.cls = float_class_inf;
751 goto do_inf;
752 }
753 }
754 } else if (s->flush_to_zero) {
755 flags |= float_flag_output_denormal;
756 p.cls = float_class_zero;
757 goto do_zero;
758 } else {
759 bool is_tiny = s->tininess_before_rounding || (exp < 0);
760
761 if (!is_tiny) {
762 uint64_t discard;
763 is_tiny = !uadd64_overflow(frac, inc, &discard);
764 }
765
766 shift64RightJamming(frac, 1 - exp, &frac);
767 if (frac & round_mask) {
768 /* Need to recompute round-to-even. */
769 switch (s->float_rounding_mode) {
770 case float_round_nearest_even:
771 inc = ((frac & roundeven_mask) != frac_lsbm1
772 ? frac_lsbm1 : 0);
773 break;
774 case float_round_to_odd:
775 inc = frac & frac_lsb ? 0 : round_mask;
776 break;
777 default:
778 break;
779 }
780 flags |= float_flag_inexact;
781 frac += inc;
782 }
783
784 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
785 frac >>= frac_shift;
786
787 if (is_tiny && (flags & float_flag_inexact)) {
788 flags |= float_flag_underflow;
789 }
790 if (exp == 0 && frac == 0) {
791 p.cls = float_class_zero;
792 }
793 }
794 break;
795
796 case float_class_zero:
797 do_zero:
798 exp = 0;
799 frac = 0;
800 break;
801
802 case float_class_inf:
803 do_inf:
804 assert(!parm->arm_althp);
805 exp = exp_max;
806 frac = 0;
807 break;
808
809 case float_class_qnan:
810 case float_class_snan:
811 assert(!parm->arm_althp);
812 exp = exp_max;
813 frac >>= parm->frac_shift;
814 break;
815
816 default:
817 g_assert_not_reached();
818 }
819
820 float_raise(flags, s);
821 p.exp = exp;
822 p.frac = frac;
823 return p;
824 }
825
826 /* Explicit FloatFmt version */
827 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
828 const FloatFmt *params)
829 {
830 return sf_canonicalize(float16_unpack_raw(f), params, s);
831 }
832
833 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
834 {
835 return float16a_unpack_canonical(f, s, &float16_params);
836 }
837
838 static FloatParts bfloat16_unpack_canonical(bfloat16 f, float_status *s)
839 {
840 return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
841 }
842
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844 const FloatFmt *params)
845 {
846 return float16_pack_raw(round_canonical(p, s, params));
847 }
848
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851 return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853
854 static bfloat16 bfloat16_round_pack_canonical(FloatParts p, float_status *s)
855 {
856 return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
857 }
858
859 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
860 {
861 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
862 }
863
864 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
865 {
866 return float32_pack_raw(round_canonical(p, s, &float32_params));
867 }
868
869 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
870 {
871 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
872 }
873
874 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
875 {
876 return float64_pack_raw(round_canonical(p, s, &float64_params));
877 }
878
879 static FloatParts return_nan(FloatParts a, float_status *s)
880 {
881 switch (a.cls) {
882 case float_class_snan:
883 s->float_exception_flags |= float_flag_invalid;
884 a = parts_silence_nan(a, s);
885 /* fall through */
886 case float_class_qnan:
887 if (s->default_nan_mode) {
888 return parts_default_nan(s);
889 }
890 break;
891
892 default:
893 g_assert_not_reached();
894 }
895 return a;
896 }
897
898 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
899 {
900 if (is_snan(a.cls) || is_snan(b.cls)) {
901 s->float_exception_flags |= float_flag_invalid;
902 }
903
904 if (s->default_nan_mode) {
905 return parts_default_nan(s);
906 } else {
907 if (pickNaN(a.cls, b.cls,
908 a.frac > b.frac ||
909 (a.frac == b.frac && a.sign < b.sign), s)) {
910 a = b;
911 }
912 if (is_snan(a.cls)) {
913 return parts_silence_nan(a, s);
914 }
915 }
916 return a;
917 }
918
919 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
920 bool inf_zero, float_status *s)
921 {
922 int which;
923
924 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
925 s->float_exception_flags |= float_flag_invalid;
926 }
927
928 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
929
930 if (s->default_nan_mode) {
931 /* Note that this check is after pickNaNMulAdd so that function
932 * has an opportunity to set the Invalid flag.
933 */
934 which = 3;
935 }
936
937 switch (which) {
938 case 0:
939 break;
940 case 1:
941 a = b;
942 break;
943 case 2:
944 a = c;
945 break;
946 case 3:
947 return parts_default_nan(s);
948 default:
949 g_assert_not_reached();
950 }
951
952 if (is_snan(a.cls)) {
953 return parts_silence_nan(a, s);
954 }
955 return a;
956 }
957
958 /*
959 * Returns the result of adding or subtracting the values of the
960 * floating-point values `a' and `b'. The operation is performed
961 * according to the IEC/IEEE Standard for Binary Floating-Point
962 * Arithmetic.
963 */
964
965 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
966 float_status *s)
967 {
968 bool a_sign = a.sign;
969 bool b_sign = b.sign ^ subtract;
970
971 if (a_sign != b_sign) {
972 /* Subtraction */
973
974 if (a.cls == float_class_normal && b.cls == float_class_normal) {
975 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
976 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
977 a.frac = a.frac - b.frac;
978 } else {
979 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
980 a.frac = b.frac - a.frac;
981 a.exp = b.exp;
982 a_sign ^= 1;
983 }
984
985 if (a.frac == 0) {
986 a.cls = float_class_zero;
987 a.sign = s->float_rounding_mode == float_round_down;
988 } else {
989 int shift = clz64(a.frac);
990 a.frac = a.frac << shift;
991 a.exp = a.exp - shift;
992 a.sign = a_sign;
993 }
994 return a;
995 }
996 if (is_nan(a.cls) || is_nan(b.cls)) {
997 return pick_nan(a, b, s);
998 }
999 if (a.cls == float_class_inf) {
1000 if (b.cls == float_class_inf) {
1001 float_raise(float_flag_invalid, s);
1002 return parts_default_nan(s);
1003 }
1004 return a;
1005 }
1006 if (a.cls == float_class_zero && b.cls == float_class_zero) {
1007 a.sign = s->float_rounding_mode == float_round_down;
1008 return a;
1009 }
1010 if (a.cls == float_class_zero || b.cls == float_class_inf) {
1011 b.sign = a_sign ^ 1;
1012 return b;
1013 }
1014 if (b.cls == float_class_zero) {
1015 return a;
1016 }
1017 } else {
1018 /* Addition */
1019 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1020 if (a.exp > b.exp) {
1021 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1022 } else if (a.exp < b.exp) {
1023 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1024 a.exp = b.exp;
1025 }
1026
1027 if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1028 shift64RightJamming(a.frac, 1, &a.frac);
1029 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1030 a.exp += 1;
1031 }
1032 return a;
1033 }
1034 if (is_nan(a.cls) || is_nan(b.cls)) {
1035 return pick_nan(a, b, s);
1036 }
1037 if (a.cls == float_class_inf || b.cls == float_class_zero) {
1038 return a;
1039 }
1040 if (b.cls == float_class_inf || a.cls == float_class_zero) {
1041 b.sign = b_sign;
1042 return b;
1043 }
1044 }
1045 g_assert_not_reached();
1046 }
1047
1048 /*
1049 * Returns the result of adding or subtracting the floating-point
1050 * values `a' and `b'. The operation is performed according to the
1051 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1052 */
1053
1054 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1055 {
1056 FloatParts pa = float16_unpack_canonical(a, status);
1057 FloatParts pb = float16_unpack_canonical(b, status);
1058 FloatParts pr = addsub_floats(pa, pb, false, status);
1059
1060 return float16_round_pack_canonical(pr, status);
1061 }
1062
1063 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1064 {
1065 FloatParts pa = float16_unpack_canonical(a, status);
1066 FloatParts pb = float16_unpack_canonical(b, status);
1067 FloatParts pr = addsub_floats(pa, pb, true, status);
1068
1069 return float16_round_pack_canonical(pr, status);
1070 }
1071
1072 static float32 QEMU_SOFTFLOAT_ATTR
1073 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1074 {
1075 FloatParts pa = float32_unpack_canonical(a, status);
1076 FloatParts pb = float32_unpack_canonical(b, status);
1077 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1078
1079 return float32_round_pack_canonical(pr, status);
1080 }
1081
1082 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1083 {
1084 return soft_f32_addsub(a, b, false, status);
1085 }
1086
1087 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1088 {
1089 return soft_f32_addsub(a, b, true, status);
1090 }
1091
1092 static float64 QEMU_SOFTFLOAT_ATTR
1093 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1094 {
1095 FloatParts pa = float64_unpack_canonical(a, status);
1096 FloatParts pb = float64_unpack_canonical(b, status);
1097 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1098
1099 return float64_round_pack_canonical(pr, status);
1100 }
1101
1102 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1103 {
1104 return soft_f64_addsub(a, b, false, status);
1105 }
1106
1107 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1108 {
1109 return soft_f64_addsub(a, b, true, status);
1110 }
1111
/* Host-FPU single-precision addition (the `hard' callback of float32_add). */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1116
/* Host-FPU single-precision subtraction (the `hard' callback of float32_sub). */
static float hard_f32_sub(float a, float b)
{
    float difference = a - b;

    return difference;
}
1121
/* Host-FPU double-precision addition (the `hard' callback of float64_add). */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1126
/* Host-FPU double-precision subtraction (the `hard' callback of float64_sub). */
static double hard_f64_sub(double a, double b)
{
    double difference = a - b;

    return difference;
}
1131
1132 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1133 {
1134 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1135 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1136 }
1137 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1138 }
1139
1140 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1141 {
1142 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1143 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1144 } else {
1145 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1146 }
1147 }
1148
1149 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1150 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1151 {
1152 return float32_gen2(a, b, s, hard, soft,
1153 f32_is_zon2, f32_addsubmul_post);
1154 }
1155
1156 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1157 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1158 {
1159 return float64_gen2(a, b, s, hard, soft,
1160 f64_is_zon2, f64_addsubmul_post);
1161 }
1162
1163 float32 QEMU_FLATTEN
1164 float32_add(float32 a, float32 b, float_status *s)
1165 {
1166 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1167 }
1168
1169 float32 QEMU_FLATTEN
1170 float32_sub(float32 a, float32 b, float_status *s)
1171 {
1172 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1173 }
1174
1175 float64 QEMU_FLATTEN
1176 float64_add(float64 a, float64 b, float_status *s)
1177 {
1178 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1179 }
1180
1181 float64 QEMU_FLATTEN
1182 float64_sub(float64 a, float64 b, float_status *s)
1183 {
1184 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1185 }
1186
1187 /*
1188 * Returns the result of adding or subtracting the bfloat16
1189 * values `a' and `b'.
1190 */
1191 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1192 {
1193 FloatParts pa = bfloat16_unpack_canonical(a, status);
1194 FloatParts pb = bfloat16_unpack_canonical(b, status);
1195 FloatParts pr = addsub_floats(pa, pb, false, status);
1196
1197 return bfloat16_round_pack_canonical(pr, status);
1198 }
1199
1200 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1201 {
1202 FloatParts pa = bfloat16_unpack_canonical(a, status);
1203 FloatParts pb = bfloat16_unpack_canonical(b, status);
1204 FloatParts pr = addsub_floats(pa, pb, true, status);
1205
1206 return bfloat16_round_pack_canonical(pr, status);
1207 }
1208
1209 /*
1210 * Returns the result of multiplying the floating-point values `a' and
1211 * `b'. The operation is performed according to the IEC/IEEE Standard
1212 * for Binary Floating-Point Arithmetic.
1213 */
1214
1215 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1216 {
1217 bool sign = a.sign ^ b.sign;
1218
1219 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1220 uint64_t hi, lo;
1221 int exp = a.exp + b.exp;
1222
1223 mul64To128(a.frac, b.frac, &hi, &lo);
1224 if (hi & DECOMPOSED_IMPLICIT_BIT) {
1225 exp += 1;
1226 } else {
1227 hi <<= 1;
1228 }
1229 hi |= (lo != 0);
1230
1231 /* Re-use a */
1232 a.exp = exp;
1233 a.sign = sign;
1234 a.frac = hi;
1235 return a;
1236 }
1237 /* handle all the NaN cases */
1238 if (is_nan(a.cls) || is_nan(b.cls)) {
1239 return pick_nan(a, b, s);
1240 }
1241 /* Inf * Zero == NaN */
1242 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1243 (a.cls == float_class_zero && b.cls == float_class_inf)) {
1244 s->float_exception_flags |= float_flag_invalid;
1245 return parts_default_nan(s);
1246 }
1247 /* Multiply by 0 or Inf */
1248 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1249 a.sign = sign;
1250 return a;
1251 }
1252 if (b.cls == float_class_inf || b.cls == float_class_zero) {
1253 b.sign = sign;
1254 return b;
1255 }
1256 g_assert_not_reached();
1257 }
1258
1259 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1260 {
1261 FloatParts pa = float16_unpack_canonical(a, status);
1262 FloatParts pb = float16_unpack_canonical(b, status);
1263 FloatParts pr = mul_floats(pa, pb, status);
1264
1265 return float16_round_pack_canonical(pr, status);
1266 }
1267
1268 static float32 QEMU_SOFTFLOAT_ATTR
1269 soft_f32_mul(float32 a, float32 b, float_status *status)
1270 {
1271 FloatParts pa = float32_unpack_canonical(a, status);
1272 FloatParts pb = float32_unpack_canonical(b, status);
1273 FloatParts pr = mul_floats(pa, pb, status);
1274
1275 return float32_round_pack_canonical(pr, status);
1276 }
1277
1278 static float64 QEMU_SOFTFLOAT_ATTR
1279 soft_f64_mul(float64 a, float64 b, float_status *status)
1280 {
1281 FloatParts pa = float64_unpack_canonical(a, status);
1282 FloatParts pb = float64_unpack_canonical(b, status);
1283 FloatParts pr = mul_floats(pa, pb, status);
1284
1285 return float64_round_pack_canonical(pr, status);
1286 }
1287
/* Host-FPU single-precision multiply (the `hard' callback of float32_mul). */
static float hard_f32_mul(float a, float b)
{
    float product = a * b;

    return product;
}
1292
/* Host-FPU double-precision multiply (the `hard' callback of float64_mul). */
static double hard_f64_mul(double a, double b)
{
    double product = a * b;

    return product;
}
1297
1298 float32 QEMU_FLATTEN
1299 float32_mul(float32 a, float32 b, float_status *s)
1300 {
1301 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1302 f32_is_zon2, f32_addsubmul_post);
1303 }
1304
1305 float64 QEMU_FLATTEN
1306 float64_mul(float64 a, float64 b, float_status *s)
1307 {
1308 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1309 f64_is_zon2, f64_addsubmul_post);
1310 }
1311
1312 /*
1313 * Returns the result of multiplying the bfloat16
1314 * values `a' and `b'.
1315 */
1316
1317 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1318 {
1319 FloatParts pa = bfloat16_unpack_canonical(a, status);
1320 FloatParts pb = bfloat16_unpack_canonical(b, status);
1321 FloatParts pr = mul_floats(pa, pb, status);
1322
1323 return bfloat16_round_pack_canonical(pr, status);
1324 }
1325
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * Flags examined here: float_muladd_negate_c, float_muladd_negate_product,
 * float_muladd_negate_result and float_muladd_halve_result (which lowers
 * the result exponent by one).  The result is returned decomposed;
 * rounding and packing are left to the caller.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when {a, b} is exactly one infinity and one zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
        ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN operands: Inf * 0 is unconditionally invalid. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b before computing it. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is c (or a signed zero). */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Zeros of opposite sign: negative only when rounding down. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two fractions, kept in hi:lo. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Product exponent does not exceed c's: align hi to c. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product exponent is larger: align c as a 128-bit value. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Same exponent and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product; the sign flips. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: negative zero only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Renormalize: shift the 128-bit difference left until
                   its leading one sits in the msb of hi, and lower the
                   exponent by the same number of bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold any remaining low bits into a sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1508
1509 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1510 int flags, float_status *status)
1511 {
1512 FloatParts pa = float16_unpack_canonical(a, status);
1513 FloatParts pb = float16_unpack_canonical(b, status);
1514 FloatParts pc = float16_unpack_canonical(c, status);
1515 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1516
1517 return float16_round_pack_canonical(pr, status);
1518 }
1519
1520 static float32 QEMU_SOFTFLOAT_ATTR
1521 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1522 float_status *status)
1523 {
1524 FloatParts pa = float32_unpack_canonical(a, status);
1525 FloatParts pb = float32_unpack_canonical(b, status);
1526 FloatParts pc = float32_unpack_canonical(c, status);
1527 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1528
1529 return float32_round_pack_canonical(pr, status);
1530 }
1531
1532 static float64 QEMU_SOFTFLOAT_ATTR
1533 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1534 float_status *status)
1535 {
1536 FloatParts pa = float64_unpack_canonical(a, status);
1537 FloatParts pb = float64_unpack_canonical(b, status);
1538 FloatParts pc = float64_unpack_canonical(c, status);
1539 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1540
1541 return float64_round_pack_canonical(pr, status);
1542 }
1543
/*
 * When true, float32_muladd and float64_muladd skip the host fmaf()/fma()
 * fast path and always use the software implementation.
 * NOTE(review): where this flag gets set is not visible in this part of
 * the file -- confirm against the initialization code.
 */
static bool force_soft_fma;
1545
1546 float32 QEMU_FLATTEN
1547 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1548 {
1549 union_float32 ua, ub, uc, ur;
1550
1551 ua.s = xa;
1552 ub.s = xb;
1553 uc.s = xc;
1554
1555 if (unlikely(!can_use_fpu(s))) {
1556 goto soft;
1557 }
1558 if (unlikely(flags & float_muladd_halve_result)) {
1559 goto soft;
1560 }
1561
1562 float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1563 if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1564 goto soft;
1565 }
1566
1567 if (unlikely(force_soft_fma)) {
1568 goto soft;
1569 }
1570
1571 /*
1572 * When (a || b) == 0, there's no need to check for under/over flow,
1573 * since we know the addend is (normal || 0) and the product is 0.
1574 */
1575 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1576 union_float32 up;
1577 bool prod_sign;
1578
1579 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1580 prod_sign ^= !!(flags & float_muladd_negate_product);
1581 up.s = float32_set_sign(float32_zero, prod_sign);
1582
1583 if (flags & float_muladd_negate_c) {
1584 uc.h = -uc.h;
1585 }
1586 ur.h = up.h + uc.h;
1587 } else {
1588 union_float32 ua_orig = ua;
1589 union_float32 uc_orig = uc;
1590
1591 if (flags & float_muladd_negate_product) {
1592 ua.h = -ua.h;
1593 }
1594 if (flags & float_muladd_negate_c) {
1595 uc.h = -uc.h;
1596 }
1597
1598 ur.h = fmaf(ua.h, ub.h, uc.h);
1599
1600 if (unlikely(f32_is_inf(ur))) {
1601 s->float_exception_flags |= float_flag_overflow;
1602 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1603 ua = ua_orig;
1604 uc = uc_orig;
1605 goto soft;
1606 }
1607 }
1608 if (flags & float_muladd_negate_result) {
1609 return float32_chs(ur.s);
1610 }
1611 return ur.s;
1612
1613 soft:
1614 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1615 }
1616
1617 float64 QEMU_FLATTEN
1618 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1619 {
1620 union_float64 ua, ub, uc, ur;
1621
1622 ua.s = xa;
1623 ub.s = xb;
1624 uc.s = xc;
1625
1626 if (unlikely(!can_use_fpu(s))) {
1627 goto soft;
1628 }
1629 if (unlikely(flags & float_muladd_halve_result)) {
1630 goto soft;
1631 }
1632
1633 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1634 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1635 goto soft;
1636 }
1637
1638 if (unlikely(force_soft_fma)) {
1639 goto soft;
1640 }
1641
1642 /*
1643 * When (a || b) == 0, there's no need to check for under/over flow,
1644 * since we know the addend is (normal || 0) and the product is 0.
1645 */
1646 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1647 union_float64 up;
1648 bool prod_sign;
1649
1650 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1651 prod_sign ^= !!(flags & float_muladd_negate_product);
1652 up.s = float64_set_sign(float64_zero, prod_sign);
1653
1654 if (flags & float_muladd_negate_c) {
1655 uc.h = -uc.h;
1656 }
1657 ur.h = up.h + uc.h;
1658 } else {
1659 union_float64 ua_orig = ua;
1660 union_float64 uc_orig = uc;
1661
1662 if (flags & float_muladd_negate_product) {
1663 ua.h = -ua.h;
1664 }
1665 if (flags & float_muladd_negate_c) {
1666 uc.h = -uc.h;
1667 }
1668
1669 ur.h = fma(ua.h, ub.h, uc.h);
1670
1671 if (unlikely(f64_is_inf(ur))) {
1672 s->float_exception_flags |= float_flag_overflow;
1673 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1674 ua = ua_orig;
1675 uc = uc_orig;
1676 goto soft;
1677 }
1678 }
1679 if (flags & float_muladd_negate_result) {
1680 return float64_chs(ur.s);
1681 }
1682 return ur.s;
1683
1684 soft:
1685 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1686 }
1687
1688 /*
1689 * Returns the result of multiplying the bfloat16 values `a'
1690 * and `b' then adding 'c', with no intermediate rounding step after the
1691 * multiplication.
1692 */
1693
1694 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1695 int flags, float_status *status)
1696 {
1697 FloatParts pa = bfloat16_unpack_canonical(a, status);
1698 FloatParts pb = bfloat16_unpack_canonical(b, status);
1699 FloatParts pc = bfloat16_unpack_canonical(c, status);
1700 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1701
1702 return bfloat16_round_pack_canonical(pr, status);
1703 }
1704
1705 /*
1706 * Returns the result of dividing the floating-point value `a' by the
1707 * corresponding value `b'. The operation is performed according to
1708 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1709 */
1710
1711 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1712 {
1713 bool sign = a.sign ^ b.sign;
1714
1715 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1716 uint64_t n0, n1, q, r;
1717 int exp = a.exp - b.exp;
1718
1719 /*
1720 * We want a 2*N / N-bit division to produce exactly an N-bit
1721 * result, so that we do not lose any precision and so that we
1722 * do not have to renormalize afterward. If A.frac < B.frac,
1723 * then division would produce an (N-1)-bit result; shift A left
1724 * by one to produce the an N-bit result, and decrement the
1725 * exponent to match.
1726 *
1727 * The udiv_qrnnd algorithm that we're using requires normalization,
1728 * i.e. the msb of the denominator must be set, which is already true.
1729 */
1730 if (a.frac < b.frac) {
1731 exp -= 1;
1732 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1733 } else {
1734 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1735 }
1736 q = udiv_qrnnd(&r, n1, n0, b.frac);
1737
1738 /* Set lsb if there is a remainder, to set inexact. */
1739 a.frac = q | (r != 0);
1740 a.sign = sign;
1741 a.exp = exp;
1742 return a;
1743 }
1744 /* handle all the NaN cases */
1745 if (is_nan(a.cls) || is_nan(b.cls)) {
1746 return pick_nan(a, b, s);
1747 }
1748 /* 0/0 or Inf/Inf */
1749 if (a.cls == b.cls
1750 &&
1751 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1752 s->float_exception_flags |= float_flag_invalid;
1753 return parts_default_nan(s);
1754 }
1755 /* Inf / x or 0 / x */
1756 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1757 a.sign = sign;
1758 return a;
1759 }
1760 /* Div 0 => Inf */
1761 if (b.cls == float_class_zero) {
1762 s->float_exception_flags |= float_flag_divbyzero;
1763 a.cls = float_class_inf;
1764 a.sign = sign;
1765 return a;
1766 }
1767 /* Div by Inf */
1768 if (b.cls == float_class_inf) {
1769 a.cls = float_class_zero;
1770 a.sign = sign;
1771 return a;
1772 }
1773 g_assert_not_reached();
1774 }
1775
1776 float16 float16_div(float16 a, float16 b, float_status *status)
1777 {
1778 FloatParts pa = float16_unpack_canonical(a, status);
1779 FloatParts pb = float16_unpack_canonical(b, status);
1780 FloatParts pr = div_floats(pa, pb, status);
1781
1782 return float16_round_pack_canonical(pr, status);
1783 }
1784
1785 static float32 QEMU_SOFTFLOAT_ATTR
1786 soft_f32_div(float32 a, float32 b, float_status *status)
1787 {
1788 FloatParts pa = float32_unpack_canonical(a, status);
1789 FloatParts pb = float32_unpack_canonical(b, status);
1790 FloatParts pr = div_floats(pa, pb, status);
1791
1792 return float32_round_pack_canonical(pr, status);
1793 }
1794
1795 static float64 QEMU_SOFTFLOAT_ATTR
1796 soft_f64_div(float64 a, float64 b, float_status *status)
1797 {
1798 FloatParts pa = float64_unpack_canonical(a, status);
1799 FloatParts pb = float64_unpack_canonical(b, status);
1800 FloatParts pr = div_floats(pa, pb, status);
1801
1802 return float64_round_pack_canonical(pr, status);
1803 }
1804
/* Host-FPU single-precision division (the `hard' callback of float32_div). */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1809
/* Host-FPU double-precision division (the `hard' callback of float64_div). */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1814
1815 static bool f32_div_pre(union_float32 a, union_float32 b)
1816 {
1817 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1818 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1819 fpclassify(b.h) == FP_NORMAL;
1820 }
1821 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1822 }
1823
1824 static bool f64_div_pre(union_float64 a, union_float64 b)
1825 {
1826 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1827 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1828 fpclassify(b.h) == FP_NORMAL;
1829 }
1830 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1831 }
1832
1833 static bool f32_div_post(union_float32 a, union_float32 b)
1834 {
1835 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1836 return fpclassify(a.h) != FP_ZERO;
1837 }
1838 return !float32_is_zero(a.s);
1839 }
1840
1841 static bool f64_div_post(union_float64 a, union_float64 b)
1842 {
1843 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1844 return fpclassify(a.h) != FP_ZERO;
1845 }
1846 return !float64_is_zero(a.s);
1847 }
1848
1849 float32 QEMU_FLATTEN
1850 float32_div(float32 a, float32 b, float_status *s)
1851 {
1852 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1853 f32_div_pre, f32_div_post);
1854 }
1855
1856 float64 QEMU_FLATTEN
1857 float64_div(float64 a, float64 b, float_status *s)
1858 {
1859 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1860 f64_div_pre, f64_div_post);
1861 }
1862
1863 /*
1864 * Returns the result of dividing the bfloat16
1865 * value `a' by the corresponding value `b'.
1866 */
1867
1868 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1869 {
1870 FloatParts pa = bfloat16_unpack_canonical(a, status);
1871 FloatParts pb = bfloat16_unpack_canonical(b, status);
1872 FloatParts pr = div_floats(pa, pb, status);
1873
1874 return bfloat16_round_pack_canonical(pr, status);
1875 }
1876
1877 /*
1878 * Float to Float conversions
1879 *
1880 * Returns the result of converting one float format to another. The
1881 * conversion is performed according to the IEC/IEEE Standard for
1882 * Binary Floating-Point Arithmetic.
1883 *
1884 * The float_to_float helper only needs to take care of raising
1885 * invalid exceptions and handling the conversion on NaNs.
1886 */
1887
1888 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1889 float_status *s)
1890 {
1891 if (dstf->arm_althp) {
1892 switch (a.cls) {
1893 case float_class_qnan:
1894 case float_class_snan:
1895 /* There is no NaN in the destination format. Raise Invalid
1896 * and return a zero with the sign of the input NaN.
1897 */
1898 s->float_exception_flags |= float_flag_invalid;
1899 a.cls = float_class_zero;
1900 a.frac = 0;
1901 a.exp = 0;
1902 break;
1903
1904 case float_class_inf:
1905 /* There is no Inf in the destination format. Raise Invalid
1906 * and return the maximum normal with the correct sign.
1907 */
1908 s->float_exception_flags |= float_flag_invalid;
1909 a.cls = float_class_normal;
1910 a.exp = dstf->exp_max;
1911 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1912 break;
1913
1914 default:
1915 break;
1916 }
1917 } else if (is_nan(a.cls)) {
1918 if (is_snan(a.cls)) {
1919 s->float_exception_flags |= float_flag_invalid;
1920 a = parts_silence_nan(a, s);
1921 }
1922 if (s->default_nan_mode) {
1923 return parts_default_nan(s);
1924 }
1925 }
1926 return a;
1927 }
1928
1929 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1930 {
1931 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1932 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1933 FloatParts pr = float_to_float(p, &float32_params, s);
1934 return float32_round_pack_canonical(pr, s);
1935 }
1936
1937 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1938 {
1939 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1940 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1941 FloatParts pr = float_to_float(p, &float64_params, s);
1942 return float64_round_pack_canonical(pr, s);
1943 }
1944
1945 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1946 {
1947 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1948 FloatParts p = float32_unpack_canonical(a, s);
1949 FloatParts pr = float_to_float(p, fmt16, s);
1950 return float16a_round_pack_canonical(pr, s, fmt16);
1951 }
1952
1953 static float64 QEMU_SOFTFLOAT_ATTR
1954 soft_float32_to_float64(float32 a, float_status *s)
1955 {
1956 FloatParts p = float32_unpack_canonical(a, s);
1957 FloatParts pr = float_to_float(p, &float64_params, s);
1958 return float64_round_pack_canonical(pr, s);
1959 }
1960
1961 float64 float32_to_float64(float32 a, float_status *s)
1962 {
1963 if (likely(float32_is_normal(a))) {
1964 /* Widening conversion can never produce inexact results. */
1965 union_float32 uf;
1966 union_float64 ud;
1967 uf.s = a;
1968 ud.h = uf.h;
1969 return ud.s;
1970 } else if (float32_is_zero(a)) {
1971 return float64_set_sign(float64_zero, float32_is_neg(a));
1972 } else {
1973 return soft_float32_to_float64(a, s);
1974 }
1975 }
1976
1977 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1978 {
1979 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1980 FloatParts p = float64_unpack_canonical(a, s);
1981 FloatParts pr = float_to_float(p, fmt16, s);
1982 return float16a_round_pack_canonical(pr, s, fmt16);
1983 }
1984
1985 float32 float64_to_float32(float64 a, float_status *s)
1986 {
1987 FloatParts p = float64_unpack_canonical(a, s);
1988 FloatParts pr = float_to_float(p, &float32_params, s);
1989 return float32_round_pack_canonical(pr, s);
1990 }
1991
1992 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
1993 {
1994 FloatParts p = bfloat16_unpack_canonical(a, s);
1995 FloatParts pr = float_to_float(p, &float32_params, s);
1996 return float32_round_pack_canonical(pr, s);
1997 }
1998
1999 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2000 {
2001 FloatParts p = bfloat16_unpack_canonical(a, s);
2002 FloatParts pr = float_to_float(p, &float64_params, s);
2003 return float64_round_pack_canonical(pr, s);
2004 }
2005
2006 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2007 {
2008 FloatParts p = float32_unpack_canonical(a, s);
2009 FloatParts pr = float_to_float(p, &bfloat16_params, s);
2010 return bfloat16_round_pack_canonical(pr, s);
2011 }
2012
2013 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2014 {
2015 FloatParts p = float64_unpack_canonical(a, s);
2016 FloatParts pr = float_to_float(p, &bfloat16_params, s);
2017 return bfloat16_round_pack_canonical(pr, s);
2018 }
2019
2020 /*
2021 * Rounds the floating-point value `a' to an integer, and returns the
2022 * result as a floating-point value. The operation is performed
2023 * according to the IEC/IEEE Standard for Binary Floating-Point
2024 * Arithmetic.
2025 */
2026
2027 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
2028 int scale, float_status *s)
2029 {
2030 switch (a.cls) {
2031 case float_class_qnan:
2032 case float_class_snan:
2033 return return_nan(a, s);
2034
2035 case float_class_zero:
2036 case float_class_inf:
2037 /* already "integral" */
2038 break;
2039
2040 case float_class_normal:
2041 scale = MIN(MAX(scale, -0x10000), 0x10000);
2042 a.exp += scale;
2043
2044 if (a.exp >= DECOMPOSED_BINARY_POINT) {
2045 /* already integral */
2046 break;
2047 }
2048 if (a.exp < 0) {
2049 bool one;
2050 /* all fractional */
2051 s->float_exception_flags |= float_flag_inexact;
2052 switch (rmode) {
2053 case float_round_nearest_even:
2054 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2055 break;
2056 case float_round_ties_away:
2057 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2058 break;
2059 case float_round_to_zero:
2060 one = false;
2061 break;
2062 case float_round_up:
2063 one = !a.sign;
2064 break;
2065 case float_round_down:
2066 one = a.sign;
2067 break;
2068 case float_round_to_odd:
2069 one = true;
2070 break;
2071 default:
2072 g_assert_not_reached();
2073 }
2074
2075 if (one) {
2076 a.frac = DECOMPOSED_IMPLICIT_BIT;
2077 a.exp = 0;
2078 } else {
2079 a.cls = float_class_zero;
2080 }
2081 } else {
2082 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2083 uint64_t frac_lsbm1 = frac_lsb >> 1;
2084 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2085 uint64_t rnd_mask = rnd_even_mask >> 1;
2086 uint64_t inc;
2087
2088 switch (rmode) {
2089 case float_round_nearest_even:
2090 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2091 break;
2092 case float_round_ties_away:
2093 inc = frac_lsbm1;
2094 break;
2095 case float_round_to_zero:
2096 inc = 0;
2097 break;
2098 case float_round_up:
2099 inc = a.sign ? 0 : rnd_mask;
2100 break;
2101 case float_round_down:
2102 inc = a.sign ? rnd_mask : 0;
2103 break;
2104 case float_round_to_odd:
2105 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2106 break;
2107 default:
2108 g_assert_not_reached();
2109 }
2110
2111 if (a.frac & rnd_mask) {
2112 s->float_exception_flags |= float_flag_inexact;
2113 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2114 a.frac >>= 1;
2115 a.frac |= DECOMPOSED_IMPLICIT_BIT;
2116 a.exp++;
2117 }
2118 a.frac &= ~rnd_mask;
2119 }
2120 }
2121 break;
2122 default:
2123 g_assert_not_reached();
2124 }
2125 return a;
2126 }
2127
2128 float16 float16_round_to_int(float16 a, float_status *s)
2129 {
2130 FloatParts pa = float16_unpack_canonical(a, s);
2131 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2132 return float16_round_pack_canonical(pr, s);
2133 }
2134
2135 float32 float32_round_to_int(float32 a, float_status *s)
2136 {
2137 FloatParts pa = float32_unpack_canonical(a, s);
2138 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2139 return float32_round_pack_canonical(pr, s);
2140 }
2141
2142 float64 float64_round_to_int(float64 a, float_status *s)
2143 {
2144 FloatParts pa = float64_unpack_canonical(a, s);
2145 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2146 return float64_round_pack_canonical(pr, s);
2147 }
2148
2149 /*
2150 * Rounds the bfloat16 value `a' to an integer, and returns the
2151 * result as a bfloat16 value.
2152 */
2153
2154 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2155 {
2156 FloatParts pa = bfloat16_unpack_canonical(a, s);
2157 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2158 return bfloat16_round_pack_canonical(pr, s);
2159 }
2160
2161 /*
2162 * Returns the result of converting the floating-point value `a' to
2163 * the two's complement integer format. The conversion is performed
2164 * according to the IEC/IEEE Standard for Binary Floating-Point
2165 * Arithmetic---which means in particular that the conversion is
2166 * rounded according to the current rounding mode. If `a' is a NaN,
2167 * the largest positive integer is returned. Otherwise, if the
2168 * conversion overflows, the largest integer with the same sign as `a'
2169 * is returned.
2170 */
2171
2172 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2173 int scale, int64_t min, int64_t max,
2174 float_status *s)
2175 {
2176 uint64_t r;
2177 int orig_flags = get_float_exception_flags(s);
2178 FloatParts p = round_to_int(in, rmode, scale, s);
2179
2180 switch (p.cls) {
2181 case float_class_snan:
2182 case float_class_qnan:
2183 s->float_exception_flags = orig_flags | float_flag_invalid;
2184 return max;
2185 case float_class_inf:
2186 s->float_exception_flags = orig_flags | float_flag_invalid;
2187 return p.sign ? min : max;
2188 case float_class_zero:
2189 return 0;
2190 case float_class_normal:
2191 if (p.exp <= DECOMPOSED_BINARY_POINT) {
2192 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2193 } else {
2194 r = UINT64_MAX;
2195 }
2196 if (p.sign) {
2197 if (r <= -(uint64_t) min) {
2198 return -r;
2199 } else {
2200 s->float_exception_flags = orig_flags | float_flag_invalid;
2201 return min;
2202 }
2203 } else {
2204 if (r <= max) {
2205 return r;
2206 } else {
2207 s->float_exception_flags = orig_flags | float_flag_invalid;
2208 return max;
2209 }
2210 }
2211 default:
2212 g_assert_not_reached();
2213 }
2214 }
2215
2216 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2217 float_status *s)
2218 {
2219 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2220 rmode, scale, INT8_MIN, INT8_MAX, s);
2221 }
2222
2223 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2224 float_status *s)
2225 {
2226 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2227 rmode, scale, INT16_MIN, INT16_MAX, s);
2228 }
2229
2230 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2231 float_status *s)
2232 {
2233 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2234 rmode, scale, INT32_MIN, INT32_MAX, s);
2235 }
2236
2237 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2238 float_status *s)
2239 {
2240 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2241 rmode, scale, INT64_MIN, INT64_MAX, s);
2242 }
2243
2244 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2245 float_status *s)
2246 {
2247 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2248 rmode, scale, INT16_MIN, INT16_MAX, s);
2249 }
2250
2251 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2252 float_status *s)
2253 {
2254 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2255 rmode, scale, INT32_MIN, INT32_MAX, s);
2256 }
2257
2258 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2259 float_status *s)
2260 {
2261 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2262 rmode, scale, INT64_MIN, INT64_MAX, s);
2263 }
2264
2265 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2266 float_status *s)
2267 {
2268 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2269 rmode, scale, INT16_MIN, INT16_MAX, s);
2270 }
2271
2272 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2273 float_status *s)
2274 {
2275 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2276 rmode, scale, INT32_MIN, INT32_MAX, s);
2277 }
2278
2279 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2280 float_status *s)
2281 {
2282 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2283 rmode, scale, INT64_MIN, INT64_MAX, s);
2284 }
2285
2286 int8_t float16_to_int8(float16 a, float_status *s)
2287 {
2288 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2289 }
2290
2291 int16_t float16_to_int16(float16 a, float_status *s)
2292 {
2293 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2294 }
2295
2296 int32_t float16_to_int32(float16 a, float_status *s)
2297 {
2298 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2299 }
2300
2301 int64_t float16_to_int64(float16 a, float_status *s)
2302 {
2303 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2304 }
2305
2306 int16_t float32_to_int16(float32 a, float_status *s)
2307 {
2308 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2309 }
2310
2311 int32_t float32_to_int32(float32 a, float_status *s)
2312 {
2313 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2314 }
2315
2316 int64_t float32_to_int64(float32 a, float_status *s)
2317 {
2318 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2319 }
2320
2321 int16_t float64_to_int16(float64 a, float_status *s)
2322 {
2323 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2324 }
2325
2326 int32_t float64_to_int32(float64 a, float_status *s)
2327 {
2328 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2329 }
2330
2331 int64_t float64_to_int64(float64 a, float_status *s)
2332 {
2333 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2334 }
2335
2336 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2337 {
2338 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2339 }
2340
2341 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2342 {
2343 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2344 }
2345
2346 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2347 {
2348 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2349 }
2350
2351 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2352 {
2353 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2354 }
2355
2356 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2357 {
2358 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2359 }
2360
2361 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2362 {
2363 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2364 }
2365
2366 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2367 {
2368 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2369 }
2370
2371 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2372 {
2373 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2374 }
2375
2376 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2377 {
2378 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2379 }
2380
2381 /*
2382 * Returns the result of converting the floating-point value `a' to
2383 * the two's complement integer format.
2384 */
2385
2386 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2387 float_status *s)
2388 {
2389 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2390 rmode, scale, INT16_MIN, INT16_MAX, s);
2391 }
2392
2393 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2394 float_status *s)
2395 {
2396 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2397 rmode, scale, INT32_MIN, INT32_MAX, s);
2398 }
2399
2400 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2401 float_status *s)
2402 {
2403 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2404 rmode, scale, INT64_MIN, INT64_MAX, s);
2405 }
2406
2407 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2408 {
2409 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2410 }
2411
2412 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2413 {
2414 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2415 }
2416
2417 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2418 {
2419 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2420 }
2421
2422 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2423 {
2424 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2425 }
2426
2427 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2428 {
2429 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2430 }
2431
2432 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2433 {
2434 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2435 }
2436
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
 * negative values that do not round to zero raise the invalid
 * exception flag.
 */
2449
2450 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2451 int scale, uint64_t max,
2452 float_status *s)
2453 {
2454 int orig_flags = get_float_exception_flags(s);
2455 FloatParts p = round_to_int(in, rmode, scale, s);
2456 uint64_t r;
2457
2458 switch (p.cls) {
2459 case float_class_snan:
2460 case float_class_qnan:
2461 s->float_exception_flags = orig_flags | float_flag_invalid;
2462 return max;
2463 case float_class_inf:
2464 s->float_exception_flags = orig_flags | float_flag_invalid;
2465 return p.sign ? 0 : max;
2466 case float_class_zero:
2467 return 0;
2468 case float_class_normal:
2469 if (p.sign) {
2470 s->float_exception_flags = orig_flags | float_flag_invalid;
2471 return 0;
2472 }
2473
2474 if (p.exp <= DECOMPOSED_BINARY_POINT) {
2475 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2476 } else {
2477 s->float_exception_flags = orig_flags | float_flag_invalid;
2478 return max;
2479 }
2480
2481 /* For uint64 this will never trip, but if p.exp is too large
2482 * to shift a decomposed fraction we shall have exited via the
2483 * 3rd leg above.
2484 */
2485 if (r > max) {
2486 s->float_exception_flags = orig_flags | float_flag_invalid;
2487 return max;
2488 }
2489 return r;
2490 default:
2491 g_assert_not_reached();
2492 }
2493 }
2494
2495 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2496 float_status *s)
2497 {
2498 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2499 rmode, scale, UINT8_MAX, s);
2500 }
2501
2502 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2503 float_status *s)
2504 {
2505 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2506 rmode, scale, UINT16_MAX, s);
2507 }
2508
2509 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2510 float_status *s)
2511 {
2512 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2513 rmode, scale, UINT32_MAX, s);
2514 }
2515
2516 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2517 float_status *s)
2518 {
2519 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2520 rmode, scale, UINT64_MAX, s);
2521 }
2522
2523 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2524 float_status *s)
2525 {
2526 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2527 rmode, scale, UINT16_MAX, s);
2528 }
2529
2530 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2531 float_status *s)
2532 {
2533 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2534 rmode, scale, UINT32_MAX, s);
2535 }
2536
2537 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2538 float_status *s)
2539 {
2540 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2541 rmode, scale, UINT64_MAX, s);
2542 }
2543
2544 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2545 float_status *s)
2546 {
2547 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2548 rmode, scale, UINT16_MAX, s);
2549 }
2550
2551 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2552 float_status *s)
2553 {
2554 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2555 rmode, scale, UINT32_MAX, s);
2556 }
2557
2558 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2559 float_status *s)
2560 {
2561 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2562 rmode, scale, UINT64_MAX, s);
2563 }
2564
2565 uint8_t float16_to_uint8(float16 a, float_status *s)
2566 {
2567 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2568 }
2569
2570 uint16_t float16_to_uint16(float16 a, float_status *s)
2571 {
2572 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2573 }
2574
2575 uint32_t float16_to_uint32(float16 a, float_status *s)
2576 {
2577 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2578 }
2579
2580 uint64_t float16_to_uint64(float16 a, float_status *s)
2581 {
2582 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2583 }
2584
2585 uint16_t float32_to_uint16(float32 a, float_status *s)
2586 {
2587 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2588 }
2589
2590 uint32_t float32_to_uint32(float32 a, float_status *s)
2591 {
2592 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2593 }
2594
2595 uint64_t float32_to_uint64(float32 a, float_status *s)
2596 {
2597 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2598 }
2599
2600 uint16_t float64_to_uint16(float64 a, float_status *s)
2601 {
2602 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2603 }
2604
2605 uint32_t float64_to_uint32(float64 a, float_status *s)
2606 {
2607 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2608 }
2609
2610 uint64_t float64_to_uint64(float64 a, float_status *s)
2611 {
2612 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2613 }
2614
2615 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2616 {
2617 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2618 }
2619
2620 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2621 {
2622 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2623 }
2624
2625 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2626 {
2627 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2628 }
2629
2630 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2631 {
2632 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2633 }
2634
2635 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2636 {
2637 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2638 }
2639
2640 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2641 {
2642 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2643 }
2644
2645 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2646 {
2647 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2648 }
2649
2650 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2651 {
2652 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2653 }
2654
2655 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2656 {
2657 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2658 }
2659
2660 /*
2661 * Returns the result of converting the bfloat16 value `a' to
2662 * the unsigned integer format.
2663 */
2664
2665 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2666 int scale, float_status *s)
2667 {
2668 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2669 rmode, scale, UINT16_MAX, s);
2670 }
2671
2672 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2673 int scale, float_status *s)
2674 {
2675 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2676 rmode, scale, UINT32_MAX, s);
2677 }
2678
2679 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2680 int scale, float_status *s)
2681 {
2682 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2683 rmode, scale, UINT64_MAX, s);
2684 }
2685
2686 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2687 {
2688 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2689 }
2690
2691 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2692 {
2693 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2694 }
2695
2696 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2697 {
2698 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2699 }
2700
2701 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2702 {
2703 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2704 }
2705
2706 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2707 {
2708 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2709 }
2710
2711 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2712 {
2713 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2714 }
2715
2716 /*
2717 * Integer to float conversions
2718 *
2719 * Returns the result of converting the two's complement integer `a'
2720 * to the floating-point format. The conversion is performed according
2721 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2722 */
2723
2724 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2725 {
2726 FloatParts r = { .sign = false };
2727
2728 if (a == 0) {
2729 r.cls = float_class_zero;
2730 } else {
2731 uint64_t f = a;
2732 int shift;
2733
2734 r.cls = float_class_normal;
2735 if (a < 0) {
2736 f = -f;
2737 r.sign = true;
2738 }
2739 shift = clz64(f);
2740 scale = MIN(MAX(scale, -0x10000), 0x10000);
2741
2742 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2743 r.frac = f << shift;
2744 }
2745
2746 return r;
2747 }
2748
2749 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2750 {
2751 FloatParts pa = int_to_float(a, scale, status);
2752 return float16_round_pack_canonical(pa, status);
2753 }
2754
2755 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2756 {
2757 return int64_to_float16_scalbn(a, scale, status);
2758 }
2759
2760 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2761 {
2762 return int64_to_float16_scalbn(a, scale, status);
2763 }
2764
2765 float16 int64_to_float16(int64_t a, float_status *status)
2766 {
2767 return int64_to_float16_scalbn(a, 0, status);
2768 }
2769
2770 float16 int32_to_float16(int32_t a, float_status *status)
2771 {
2772 return int64_to_float16_scalbn(a, 0, status);
2773 }
2774
2775 float16 int16_to_float16(int16_t a, float_status *status)
2776 {
2777 return int64_to_float16_scalbn(a, 0, status);
2778 }
2779
2780 float16 int8_to_float16(int8_t a, float_status *status)
2781 {
2782 return int64_to_float16_scalbn(a, 0, status);
2783 }
2784
2785 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2786 {
2787 FloatParts pa = int_to_float(a, scale, status);
2788 return float32_round_pack_canonical(pa, status);
2789 }
2790
2791 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2792 {
2793 return int64_to_float32_scalbn(a, scale, status);
2794 }
2795
2796 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2797 {
2798 return int64_to_float32_scalbn(a, scale, status);
2799 }
2800
2801 float32 int64_to_float32(int64_t a, float_status *status)
2802 {
2803 return int64_to_float32_scalbn(a, 0, status);
2804 }
2805
2806 float32 int32_to_float32(int32_t a, float_status *status)
2807 {
2808 return int64_to_float32_scalbn(a, 0, status);
2809 }
2810
2811 float32 int16_to_float32(int16_t a, float_status *status)
2812 {
2813 return int64_to_float32_scalbn(a, 0, status);
2814 }
2815
2816 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2817 {
2818 FloatParts pa = int_to_float(a, scale, status);
2819 return float64_round_pack_canonical(pa, status);
2820 }
2821
2822 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2823 {
2824 return int64_to_float64_scalbn(a, scale, status);
2825 }
2826
2827 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2828 {
2829 return int64_to_float64_scalbn(a, scale, status);
2830 }
2831
2832 float64 int64_to_float64(int64_t a, float_status *status)
2833 {
2834 return int64_to_float64_scalbn(a, 0, status);
2835 }
2836
2837 float64 int32_to_float64(int32_t a, float_status *status)
2838 {
2839 return int64_to_float64_scalbn(a, 0, status);
2840 }
2841
2842 float64 int16_to_float64(int16_t a, float_status *status)
2843 {
2844 return int64_to_float64_scalbn(a, 0, status);
2845 }
2846
2847 /*
2848 * Returns the result of converting the two's complement integer `a'
2849 * to the bfloat16 format.
2850 */
2851
2852 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2853 {
2854 FloatParts pa = int_to_float(a, scale, status);
2855 return bfloat16_round_pack_canonical(pa, status);
2856 }
2857
2858 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2859 {
2860 return int64_to_bfloat16_scalbn(a, scale, status);
2861 }
2862
2863 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2864 {
2865 return int64_to_bfloat16_scalbn(a, scale, status);
2866 }
2867
2868 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2869 {
2870 return int64_to_bfloat16_scalbn(a, 0, status);
2871 }
2872
2873 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2874 {
2875 return int64_to_bfloat16_scalbn(a, 0, status);
2876 }
2877
2878 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2879 {
2880 return int64_to_bfloat16_scalbn(a, 0, status);
2881 }
2882
2883 /*
2884 * Unsigned Integer to float conversions
2885 *
2886 * Returns the result of converting the unsigned integer `a' to the
2887 * floating-point format. The conversion is performed according to the
2888 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2889 */
2890
2891 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2892 {
2893 FloatParts r = { .sign = false };
2894 int shift;
2895
2896 if (a == 0) {
2897 r.cls = float_class_zero;
2898 } else {
2899 scale = MIN(MAX(scale, -0x10000), 0x10000);
2900 shift = clz64(a);
2901 r.cls = float_class_normal;
2902 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2903 r.frac = a << shift;
2904 }
2905
2906 return r;
2907 }
2908
2909 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2910 {
2911 FloatParts pa = uint_to_float(a, scale, status);
2912 return float16_round_pack_canonical(pa, status);
2913 }
2914
2915 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2916 {
2917 return uint64_to_float16_scalbn(a, scale, status);
2918 }
2919
2920 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2921 {
2922 return uint64_to_float16_scalbn(a, scale, status);
2923 }
2924
2925 float16 uint64_to_float16(uint64_t a, float_status *status)
2926 {
2927 return uint64_to_float16_scalbn(a, 0, status);
2928 }
2929
2930 float16 uint32_to_float16(uint32_t a, float_status *status)
2931 {
2932 return uint64_to_float16_scalbn(a, 0, status);
2933 }
2934
2935 float16 uint16_to_float16(uint16_t a, float_status *status)
2936 {
2937 return uint64_to_float16_scalbn(a, 0, status);
2938 }
2939
2940 float16 uint8_to_float16(uint8_t a, float_status *status)
2941 {
2942 return uint64_to_float16_scalbn(a, 0, status);
2943 }
2944
2945 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2946 {
2947 FloatParts pa = uint_to_float(a, scale, status);
2948 return float32_round_pack_canonical(pa, status);
2949 }
2950
2951 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2952 {
2953 return uint64_to_float32_scalbn(a, scale, status);
2954 }
2955
2956 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2957 {
2958 return uint64_to_float32_scalbn(a, scale, status);
2959 }
2960
2961 float32 uint64_to_float32(uint64_t a, float_status *status)
2962 {
2963 return uint64_to_float32_scalbn(a, 0, status);
2964 }
2965
2966 float32 uint32_to_float32(uint32_t a, float_status *status)
2967 {
2968 return uint64_to_float32_scalbn(a, 0, status);
2969 }
2970
2971 float32 uint16_to_float32(uint16_t a, float_status *status)
2972 {
2973 return uint64_to_float32_scalbn(a, 0, status);
2974 }
2975
2976 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2977 {
2978 FloatParts pa = uint_to_float(a, scale, status);
2979 return float64_round_pack_canonical(pa, status);
2980 }
2981
2982 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2983 {
2984 return uint64_to_float64_scalbn(a, scale, status);
2985 }
2986
2987 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2988 {
2989 return uint64_to_float64_scalbn(a, scale, status);
2990 }
2991
2992 float64 uint64_to_float64(uint64_t a, float_status *status)
2993 {
2994 return uint64_to_float64_scalbn(a, 0, status);
2995 }
2996
2997 float64 uint32_to_float64(uint32_t a, float_status *status)
2998 {
2999 return uint64_to_float64_scalbn(a, 0, status);
3000 }
3001
3002 float64 uint16_to_float64(uint16_t a, float_status *status)
3003 {
3004 return uint64_to_float64_scalbn(a, 0, status);
3005 }
3006
3007 /*
3008 * Returns the result of converting the unsigned integer `a' to the
3009 * bfloat16 format.
3010 */
3011
3012 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3013 {
3014 FloatParts pa = uint_to_float(a, scale, status);
3015 return bfloat16_round_pack_canonical(pa, status);
3016 }
3017
3018 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3019 {
3020 return uint64_to_bfloat16_scalbn(a, scale, status);
3021 }
3022
3023 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3024 {
3025 return uint64_to_bfloat16_scalbn(a, scale, status);
3026 }
3027
3028 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3029 {
3030 return uint64_to_bfloat16_scalbn(a, 0, status);
3031 }
3032
3033 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3034 {
3035 return uint64_to_bfloat16_scalbn(a, 0, status);
3036 }
3037
3038 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3039 {
3040 return uint64_to_bfloat16_scalbn(a, 0, status);
3041 }
3042
/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
3059 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
3060 bool ieee, bool ismag, float_status *s)
3061 {
3062 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3063 if (ieee) {
3064 /* Takes two floating-point values `a' and `b', one of
3065 * which is a NaN, and returns the appropriate NaN
3066 * result. If either `a' or `b' is a signaling NaN,
3067 * the invalid exception is raised.
3068 */
3069 if (is_snan(a.cls) || is_snan(b.cls)) {
3070 return pick_nan(a, b, s);
3071 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3072 return b;
3073 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3074 return a;
3075 }
3076 }
3077 return pick_nan(a, b, s);
3078 } else {
3079 int a_exp, b_exp;
3080
3081 switch (a.cls) {
3082 case float_class_normal:
3083 a_exp = a.exp;
3084 break;
3085 case float_class_inf:
3086 a_exp = INT_MAX;
3087 break;
3088 case float_class_zero:
3089 a_exp = INT_MIN;
3090 break;
3091 default:
3092 g_assert_not_reached();
3093 break;
3094 }
3095 switch (b.cls) {
3096 case float_class_normal:
3097 b_exp = b.exp;
3098 break;
3099 case float_class_inf:
3100 b_exp = INT_MAX;
3101 break;
3102 case float_class_zero:
3103 b_exp = INT_MIN;
3104 break;
3105 default:
3106 g_assert_not_reached();
3107 break;
3108 }
3109
3110 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3111 bool a_less = a_exp < b_exp;
3112 if (a_exp == b_exp) {
3113 a_less = a.frac < b.frac;
3114 }
3115 return a_less ^ ismin ? b : a;
3116 }
3117
3118 if (a.sign == b.sign) {
3119 bool a_less = a_exp < b_exp;
3120 if (a_exp == b_exp) {
3121 a_less = a.frac < b.frac;
3122 }
3123 return a.sign ^ a_less ^ ismin ? b : a;
3124 } else {
3125 return a.sign ^ ismin ? b : a;
3126 }
3127 }
3128 }
3129
3130 #define MINMAX(sz, name, ismin, isiee, ismag) \
3131 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3132 float_status *s) \
3133 { \
3134 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3135 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3136 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3137 \
3138 return float ## sz ## _round_pack_canonical(pr, s); \
3139 }
3140
3141 MINMAX(16, min, true, false, false)
3142 MINMAX(16, minnum, true, true, false)
3143 MINMAX(16, minnummag, true, true, true)
3144 MINMAX(16, max, false, false, false)
3145 MINMAX(16, maxnum, false, true, false)
3146 MINMAX(16, maxnummag, false, true, true)
3147
3148 MINMAX(32, min, true, false, false)
3149 MINMAX(32, minnum, true, true, false)
3150 MINMAX(32, minnummag, true, true, true)
3151 MINMAX(32, max, false, false, false)
3152 MINMAX(32, maxnum, false, true, false)
3153 MINMAX(32, maxnummag, false, true, true)
3154
3155 MINMAX(64, min, true, false, false)
3156 MINMAX(64, minnum, true, true, false)
3157 MINMAX(64, minnummag, true, true, true)
3158 MINMAX(64, max, false, false, false)
3159 MINMAX(64, maxnum, false, true, false)
3160 MINMAX(64, maxnummag, false, true, true)
3161
3162 #undef MINMAX
3163
3164 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3165 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3166 { \
3167 FloatParts pa = bfloat16_unpack_canonical(a, s); \
3168 FloatParts pb = bfloat16_unpack_canonical(b, s); \
3169 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3170 \
3171 return bfloat16_round_pack_canonical(pr, s); \
3172 }
3173
3174 BF16_MINMAX(min, true, false, false)
3175 BF16_MINMAX(minnum, true, true, false)
3176 BF16_MINMAX(minnummag, true, true, true)
3177 BF16_MINMAX(max, false, false, false)
3178 BF16_MINMAX(maxnum, false, true, false)
3179 BF16_MINMAX(maxnummag, false, true, true)
3180
3181 #undef BF16_MINMAX
3182
3183 /* Floating point compare */
3184 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
3185 float_status *s)
3186 {
3187 if (is_nan(a.cls) || is_nan(b.cls)) {
3188 if (!is_quiet ||
3189 a.cls == float_class_snan ||
3190 b.cls == float_class_snan) {
3191 s->float_exception_flags |= float_flag_invalid;
3192 }
3193 return float_relation_unordered;
3194 }
3195
3196 if (a.cls == float_class_zero) {
3197 if (b.cls == float_class_zero) {
3198 return float_relation_equal;
3199 }
3200 return b.sign ? float_relation_greater : float_relation_less;
3201 } else if (b.cls == float_class_zero) {
3202 return a.sign ? float_relation_less : float_relation_greater;
3203 }
3204
3205 /* The only really important thing about infinity is its sign. If
3206 * both are infinities the sign marks the smallest of the two.
3207 */
3208 if (a.cls == float_class_inf) {
3209 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3210 return float_relation_equal;
3211 }
3212 return a.sign ? float_relation_less : float_relation_greater;
3213 } else if (b.cls == float_class_inf) {
3214 return b.sign ? float_relation_greater : float_relation_less;
3215 }
3216
3217 if (a.sign != b.sign) {
3218 return a.sign ? float_relation_less : float_relation_greater;
3219 }
3220
3221 if (a.exp == b.exp) {
3222 if (a.frac == b.frac) {
3223 return float_relation_equal;
3224 }
3225 if (a.sign) {
3226 return a.frac > b.frac ?
3227 float_relation_less : float_relation_greater;
3228 } else {
3229 return a.frac > b.frac ?
3230 float_relation_greater : float_relation_less;
3231 }
3232 } else {
3233 if (a.sign) {
3234 return a.exp > b.exp ? float_relation_less : float_relation_greater;
3235 } else {
3236 return a.exp > b.exp ? float_relation_greater : float_relation_less;
3237 }
3238 }
3239 }
3240
3241 #define COMPARE(name, attr, sz) \
3242 static int attr \
3243 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3244 { \
3245 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3246 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3247 return compare_floats(pa, pb, is_quiet, s); \
3248 }
3249
3250 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3251 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3252 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3253
3254 #undef COMPARE
3255
3256 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3257 {
3258 return soft_f16_compare(a, b, false, s);
3259 }
3260
3261 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3262 {
3263 return soft_f16_compare(a, b, true, s);
3264 }
3265
3266 static FloatRelation QEMU_FLATTEN
3267 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3268 {
3269 union_float32 ua, ub;
3270
3271 ua.s = xa;
3272 ub.s = xb;
3273
3274 if (QEMU_NO_HARDFLOAT) {
3275 goto soft;
3276 }
3277
3278 float32_input_flush2(&ua.s, &ub.s, s);
3279 if (isgreaterequal(ua.h, ub.h)) {
3280 if (isgreater(ua.h, ub.h)) {
3281 return float_relation_greater;
3282 }
3283 return float_relation_equal;
3284 }
3285 if (likely(isless(ua.h, ub.h))) {
3286 return float_relation_less;
3287 }
3288 /* The only condition remaining is unordered.
3289 * Fall through to set flags.
3290 */
3291 soft:
3292 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3293 }
3294
3295 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3296 {
3297 return f32_compare(a, b, false, s);
3298 }
3299
3300 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3301 {
3302 return f32_compare(a, b, true, s);
3303 }
3304
3305 static FloatRelation QEMU_FLATTEN
3306 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3307 {
3308 union_float64 ua, ub;
3309
3310 ua.s = xa;
3311 ub.s = xb;
3312
3313 if (QEMU_NO_HARDFLOAT) {
3314 goto soft;
3315 }
3316
3317 float64_input_flush2(&ua.s, &ub.s, s);
3318 if (isgreaterequal(ua.h, ub.h)) {
3319 if (isgreater(ua.h, ub.h)) {
3320 return float_relation_greater;
3321 }
3322 return float_relation_equal;
3323 }
3324 if (likely(isless(ua.h, ub.h))) {
3325 return float_relation_less;
3326 }
3327 /* The only condition remaining is unordered.
3328 * Fall through to set flags.
3329 */
3330 soft:
3331 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3332 }
3333
3334 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3335 {
3336 return f64_compare(a, b, false, s);
3337 }
3338
3339 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3340 {
3341 return f64_compare(a, b, true, s);
3342 }
3343
3344 static FloatRelation QEMU_FLATTEN
3345 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3346 {
3347 FloatParts pa = bfloat16_unpack_canonical(a, s);
3348 FloatParts pb = bfloat16_unpack_canonical(b, s);
3349 return compare_floats(pa, pb, is_quiet, s);
3350 }
3351
3352 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3353 {
3354 return soft_bf16_compare(a, b, false, s);
3355 }
3356
3357 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3358 {
3359 return soft_bf16_compare(a, b, true, s);
3360 }
3361
3362 /* Multiply A by 2 raised to the power N. */
3363 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3364 {
3365 if (unlikely(is_nan(a.cls))) {
3366 return return_nan(a, s);
3367 }
3368 if (a.cls == float_class_normal) {
3369 /* The largest float type (even though not supported by FloatParts)
3370 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3371 * still allows rounding to infinity, without allowing overflow
3372 * within the int32_t that backs FloatParts.exp.
3373 */
3374 n = MIN(MAX(n, -0x10000), 0x10000);
3375 a.exp += n;
3376 }
3377 return a;
3378 }
3379
3380 float16 float16_scalbn(float16 a, int n, float_status *status)
3381 {
3382 FloatParts pa = float16_unpack_canonical(a, status);
3383 FloatParts pr = scalbn_decomposed(pa, n, status);
3384 return float16_round_pack_canonical(pr, status);
3385 }
3386
3387 float32 float32_scalbn(float32 a, int n, float_status *status)
3388 {
3389 FloatParts pa = float32_unpack_canonical(a, status);
3390 FloatParts pr = scalbn_decomposed(pa, n, status);
3391 return float32_round_pack_canonical(pr, status);
3392 }
3393
3394 float64 float64_scalbn(float64 a, int n, float_status *status)
3395 {
3396 FloatParts pa = float64_unpack_canonical(a, status);
3397 FloatParts pr = scalbn_decomposed(pa, n, status);
3398 return float64_round_pack_canonical(pr, status);
3399 }
3400
3401 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3402 {
3403 FloatParts pa = bfloat16_unpack_canonical(a, status);
3404 FloatParts pr = scalbn_decomposed(pa, n, status);
3405 return bfloat16_round_pack_canonical(pr, status);
3406 }
3407
3408 /*
3409 * Square Root
3410 *
3411 * The old softfloat code did an approximation step before zeroing in
3412 * on the final result. However for simpleness we just compute the
3413 * square root by iterating down from the implicit bit to enough extra
3414 * bits to ensure we get a correctly rounded result.
3415 *
3416 * This does mean however the calculation is slower than before,
3417 * especially for 64 bit floats.
3418 */
3419
3420 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3421 {
3422 uint64_t a_frac, r_frac, s_frac;
3423 int bit, last_bit;
3424
3425 if (is_nan(a.cls)) {
3426 return return_nan(a, s);
3427 }
3428 if (a.cls == float_class_zero) {
3429 return a; /* sqrt(+-0) = +-0 */
3430 }
3431 if (a.sign) {
3432 s->float_exception_flags |= float_flag_invalid;
3433 return parts_default_nan(s);
3434 }
3435 if (a.cls == float_class_inf) {
3436 return a; /* sqrt(+inf) = +inf */
3437 }
3438
3439 assert(a.cls == float_class_normal);
3440
3441 /* We need two overflow bits at the top. Adding room for that is a
3442 * right shift. If the exponent is odd, we can discard the low bit
3443 * by multiplying the fraction by 2; that's a left shift. Combine
3444 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3445 */
3446 a_frac = a.frac >> (2 - (a.exp & 1));
3447 a.exp >>= 1;
3448
3449 /* Bit-by-bit computation of sqrt. */
3450 r_frac = 0;
3451 s_frac = 0;
3452
3453 /* Iterate from implicit bit down to the 3 extra bits to compute a
3454 * properly rounded result. Remember we've inserted two more bits
3455 * at the top, so these positions are two less.
3456 */
3457 bit = DECOMPOSED_BINARY_POINT - 2;
3458 last_bit = MAX(p->frac_shift - 4, 0);
3459 do {
3460 uint64_t q = 1ULL << bit;
3461 uint64_t t_frac = s_frac + q;
3462 if (t_frac <= a_frac) {
3463 s_frac = t_frac + q;
3464 a_frac -= t_frac;
3465 r_frac += q;
3466 }
3467 a_frac <<= 1;
3468 } while (--bit >= last_bit);
3469
3470 /* Undo the right shift done above. If there is any remaining
3471 * fraction, the result is inexact. Set the sticky bit.
3472 */
3473 a.frac = (r_frac << 2) + (a_frac != 0);
3474
3475 return a;
3476 }
3477
3478 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3479 {
3480 FloatParts pa = float16_unpack_canonical(a, status);
3481 FloatParts pr = sqrt_float(pa, status, &float16_params);
3482 return float16_round_pack_canonical(pr, status);
3483 }
3484
3485 static float32 QEMU_SOFTFLOAT_ATTR
3486 soft_f32_sqrt(float32 a, float_status *status)
3487 {
3488 FloatParts pa = float32_unpack_canonical(a, status);
3489 FloatParts pr = sqrt_float(pa, status, &float32_params);
3490 return float32_round_pack_canonical(pr, status);
3491 }
3492
3493 static float64 QEMU_SOFTFLOAT_ATTR
3494 soft_f64_sqrt(float64 a, float_status *status)
3495 {
3496 FloatParts pa = float64_unpack_canonical(a, status);
3497 FloatParts pr = sqrt_float(pa, status, &float64_params);
3498 return float64_round_pack_canonical(pr, status);
3499 }
3500
3501 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3502 {
3503 union_float32 ua, ur;
3504
3505 ua.s = xa;
3506 if (unlikely(!can_use_fpu(s))) {
3507 goto soft;
3508 }
3509
3510 float32_input_flush1(&ua.s, s);
3511 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3512 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3513 fpclassify(ua.h) == FP_ZERO) ||
3514 signbit(ua.h))) {
3515 goto soft;
3516 }
3517 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3518 float32_is_neg(ua.s))) {
3519 goto soft;
3520 }
3521 ur.h = sqrtf(ua.h);
3522 return ur.s;
3523
3524 soft:
3525 return soft_f32_sqrt(ua.s, s);
3526 }
3527
3528 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3529 {
3530 union_float64 ua, ur;
3531
3532 ua.s = xa;
3533 if (unlikely(!can_use_fpu(s))) {
3534 goto soft;
3535 }
3536
3537 float64_input_flush1(&ua.s, s);
3538 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3539 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3540 fpclassify(ua.h) == FP_ZERO) ||
3541 signbit(ua.h))) {
3542 goto soft;
3543 }
3544 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3545 float64_is_neg(ua.s))) {
3546 goto soft;
3547 }
3548 ur.h = sqrt(ua.h);
3549 return ur.s;
3550
3551 soft:
3552 return soft_f64_sqrt(ua.s, s);
3553 }
3554
3555 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3556 {
3557 FloatParts pa = bfloat16_unpack_canonical(a, status);
3558 FloatParts pr = sqrt_float(pa, status, &bfloat16_params);
3559 return bfloat16_round_pack_canonical(pr, status);
3560 }
3561
3562 /*----------------------------------------------------------------------------
3563 | The pattern for a default generated NaN.
3564 *----------------------------------------------------------------------------*/
3565
3566 float16 float16_default_nan(float_status *status)
3567 {
3568 FloatParts p = parts_default_nan(status);
3569 p.frac >>= float16_params.frac_shift;
3570 return float16_pack_raw(p);
3571 }
3572
3573 float32 float32_default_nan(float_status *status)
3574 {
3575 FloatParts p = parts_default_nan(status);
3576 p.frac >>= float32_params.frac_shift;
3577 return float32_pack_raw(p);
3578 }
3579
3580 float64 float64_default_nan(float_status *status)
3581 {
3582 FloatParts p = parts_default_nan(status);
3583 p.frac >>= float64_params.frac_shift;
3584 return float64_pack_raw(p);
3585 }
3586
3587 float128 float128_default_nan(float_status *status)
3588 {
3589 FloatParts p = parts_default_nan(status);
3590 float128 r;
3591
3592 /* Extrapolate from the choices made by parts_default_nan to fill
3593 * in the quad-floating format. If the low bit is set, assume we
3594 * want to set all non-snan bits.
3595 */
3596 r.low = -(p.frac & 1);
3597 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3598 r.high |= UINT64_C(0x7FFF000000000000);
3599 r.high |= (uint64_t)p.sign << 63;
3600
3601 return r;
3602 }
3603
3604 bfloat16 bfloat16_default_nan(float_status *status)
3605 {
3606 FloatParts p = parts_default_nan(status);
3607 p.frac >>= bfloat16_params.frac_shift;
3608 return bfloat16_pack_raw(p);
3609 }
3610
3611 /*----------------------------------------------------------------------------
3612 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3613 *----------------------------------------------------------------------------*/
3614
3615 float16 float16_silence_nan(float16 a, float_status *status)
3616 {
3617 FloatParts p = float16_unpack_raw(a);
3618 p.frac <<= float16_params.frac_shift;
3619 p = parts_silence_nan(p, status);
3620 p.frac >>= float16_params.frac_shift;
3621 return float16_pack_raw(p);
3622 }
3623
3624 float32 float32_silence_nan(float32 a, float_status *status)
3625 {
3626 FloatParts p = float32_unpack_raw(a);
3627 p.frac <<= float32_params.frac_shift;
3628 p = parts_silence_nan(p, status);
3629 p.frac >>= float32_params.frac_shift;
3630 return float32_pack_raw(p);
3631 }
3632
3633 float64 float64_silence_nan(float64 a, float_status *status)
3634 {
3635 FloatParts p = float64_unpack_raw(a);
3636 p.frac <<= float64_params.frac_shift;
3637 p = parts_silence_nan(p, status);
3638 p.frac >>= float64_params.frac_shift;
3639 return float64_pack_raw(p);
3640 }
3641
3642 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3643 {
3644 FloatParts p = bfloat16_unpack_raw(a);
3645 p.frac <<= bfloat16_params.frac_shift;
3646 p = parts_silence_nan(p, status);
3647 p.frac >>= bfloat16_params.frac_shift;
3648 return bfloat16_pack_raw(p);
3649 }
3650
3651 /*----------------------------------------------------------------------------
3652 | If `a' is denormal and we are in flush-to-zero mode then set the
3653 | input-denormal exception and return zero. Otherwise just return the value.
3654 *----------------------------------------------------------------------------*/
3655
3656 static bool parts_squash_denormal(FloatParts p, float_status *status)
3657 {
3658 if (p.exp == 0 && p.frac != 0) {
3659 float_raise(float_flag_input_denormal, status);
3660 return true;
3661 }
3662
3663 return false;
3664 }
3665
3666 float16 float16_squash_input_denormal(float16 a, float_status *status)
3667 {
3668 if (status->flush_inputs_to_zero) {
3669 FloatParts p = float16_unpack_raw(a);
3670 if (parts_squash_denormal(p, status)) {
3671 return float16_set_sign(float16_zero, p.sign);
3672 }
3673 }
3674 return a;
3675 }
3676
3677 float32 float32_squash_input_denormal(float32 a, float_status *status)
3678 {
3679 if (status->flush_inputs_to_zero) {
3680 FloatParts p = float32_unpack_raw(a);
3681 if (parts_squash_denormal(p, status)) {
3682 return float32_set_sign(float32_zero, p.sign);
3683 }
3684 }
3685 return a;
3686 }
3687
3688 float64 float64_squash_input_denormal(float64 a, float_status *status)
3689 {
3690 if (status->flush_inputs_to_zero) {
3691 FloatParts p = float64_unpack_raw(a);
3692 if (parts_squash_denormal(p, status)) {
3693 return float64_set_sign(float64_zero, p.sign);
3694 }
3695 }
3696 return a;
3697 }
3698
3699 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3700 {
3701 if (status->flush_inputs_to_zero) {
3702 FloatParts p = bfloat16_unpack_raw(a);
3703 if (parts_squash_denormal(p, status)) {
3704 return bfloat16_set_sign(bfloat16_zero, p.sign);
3705 }
3706 }
3707 return a;
3708 }
3709
3710 /*----------------------------------------------------------------------------
3711 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3712 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3713 | input. If `zSign' is 1, the input is negated before being converted to an
3714 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3715 | is simply rounded to an integer, with the inexact exception raised if the
3716 | input cannot be represented exactly as an integer. However, if the fixed-
3717 | point input is too large, the invalid exception is raised and the largest
3718 | positive or negative integer is returned.
3719 *----------------------------------------------------------------------------*/
3720
3721 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3722 float_status *status)
3723 {
3724 int8_t roundingMode;
3725 bool roundNearestEven;
3726 int8_t roundIncrement, roundBits;
3727 int32_t z;
3728
3729 roundingMode = status->float_rounding_mode;
3730 roundNearestEven = ( roundingMode == float_round_nearest_even );
3731 switch (roundingMode) {
3732 case float_round_nearest_even:
3733 case float_round_ties_away:
3734 roundIncrement = 0x40;
3735 break;
3736 case float_round_to_zero:
3737 roundIncrement = 0;
3738 break;
3739 case float_round_up:
3740 roundIncrement = zSign ? 0 : 0x7f;
3741 break;
3742 case float_round_down:
3743 roundIncrement = zSign ? 0x7f : 0;
3744 break;
3745 case float_round_to_odd:
3746 roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3747 break;
3748 default:
3749 abort();
3750 }
3751 roundBits = absZ & 0x7F;
3752 absZ = ( absZ + roundIncrement )>>7;
3753 if (!(roundBits ^ 0x40) && roundNearestEven) {
3754 absZ &= ~1;
3755 }
3756 z = absZ;
3757 if ( zSign ) z = - z;
3758 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3759 float_raise(float_flag_invalid, status);
3760 return zSign ? INT32_MIN : INT32_MAX;
3761 }
3762 if (roundBits) {
3763 status->float_exception_flags |= float_flag_inexact;
3764 }
3765 return z;
3766
3767 }
3768
3769 /*----------------------------------------------------------------------------
3770 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3771 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3772 | and returns the properly rounded 64-bit integer corresponding to the input.
3773 | If `zSign' is 1, the input is negated before being converted to an integer.
3774 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3775 | the inexact exception raised if the input cannot be represented exactly as
3776 | an integer. However, if the fixed-point input is too large, the invalid
3777 | exception is raised and the largest positive or negative integer is
3778 | returned.
3779 *----------------------------------------------------------------------------*/
3780
3781 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3782 float_status *status)
3783 {
3784 int8_t roundingMode;
3785 bool roundNearestEven, increment;
3786 int64_t z;
3787
3788 roundingMode = status->float_rounding_mode;
3789 roundNearestEven = ( roundingMode == float_round_nearest_even );
3790 switch (roundingMode) {
3791 case float_round_nearest_even:
3792 case float_round_ties_away:
3793 increment = ((int64_t) absZ1 < 0);
3794 break;
3795 case float_round_to_zero:
3796 increment = 0;
3797 break;
3798 case float_round_up:
3799 increment = !zSign && absZ1;
3800 break;
3801 case float_round_down:
3802 increment = zSign && absZ1;
3803 break;
3804 case float_round_to_odd:
3805 increment = !(absZ0 & 1) && absZ1;
3806 break;
3807 default:
3808 abort();
3809 }
3810 if ( increment ) {
3811 ++absZ0;
3812 if ( absZ0 == 0 ) goto overflow;
3813 if (!(absZ1 << 1) && roundNearestEven) {
3814 absZ0 &= ~1;
3815 }
3816 }
3817 z = absZ0;
3818 if ( zSign ) z = - z;
3819 if ( z && ( ( z < 0 ) ^ zSign ) ) {
3820 overflow:
3821 float_raise(float_flag_invalid, status);
3822 return zSign ? INT64_MIN : INT64_MAX;
3823 }
3824 if (absZ1) {
3825 status->float_exception_flags |= float_flag_inexact;
3826 }
3827 return z;
3828
3829 }
3830
3831 /*----------------------------------------------------------------------------
3832 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3833 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3834 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3835 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3836 | with the inexact exception raised if the input cannot be represented exactly
3837 | as an integer. However, if the fixed-point input is too large, the invalid
3838 | exception is raised and the largest unsigned integer is returned.
3839 *----------------------------------------------------------------------------*/
3840
3841 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3842 uint64_t absZ1, float_status *status)
3843 {
3844 int8_t roundingMode;
3845 bool roundNearestEven, increment;
3846
3847 roundingMode = status->float_rounding_mode;
3848 roundNearestEven = (roundingMode == float_round_nearest_even);
3849 switch (roundingMode) {
3850 case float_round_nearest_even:
3851 case float_round_ties_away:
3852 increment = ((int64_t)absZ1 < 0);
3853 break;
3854 case float_round_to_zero:
3855 increment = 0;
3856 break;
3857 case float_round_up:
3858 increment = !zSign && absZ1;
3859 break;
3860 case float_round_down:
3861 increment = zSign && absZ1;
3862 break;
3863 case float_round_to_odd:
3864 increment = !(absZ0 & 1) && absZ1;
3865 break;
3866 default:
3867 abort();
3868 }
3869 if (increment) {
3870 ++absZ0;
3871 if (absZ0 == 0) {
3872 float_raise(float_flag_invalid, status);
3873 return UINT64_MAX;
3874 }
3875 if (!(absZ1 << 1) && roundNearestEven) {
3876 absZ0 &= ~1;
3877 }
3878 }
3879
3880 if (zSign && absZ0) {
3881 float_raise(float_flag_invalid, status);
3882 return 0;
3883 }
3884
3885 if (absZ1) {
3886 status->float_exception_flags |= float_flag_inexact;
3887 }
3888 return absZ0;
3889 }
3890
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig'.  The
| significand is shifted left by (leading zero count - 8) and stored through
| `zSigPtr'; the matching exponent, 1 minus that shift, is stored through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3908
3909 /*----------------------------------------------------------------------------
3910 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3911 | and significand `zSig', and returns the proper single-precision floating-
3912 | point value corresponding to the abstract input. Ordinarily, the abstract
3913 | value is simply rounded and packed into the single-precision format, with
3914 | the inexact exception raised if the abstract input cannot be represented
3915 | exactly. However, if the abstract value is too large, the overflow and
3916 | inexact exceptions are raised and an infinity or maximal finite value is
3917 | returned. If the abstract value is too small, the input value is rounded to
3918 | a subnormal number, and the underflow and inexact exceptions are raised if
3919 | the abstract input cannot be represented exactly as a subnormal single-
3920 | precision floating-point number.
3921 | The input significand `zSig' has its binary point between bits 30
3922 | and 29, which is 7 bits to the left of the usual location. This shifted
3923 | significand must be normalized or smaller. If `zSig' is not normalized,
3924 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3925 | and it must not require rounding. In the usual case that `zSig' is
3926 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3927 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3928 | Binary Floating-Point Arithmetic.
3929 *----------------------------------------------------------------------------*/
3930
3931 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3932 float_status *status)
3933 {
3934 int8_t roundingMode;
3935 bool roundNearestEven;
3936 int8_t roundIncrement, roundBits;
3937 bool isTiny;
3938
3939 roundingMode = status->float_rounding_mode;
3940 roundNearestEven = ( roundingMode == float_round_nearest_even );
3941 switch (roundingMode) {
3942 case float_round_nearest_even:
3943 case float_round_ties_away:
3944 roundIncrement = 0x40;
3945 break;
3946 case float_round_to_zero:
3947 roundIncrement = 0;
3948 break;
3949 case float_round_up:
3950 roundIncrement = zSign ? 0 : 0x7f;
3951 break;
3952 case float_round_down:
3953 roundIncrement = zSign ? 0x7f : 0;
3954 break;
3955 case float_round_to_odd:
3956 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3957 break;
3958 default:
3959 abort();
3960 break;
3961 }
3962 roundBits = zSig & 0x7F;
3963 if ( 0xFD <= (uint16_t) zExp ) {
3964 if ( ( 0xFD < zExp )
3965 || ( ( zExp == 0xFD )
3966 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3967 ) {
3968 bool overflow_to_inf = roundingMode != float_round_to_odd &&
3969 roundIncrement != 0;
3970 float_raise(float_flag_overflow | float_flag_inexact, status);
3971 return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3972 }
3973 if ( zExp < 0 ) {
3974 if (status->flush_to_zero) {
3975 float_raise(float_flag_output_denormal, status);
3976 return packFloat32(zSign, 0, 0);
3977 }
3978 isTiny = status->tininess_before_rounding
3979 || (zExp < -1)
3980 || (zSig + roundIncrement < 0x80000000);
3981 shift32RightJamming( zSig, - zExp, &zSig );
3982 zExp = 0;
3983 roundBits = zSig & 0x7F;
3984 if (isTiny && roundBits) {
3985 float_raise(float_flag_underflow, status);
3986 }
3987 if (roundingMode == float_round_to_odd) {
3988 /*
3989 * For round-to-odd case, the roundIncrement depends on
3990 * zSig which just changed.
3991 */
3992 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3993 }
3994 }
3995 }
3996 if (roundBits) {
3997 status->float_exception_flags |= float_flag_inexact;
3998 }
3999 zSig = ( zSig + roundIncrement )>>7;
4000 if (!(roundBits ^ 0x40) && roundNearestEven) {
4001 zSig &= ~1;
4002 }
4003 if ( zSig == 0 ) zExp = 0;
4004 return packFloat32( zSign, zExp, zSig );
4005
4006 }
4007
4008 /*----------------------------------------------------------------------------
4009 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4010 | and significand `zSig', and returns the proper single-precision floating-
4011 | point value corresponding to the abstract input. This routine is just like
4012 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4013 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4014 | floating-point exponent.
4015 *----------------------------------------------------------------------------*/
4016
4017 static float32
4018 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4019 float_status *status)
4020 {
4021 int8_t shiftCount;
4022
4023 shiftCount = clz32(zSig) - 1;
4024 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4025 status);
4026
4027 }
4028
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig'.  The
| significand is shifted left by (leading zero count - 11) and stored through
| `zSigPtr'; the matching exponent, 1 minus that shift, is stored through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4046
4047 /*----------------------------------------------------------------------------
4048 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4049 | double-precision floating-point value, returning the result. After being
4050 | shifted into the proper positions, the three fields are simply added
4051 | together to form the result. This means that any integer portion of `zSig'
4052 | will be added into the exponent. Since a properly normalized significand
4053 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4054 | than the desired result exponent whenever `zSig' is a complete, normalized
4055 | significand.
4056 *----------------------------------------------------------------------------*/
4057
4058 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4059 {
4060
4061 return make_float64(
4062 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4063
4064 }
4065
4066 /*----------------------------------------------------------------------------
4067 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4068 | and significand `zSig', and returns the proper double-precision floating-
4069 | point value corresponding to the abstract input. Ordinarily, the abstract
4070 | value is simply rounded and packed into the double-precision format, with
4071 | the inexact exception raised if the abstract input cannot be represented
4072 | exactly. However, if the abstract value is too large, the overflow and
4073 | inexact exceptions are raised and an infinity or maximal finite value is
4074 | returned. If the abstract value is too small, the input value is rounded to
4075 | a subnormal number, and the underflow and inexact exceptions are raised if
4076 | the abstract input cannot be represented exactly as a subnormal double-
4077 | precision floating-point number.
4078 | The input significand `zSig' has its binary point between bits 62
4079 | and 61, which is 10 bits to the left of the usual location. This shifted
4080 | significand must be normalized or smaller. If `zSig' is not normalized,
4081 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4082 | and it must not require rounding. In the usual case that `zSig' is
4083 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4084 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4085 | Binary Floating-Point Arithmetic.
4086 *----------------------------------------------------------------------------*/
4087
4088 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4089 float_status *status)
4090 {
4091 int8_t roundingMode;
4092 bool roundNearestEven;
4093 int roundIncrement, roundBits;
4094 bool isTiny;
4095
4096 roundingMode = status->float_rounding_mode;
4097 roundNearestEven = ( roundingMode == float_round_nearest_even );
4098 switch (roundingMode) {
4099 case float_round_nearest_even:
4100 case float_round_ties_away:
4101 roundIncrement = 0x200;
4102 break;
4103 case float_round_to_zero:
4104 roundIncrement = 0;
4105 break;
4106 case float_round_up:
4107 roundIncrement = zSign ? 0 : 0x3ff;
4108 break;
4109 case float_round_down:
4110 roundIncrement = zSign ? 0x3ff : 0;
4111 break;
4112 case float_round_to_odd:
4113 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4114 break;
4115 default:
4116 abort();
4117 }
4118 roundBits = zSig & 0x3FF;
4119 if ( 0x7FD <= (uint16_t) zExp ) {
4120 if ( ( 0x7FD < zExp )
4121 || ( ( zExp == 0x7FD )
4122 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4123 ) {
4124 bool overflow_to_inf = roundingMode != float_round_to_odd &&
4125 roundIncrement != 0;
4126 float_raise(float_flag_overflow | float_flag_inexact, status);
4127 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4128 }
4129 if ( zExp < 0 ) {
4130 if (status->flush_to_zero) {
4131 float_raise(float_flag_output_denormal, status);
4132 return packFloat64(zSign, 0, 0);
4133 }
4134 isTiny = status->tininess_before_rounding
4135 || (zExp < -1)
4136 || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4137 shift64RightJamming( zSig, - zExp, &zSig );
4138 zExp = 0;
4139 roundBits = zSig & 0x3FF;
4140 if (isTiny && roundBits) {
4141 float_raise(float_flag_underflow, status);
4142 }
4143 if (roundingMode == float_round_to_odd) {
4144 /*
4145 * For round-to-odd case, the roundIncrement depends on
4146 * zSig which just changed.
4147 */
4148 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4149 }
4150 }
4151 }
4152 if (roundBits) {
4153 status->float_exception_flags |= float_flag_inexact;
4154 }
4155 zSig = ( zSig + roundIncrement )>>10;
4156 if (!(roundBits ^ 0x200) && roundNearestEven) {
4157 zSig &= ~1;
4158 }
4159 if ( zSig == 0 ) zExp = 0;
4160 return packFloat64( zSign, zExp, zSig );
4161
4162 }
4163
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and significand `zSig', and returns the proper double-precision floating-
4167 | point value corresponding to the abstract input. This routine is just like
4168 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4169 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4170 | floating-point exponent.
4171 *----------------------------------------------------------------------------*/
4172
4173 static float64
4174 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4175 float_status *status)
4176 {
4177 int8_t shiftCount;
4178
4179 shiftCount = clz64(zSig) - 1;
4180 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4181 status);
4182
4183 }
4184
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision value.  The significand is shifted left until its leading
| set bit is bit 63; the shifted significand is written to `*zSigPtr' and
| the matching exponent (1 minus the shift distance) to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
4201
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4205 | and returns the proper extended double-precision floating-point value
4206 | corresponding to the abstract input. Ordinarily, the abstract value is
4207 | rounded and packed into the extended double-precision format, with the
4208 | inexact exception raised if the abstract input cannot be represented
4209 | exactly. However, if the abstract value is too large, the overflow and
4210 | inexact exceptions are raised and an infinity or maximal finite value is
4211 | returned. If the abstract value is too small, the input value is rounded to
4212 | a subnormal number, and the underflow and inexact exceptions are raised if
4213 | the abstract input cannot be represented exactly as a subnormal extended
4214 | double-precision floating-point number.
4215 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4216 | number of bits as single or double precision, respectively. Otherwise, the
4217 | result is rounded to the full precision of the extended double-precision
4218 | format.
4219 | The input significand must be normalized or smaller. If the input
4220 | significand is not normalized, `zExp' must be 0; in that case, the result
4221 | returned is a subnormal number, and it must not require rounding. The
4222 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4223 | Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4225
/*
 * Control-flow overview:
 *  - roundingPrecision 64 or 32: the result is rounded inside zSig0 using a
 *    roundMask/roundIncrement pair; this path returns before `precision80'.
 *  - any other roundingPrecision: jump to `precision80', where zSig1 holds
 *    the bits to be rounded away and `increment' says whether zSig0 is
 *    bumped by one.
 *  - the `overflow' label sits inside the precision80 range check and is
 *    also reached by a goto from the reduced-precision path.
 * Rounding modes without a case in the switches below (there is no
 * float_round_to_odd case in this function) end in abort().
 */
4226 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4227 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4228 float_status *status)
4229 {
4230 int8_t roundingMode;
4231 bool roundNearestEven, increment, isTiny;
4232 int64_t roundIncrement, roundMask, roundBits;
4233
4234 roundingMode = status->float_rounding_mode;
4235 roundNearestEven = ( roundingMode == float_round_nearest_even );
4236 if ( roundingPrecision == 80 ) goto precision80;
/*
 * roundMask selects the low zSig0 bits dropped at the requested precision;
 * roundIncrement starts out as half of (roundMask + 1).
 */
4237 if ( roundingPrecision == 64 ) {
4238 roundIncrement = UINT64_C(0x0000000000000400);
4239 roundMask = UINT64_C(0x00000000000007FF);
4240 }
4241 else if ( roundingPrecision == 32 ) {
4242 roundIncrement = UINT64_C(0x0000008000000000);
4243 roundMask = UINT64_C(0x000000FFFFFFFFFF);
4244 }
4245 else {
4246 goto precision80;
4247 }
/* Fold a nonzero zSig1 into bit 0 of zSig0 as a sticky bit. */
4248 zSig0 |= ( zSig1 != 0 );
/*
 * The nearest modes keep the half-way increment chosen above; the directed
 * modes replace it with either 0 or the full mask depending on the sign.
 */
4249 switch (roundingMode) {
4250 case float_round_nearest_even:
4251 case float_round_ties_away:
4252 break;
4253 case float_round_to_zero:
4254 roundIncrement = 0;
4255 break;
4256 case float_round_up:
4257 roundIncrement = zSign ? 0 : roundMask;
4258 break;
4259 case float_round_down:
4260 roundIncrement = zSign ? roundMask : 0;
4261 break;
4262 default:
4263 abort();
4264 }
4265 roundBits = zSig0 & roundMask;
/*
 * Because of the unsigned cast this test is true both for zExp <= 0
 * (zExp - 1 wraps to a huge value) and for zExp >= 0x7FFE.
 */
4266 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
/* (zSig0 + roundIncrement < zSig0) detects a carry out of bit 63. */
4267 if ( ( 0x7FFE < zExp )
4268 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4269 ) {
4270 goto overflow;
4271 }
4272 if ( zExp <= 0 ) {
4273 if (status->flush_to_zero) {
4274 float_raise(float_flag_output_denormal, status);
4275 return packFloatx80(zSign, 0, 0);
4276 }
/*
 * Tiny when detection is configured before rounding, when zExp is
 * negative, or when adding the increment does not carry out of bit 63.
 */
4277 isTiny = status->tininess_before_rounding
4278 || (zExp < 0 )
4279 || (zSig0 <= zSig0 + roundIncrement);
4280 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4281 zExp = 0;
4282 roundBits = zSig0 & roundMask;
4283 if (isTiny && roundBits) {
4284 float_raise(float_flag_underflow, status);
4285 }
4286 if (roundBits) {
4287 status->float_exception_flags |= float_flag_inexact;
4288 }
4289 zSig0 += roundIncrement;
/* Rounding set bit 63, so the exponent field becomes 1. */
4290 if ( (int64_t) zSig0 < 0 ) zExp = 1;
/*
 * From here roundIncrement is reused as the value of the lowest kept bit.
 * On an exact tie in nearest-even mode that bit is added to the mask so
 * that it is cleared together with the discarded bits.
 */
4291 roundIncrement = roundMask + 1;
4292 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4293 roundMask |= roundIncrement;
4294 }
4295 zSig0 &= ~ roundMask;
4296 return packFloatx80( zSign, zExp, zSig0 );
4297 }
4298 }
/* In-range exponent for the reduced-precision path. */
4299 if (roundBits) {
4300 status->float_exception_flags |= float_flag_inexact;
4301 }
4302 zSig0 += roundIncrement;
/* The addition wrapped: bump the exponent and restart at 0x8000... */
4303 if ( zSig0 < roundIncrement ) {
4304 ++zExp;
4305 zSig0 = UINT64_C(0x8000000000000000);
4306 }
/* Same tie-to-even mask widening as in the subnormal branch above. */
4307 roundIncrement = roundMask + 1;
4308 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4309 roundMask |= roundIncrement;
4310 }
4311 zSig0 &= ~ roundMask;
4312 if ( zSig0 == 0 ) zExp = 0;
4313 return packFloatx80( zSign, zExp, zSig0 );
/*
 * Full-precision path: zSig1 holds the bits below the result.  In the
 * nearest modes `increment' is the top bit of zSig1; in the directed modes
 * it is set when zSig1 is nonzero and the sign matches the direction.
 */
4314 precision80:
4315 switch (roundingMode) {
4316 case float_round_nearest_even:
4317 case float_round_ties_away:
4318 increment = ((int64_t)zSig1 < 0);
4319 break;
4320 case float_round_to_zero:
4321 increment = 0;
4322 break;
4323 case float_round_up:
4324 increment = !zSign && zSig1;
4325 break;
4326 case float_round_down:
4327 increment = zSign && zSig1;
4328 break;
4329 default:
4330 abort();
4331 }
/* Same out-of-range test as above: zExp <= 0 or zExp >= 0x7FFE. */
4332 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4333 if ( ( 0x7FFE < zExp )
4334 || ( ( zExp == 0x7FFE )
4335 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4336 && increment
4337 )
4338 ) {
/*
 * On this path roundMask is zeroed so that ~roundMask below is all ones.
 * When entered through `goto overflow', roundMask still holds the
 * reduced-precision mask.
 */
4339 roundMask = 0;
4340 overflow:
4341 float_raise(float_flag_overflow | float_flag_inexact, status);
/*
 * Round-to-zero, and directed rounding away from the result's sign, return
 * exponent 0x7FFE with significand ~roundMask; everything else returns the
 * infinity constants.
 */
4342 if ( ( roundingMode == float_round_to_zero )
4343 || ( zSign && ( roundingMode == float_round_up ) )
4344 || ( ! zSign && ( roundingMode == float_round_down ) )
4345 ) {
4346 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4347 }
4348 return packFloatx80(zSign,
4349 floatx80_infinity_high,
4350 floatx80_infinity_low);
4351 }
4352 if ( zExp <= 0 ) {
/* NOTE(review): unlike the reduced-precision path, flush_to_zero is not
 * consulted here. */
4353 isTiny = status->tininess_before_rounding
4354 || (zExp < 0)
4355 || !increment
4356 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4357 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4358 zExp = 0;
4359 if (isTiny && zSig1) {
4360 float_raise(float_flag_underflow, status);
4361 }
4362 if (zSig1) {
4363 status->float_exception_flags |= float_flag_inexact;
4364 }
/* zSig1 changed in the shift, so the increment decision is recomputed. */
4365 switch (roundingMode) {
4366 case float_round_nearest_even:
4367 case float_round_ties_away:
4368 increment = ((int64_t)zSig1 < 0);
4369 break;
4370 case float_round_to_zero:
4371 increment = 0;
4372 break;
4373 case float_round_up:
4374 increment = !zSign && zSig1;
4375 break;
4376 case float_round_down:
4377 increment = zSign && zSig1;
4378 break;
4379 default:
4380 abort();
4381 }
4382 if ( increment ) {
4383 ++zSig0;
/*
 * (zSig1 << 1) == 0 while incrementing in nearest-even mode means zSig1 is
 * exactly the half-way value: clear bit 0 to round to even.
 */
4384 if (!(zSig1 << 1) && roundNearestEven) {
4385 zSig0 &= ~1;
4386 }
/* The increment set bit 63, so the exponent field becomes 1. */
4387 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4388 }
4389 return packFloatx80( zSign, zExp, zSig0 );
4390 }
4391 }
/* In-range exponent for the full-precision path. */
4392 if (zSig1) {
4393 status->float_exception_flags |= float_flag_inexact;
4394 }
4395 if ( increment ) {
4396 ++zSig0;
/* zSig0 wrapped to 0: bump the exponent and restart at 0x8000... */
4397 if ( zSig0 == 0 ) {
4398 ++zExp;
4399 zSig0 = UINT64_C(0x8000000000000000);
4400 }
4401 else {
/* Exact tie in nearest-even mode: clear bit 0. */
4402 if (!(zSig1 << 1) && roundNearestEven) {
4403 zSig0 &= ~1;
4404 }
4405 }
4406 }
4407 else {
/* A zero significand is packed with a zero exponent. */
4408 if ( zSig0 == 0 ) zExp = 0;
4409 }
4410 return packFloatx80( zSign, zExp, zSig0 );
4411
4412 }
4413
4414 /*----------------------------------------------------------------------------
4415 | Takes an abstract floating-point value having sign `zSign', exponent
4416 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4417 | and returns the proper extended double-precision floating-point value
4418 | corresponding to the abstract input. This routine is just like
4419 | `roundAndPackFloatx80' except that the input significand does not have to be
4420 | normalized.
4421 *----------------------------------------------------------------------------*/
4422
4423 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4424 bool zSign, int32_t zExp,
4425 uint64_t zSig0, uint64_t zSig1,
4426 float_status *status)
4427 {
4428 int8_t shiftCount;
4429
4430 if ( zSig0 == 0 ) {
4431 zSig0 = zSig1;
4432 zSig1 = 0;
4433 zExp -= 64;
4434 }
4435 shiftCount = clz64(zSig0);
4436 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4437 zExp -= shiftCount;
4438 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4439 zSig0, zSig1, status);
4440
4441 }
4442
4443 /*----------------------------------------------------------------------------
4444 | Returns the least-significant 64 fraction bits of the quadruple-precision
4445 | floating-point value `a'.
4446 *----------------------------------------------------------------------------*/
4447
4448 static inline uint64_t extractFloat128Frac1( float128 a )
4449 {
4450
4451 return a.low;
4452
4453 }
4454
4455 /*----------------------------------------------------------------------------
4456 | Returns the most-significant 48 fraction bits of the quadruple-precision
4457 | floating-point value `a'.
4458 *----------------------------------------------------------------------------*/
4459
4460 static inline uint64_t extractFloat128Frac0( float128 a )
4461 {
4462
4463 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4464
4465 }
4466
4467 /*----------------------------------------------------------------------------
4468 | Returns the exponent bits of the quadruple-precision floating-point value
4469 | `a'.
4470 *----------------------------------------------------------------------------*/
4471
4472 static inline int32_t extractFloat128Exp( float128 a )
4473 {
4474
4475 return ( a.high>>48 ) & 0x7FFF;
4476
4477 }
4478
4479 /*----------------------------------------------------------------------------
4480 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4481 *----------------------------------------------------------------------------*/
4482
4483 static inline bool extractFloat128Sign(float128 a)
4484 {
4485 return a.high >> 63;
4486 }
4487
/*----------------------------------------------------------------------------
| Normalizes the denormalized 128-bit significand `aSig0':`aSig1' of a
| subnormal quadruple-precision value.  The leading set bit is moved to bit
| 48 of the high word.  The upper 49 bits of the result go to `*zSig0Ptr',
| the lower 64 bits to `*zSig1Ptr', and the matching exponent to
| `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading bit is in the high word: a plain 128-bit left shift. */
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /* Only the low word is populated; treat it as already moved up by 64. */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Leading bit is above bit 48: split the word across both halves. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
4528
4529 /*----------------------------------------------------------------------------
4530 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4531 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4532 | floating-point value, returning the result. After being shifted into the
4533 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4534 | added together to form the most significant 32 bits of the result. This
4535 | means that any integer portion of `zSig0' will be added into the exponent.
4536 | Since a properly normalized significand will have an integer portion equal
4537 | to 1, the `zExp' input should be 1 less than the desired result exponent
4538 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4539 | significand.
4540 *----------------------------------------------------------------------------*/
4541
4542 static inline float128
4543 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4544 {
4545 float128 z;
4546
4547 z.low = zSig1;
4548 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4549 return z;
4550 }
4551
4552 /*----------------------------------------------------------------------------
4553 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4554 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4555 | and `zSig2', and returns the proper quadruple-precision floating-point value
4556 | corresponding to the abstract input. Ordinarily, the abstract value is
4557 | simply rounded and packed into the quadruple-precision format, with the
4558 | inexact exception raised if the abstract input cannot be represented
4559 | exactly. However, if the abstract value is too large, the overflow and
4560 | inexact exceptions are raised and an infinity or maximal finite value is
4561 | returned. If the abstract value is too small, the input value is rounded to
4562 | a subnormal number, and the underflow and inexact exceptions are raised if
4563 | the abstract input cannot be represented exactly as a subnormal quadruple-
4564 | precision floating-point number.
4565 | The input significand must be normalized or smaller. If the input
4566 | significand is not normalized, `zExp' must be 0; in that case, the result
4567 | returned is a subnormal number, and it must not require rounding. In the
4568 | usual case that the input significand is normalized, `zExp' must be 1 less
4569 | than the ``true'' floating-point exponent. The handling of underflow and
4570 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572
4573 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4574 uint64_t zSig0, uint64_t zSig1,
4575 uint64_t zSig2, float_status *status)
4576 {
4577 int8_t roundingMode;
4578 bool roundNearestEven, increment, isTiny;
4579
4580 roundingMode = status->float_rounding_mode;
4581 roundNearestEven = ( roundingMode == float_round_nearest_even );
4582 switch (roundingMode) {
4583 case float_round_nearest_even:
4584 case float_round_ties_away:
4585 increment = ((int64_t)zSig2 < 0);
4586 break;
4587 case float_round_to_zero:
4588 increment = 0;
4589 break;
4590 case float_round_up:
4591 increment = !zSign && zSig2;
4592 break;
4593 case float_round_down:
4594 increment = zSign && zSig2;
4595 break;
4596 case float_round_to_odd:
4597 increment = !(zSig1 & 0x1) && zSig2;
4598 break;
4599 default:
4600 abort();
4601 }
4602 if ( 0x7FFD <= (uint32_t) zExp ) {
4603 if ( ( 0x7FFD < zExp )
4604 || ( ( zExp == 0x7FFD )
4605 && eq128(
4606 UINT64_C(0x0001FFFFFFFFFFFF),
4607 UINT64_C(0xFFFFFFFFFFFFFFFF),
4608 zSig0,
4609 zSig1
4610 )
4611 && increment
4612 )
4613 ) {
4614 float_raise(float_flag_overflow | float_flag_inexact, status);
4615 if ( ( roundingMode == float_round_to_zero )
4616 || ( zSign && ( roundingMode == float_round_up ) )
4617 || ( ! zSign && ( roundingMode == float_round_down ) )
4618 || (roundingMode == float_round_to_odd)
4619 ) {
4620 return
4621 packFloat128(
4622 zSign,
4623 0x7FFE,
4624 UINT64_C(0x0000FFFFFFFFFFFF),
4625 UINT64_C(0xFFFFFFFFFFFFFFFF)
4626 );
4627 }
4628 return packFloat128( zSign, 0x7FFF, 0, 0 );
4629 }
4630 if ( zExp < 0 ) {
4631 if (status->flush_to_zero) {
4632 float_raise(float_flag_output_denormal, status);
4633 return packFloat128(zSign, 0, 0, 0);
4634 }
4635 isTiny = status->tininess_before_rounding
4636 || (zExp < -1)
4637 || !increment
4638 || lt128(zSig0, zSig1,
4639 UINT64_C(0x0001FFFFFFFFFFFF),
4640 UINT64_C(0xFFFFFFFFFFFFFFFF));
4641 shift128ExtraRightJamming(
4642 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4643 zExp = 0;
4644 if (isTiny && zSig2) {
4645 float_raise(float_flag_underflow, status);
4646 }
4647 switch (roundingMode) {
4648 case float_round_nearest_even:
4649 case float_round_ties_away:
4650 increment = ((int64_t)zSig2 < 0);
4651 break;
4652 case float_round_to_zero:
4653 increment = 0;
4654 break;
4655 case float_round_up:
4656 increment = !zSign && zSig2;
4657 break;
4658 case float_round_down:
4659 increment = zSign && zSig2;
4660 break;
4661 case float_round_to_odd:
4662 increment = !(zSig1 & 0x1) && zSig2;
4663 break;
4664 default:
4665 abort();
4666 }
4667 }
4668 }
4669 if (zSig2) {
4670 status->float_exception_flags |= float_flag_inexact;
4671 }
4672 if ( increment ) {
4673 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4674 if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4675 zSig1 &= ~1;
4676 }
4677 }
4678 else {
4679 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4680 }
4681 return packFloat128( zSign, zExp, zSig0, zSig1 );
4682
4683 }
4684
4685 /*----------------------------------------------------------------------------
4686 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4687 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4688 | returns the proper quadruple-precision floating-point value corresponding
4689 | to the abstract input. This routine is just like `roundAndPackFloat128'
4690 | except that the input significand has fewer bits and does not have to be
4691 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4692 | point exponent.
4693 *----------------------------------------------------------------------------*/
4694
4695 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4696 uint64_t zSig0, uint64_t zSig1,
4697 float_status *status)
4698 {
4699 int8_t shiftCount;
4700 uint64_t zSig2;
4701
4702 if ( zSig0 == 0 ) {
4703 zSig0 = zSig1;
4704 zSig1 = 0;
4705 zExp -= 64;
4706 }
4707 shiftCount = clz64(zSig0) - 15;
4708 if ( 0 <= shiftCount ) {
4709 zSig2 = 0;
4710 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4711 }
4712 else {
4713 shift128ExtraRightJamming(
4714 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4715 }
4716 zExp -= shiftCount;
4717 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4718
4719 }
4720
4721
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the 32-bit two's complement integer `a'
4724 | to the extended double-precision floating-point format. The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4726 | Arithmetic.
4727 *----------------------------------------------------------------------------*/
4728
4729 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4730 {
4731 bool zSign;
4732 uint32_t absA;
4733 int8_t shiftCount;
4734 uint64_t zSig;
4735
4736 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4737 zSign = ( a < 0 );
4738 absA = zSign ? - a : a;
4739 shiftCount = clz32(absA) + 32;
4740 zSig = absA;
4741 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4742
4743 }
4744
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of converting the 32-bit two's complement integer `a' to
4747 | the quadruple-precision floating-point format. The conversion is performed
4748 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4750
4751 float128 int32_to_float128(int32_t a, float_status *status)
4752 {
4753 bool zSign;
4754 uint32_t absA;
4755 int8_t shiftCount;
4756 uint64_t zSig0;
4757
4758 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4759 zSign = ( a < 0 );
4760 absA = zSign ? - a : a;
4761 shiftCount = clz32(absA) + 17;
4762 zSig0 = absA;
4763 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4764
4765 }
4766
4767 /*----------------------------------------------------------------------------
4768 | Returns the result of converting the 64-bit two's complement integer `a'
4769 | to the extended double-precision floating-point format. The conversion
4770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4771 | Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773
4774 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4775 {
4776 bool zSign;
4777 uint64_t absA;
4778 int8_t shiftCount;
4779
4780 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4781 zSign = ( a < 0 );
4782 absA = zSign ? - a : a;
4783 shiftCount = clz64(absA);
4784 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4785
4786 }
4787
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the 64-bit two's complement integer `a' to
4790 | the quadruple-precision floating-point format. The conversion is performed
4791 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4793
4794 float128 int64_to_float128(int64_t a, float_status *status)
4795 {
4796 bool zSign;
4797 uint64_t absA;
4798 int8_t shiftCount;
4799 int32_t zExp;
4800 uint64_t zSig0, zSig1;
4801
4802 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4803 zSign = ( a < 0 );
4804 absA = zSign ? - a : a;
4805 shiftCount = clz64(absA) + 49;
4806 zExp = 0x406E - shiftCount;
4807 if ( 64 <= shiftCount ) {
4808 zSig1 = 0;
4809 zSig0 = absA;
4810 shiftCount -= 64;
4811 }
4812 else {
4813 zSig1 = absA;
4814 zSig0 = 0;
4815 }
4816 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4817 return packFloat128( zSign, zExp, zSig0, zSig1 );
4818
4819 }
4820
4821 /*----------------------------------------------------------------------------
4822 | Returns the result of converting the 64-bit unsigned integer `a'
4823 | to the quadruple-precision floating-point format. The conversion is performed
4824 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826
4827 float128 uint64_to_float128(uint64_t a, float_status *status)
4828 {
4829 if (a == 0) {
4830 return float128_zero;
4831 }
4832 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4833 }
4834
4835 /*----------------------------------------------------------------------------
4836 | Returns the result of converting the single-precision floating-point value
4837 | `a' to the extended double-precision floating-point format. The conversion
4838 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4839 | Arithmetic.
4840 *----------------------------------------------------------------------------*/
4841
4842 floatx80 float32_to_floatx80(float32 a, float_status *status)
4843 {
4844 bool aSign;
4845 int aExp;
4846 uint32_t aSig;
4847
4848 a = float32_squash_input_denormal(a, status);
4849 aSig = extractFloat32Frac( a );
4850 aExp = extractFloat32Exp( a );
4851 aSign = extractFloat32Sign( a );
4852 if ( aExp == 0xFF ) {
4853 if (aSig) {
4854 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4855 status);
4856 return floatx80_silence_nan(res, status);
4857 }
4858 return packFloatx80(aSign,
4859 floatx80_infinity_high,
4860 floatx80_infinity_low);
4861 }
4862 if ( aExp == 0 ) {
4863 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4864 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4865 }
4866 aSig |= 0x00800000;
4867 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4868
4869 }
4870
4871 /*----------------------------------------------------------------------------
4872 | Returns the result of converting the single-precision floating-point value
4873 | `a' to the double-precision floating-point format. The conversion is
4874 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4875 | Arithmetic.
4876 *----------------------------------------------------------------------------*/
4877
4878 float128 float32_to_float128(float32 a, float_status *status)
4879 {
4880 bool aSign;
4881 int aExp;
4882 uint32_t aSig;
4883
4884 a = float32_squash_input_denormal(a, status);
4885 aSig = extractFloat32Frac( a );
4886 aExp = extractFloat32Exp( a );
4887 aSign = extractFloat32Sign( a );
4888 if ( aExp == 0xFF ) {
4889 if (aSig) {
4890 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4891 }
4892 return packFloat128( aSign, 0x7FFF, 0, 0 );
4893 }
4894 if ( aExp == 0 ) {
4895 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4896 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4897 --aExp;
4898 }
4899 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4900
4901 }
4902
4903 /*----------------------------------------------------------------------------
4904 | Returns the remainder of the single-precision floating-point value `a'
4905 | with respect to the corresponding value `b'. The operation is performed
4906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4907 *----------------------------------------------------------------------------*/
4908
/*
 * Outline: special inputs (NaN, infinity, zero) are dispatched first; both
 * significands are then given their implicit leading bit and the remainder
 * is computed on integers, using 32-bit arithmetic when the exponent
 * difference is below 32 and 64-bit quotient estimates otherwise.  The
 * final loop and the `sigMean' test choose between the last non-negative
 * partial remainder and the first negative one.
 */
4909 float32 float32_rem(float32 a, float32 b, float_status *status)
4910 {
4911 bool aSign, zSign;
4912 int aExp, bExp, expDiff;
4913 uint32_t aSig, bSig;
4914 uint32_t q;
4915 uint64_t aSig64, bSig64, q64;
4916 uint32_t alternateASig;
4917 int32_t sigMean;
4918 a = float32_squash_input_denormal(a, status);
4919 b = float32_squash_input_denormal(b, status);
4920
4921 aSig = extractFloat32Frac( a );
4922 aExp = extractFloat32Exp( a );
4923 aSign = extractFloat32Sign( a );
4924 bSig = extractFloat32Frac( b );
4925 bExp = extractFloat32Exp( b );
/*
 * a has exponent 0xFF (infinity or NaN): propagate a NaN if either operand
 * is one, otherwise raise invalid and return the default NaN.
 */
4926 if ( aExp == 0xFF ) {
4927 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4928 return propagateFloat32NaN(a, b, status);
4929 }
4930 float_raise(float_flag_invalid, status);
4931 return float32_default_nan(status);
4932 }
/* b has exponent 0xFF: a NaN is propagated, an infinity returns a as is. */
4933 if ( bExp == 0xFF ) {
4934 if (bSig) {
4935 return propagateFloat32NaN(a, b, status);
4936 }
4937 return a;
4938 }
/* b is zero: invalid.  A subnormal b is normalized first. */
4939 if ( bExp == 0 ) {
4940 if ( bSig == 0 ) {
4941 float_raise(float_flag_invalid, status);
4942 return float32_default_nan(status);
4943 }
4944 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4945 }
/* a is zero: returned unchanged.  A subnormal a is normalized first. */
4946 if ( aExp == 0 ) {
4947 if ( aSig == 0 ) return a;
4948 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4949 }
4950 expDiff = aExp - bExp;
/* Make the implicit leading bit (bit 23) explicit in both significands. */
4951 aSig |= 0x00800000;
4952 bSig |= 0x00800000;
/* Small exponent difference: everything fits in 32/64-bit integer ops. */
4953 if ( expDiff < 32 ) {
4954 aSig <<= 8;
4955 bSig <<= 8;
4956 if ( expDiff < 0 ) {
/* Exponent difference below -1: a is returned unchanged. */
4957 if ( expDiff < -1 ) return a;
4958 aSig >>= 1;
4959 }
/* First quotient bit: subtract b once if it fits. */
4960 q = ( bSig <= aSig );
4961 if ( q ) aSig -= bSig;
4962 if ( 0 < expDiff ) {
4963 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4964 q >>= 32 - expDiff;
4965 bSig >>= 2;
4966 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4967 }
4968 else {
4969 aSig >>= 2;
4970 bSig >>= 2;
4971 }
4972 }
/* Large exponent difference: reduce in steps using 64-bit estimates. */
4973 else {
4974 if ( bSig <= aSig ) aSig -= bSig;
4975 aSig64 = ( (uint64_t) aSig )<<40;
4976 bSig64 = ( (uint64_t) bSig )<<40;
4977 expDiff -= 64;
/*
 * Each pass consumes 62 bits of exponent difference.  The estimated
 * quotient is lowered by 2 (floored at 0) before it is used.
 */
4978 while ( 0 < expDiff ) {
4979 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4980 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4981 aSig64 = - ( ( bSig * q64 )<<38 );
4982 expDiff -= 62;
4983 }
/* Undo the bias applied before the loop and handle the remaining bits. */
4984 expDiff += 64;
4985 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4986 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4987 q = q64>>( 64 - expDiff );
4988 bSig <<= 6;
4989 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4990 }
/*
 * Keep subtracting bSig (counting in q) until aSig, read as a signed
 * value, goes negative.  alternateASig holds the value from just before
 * the final subtraction.
 */
4991 do {
4992 alternateASig = aSig;
4993 ++q;
4994 aSig -= bSig;
4995 } while ( 0 <= (int32_t) aSig );
/*
 * Choose between the two candidates: alternateASig is taken when the sum
 * of both is negative, or when the sum is zero and q is odd.
 */
4996 sigMean = aSig + alternateASig;
4997 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4998 aSig = alternateASig;
4999 }
/* A negative candidate flips the result sign; pack its magnitude. */
5000 zSign = ( (int32_t) aSig < 0 );
5001 if ( zSign ) aSig = - aSig;
5002 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5003 }
5004
5005
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns the binary exponential of the single-precision floating-point value
5009 | `a'. The operation is performed according to the IEC/IEEE Standard for
5010 | Binary Floating-Point Arithmetic.
5011 |
5012 | Uses the following identities:
5013 |
| 1. 2^{x} = e^{x \ln 2}
|
| 2. e^{x} = 1 + \frac{x}{1!} + \frac{x^{2}}{2!} + \frac{x^{3}}{3!}
|             + \frac{x^{4}}{4!} + \frac{x^{5}}{5!} + \dots
|             + \frac{x^{n}}{n!} + \dots
5023 *----------------------------------------------------------------------------*/
5024
5025 static const float64 float32_exp2_coefficients[15] =
5026 {
5027 const_float64( 0x3ff0000000000000ll ), /* 1 */
5028 const_float64( 0x3fe0000000000000ll ), /* 2 */
5029 const_float64( 0x3fc5555555555555ll ), /* 3 */
5030 const_float64( 0x3fa5555555555555ll ), /* 4 */
5031 const_float64( 0x3f81111111111111ll ), /* 5 */
5032 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
5033 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
5034 const_float64( 0x3efa01a01a01a01all ), /* 8 */
5035 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
5036 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5037 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5038 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5039 const_float64( 0x3de6124613a86d09ll ), /* 13 */
5040 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5041 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5042 };
5043
5044 float32 float32_exp2(float32 a, float_status *status)
5045 {
5046 bool aSign;
5047 int aExp;
5048 uint32_t aSig;
5049 float64 r, x, xn;
5050 int i;
5051 a = float32_squash_input_denormal(a, status);
5052
5053 aSig = extractFloat32Frac( a );
5054 aExp = extractFloat32Exp( a );
5055 aSign = extractFloat32Sign( a );
5056
5057 if ( aExp == 0xFF) {
5058 if (aSig) {
5059 return propagateFloat32NaN(a, float32_zero, status);
5060 }
5061 return (aSign) ? float32_zero : a;
5062 }
5063 if (aExp == 0) {
5064 if (aSig == 0) return float32_one;
5065 }
5066
5067 float_raise(float_flag_inexact, status);
5068
5069 /* ******************************* */
5070 /* using float64 for approximation */
5071 /* ******************************* */
5072 x = float32_to_float64(a, status);
5073 x = float64_mul(x, float64_ln2, status);
5074
5075 xn = x;
5076 r = float64_one;
5077 for (i = 0 ; i < 15 ; i++) {
5078 float64 f;
5079
5080 f = float64_mul(xn, float32_exp2_coefficients[i], status);
5081 r = float64_add(r, f, status);
5082
5083 xn = float64_mul(xn, x, status);
5084 }
5085
5086 return float64_to_float32(r, status);
5087 }
5088
5089 /*----------------------------------------------------------------------------
5090 | Returns the binary log of the single-precision floating-point value `a'.
5091 | The operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094 float32 float32_log2(float32 a, float_status *status)
5095 {
5096 bool aSign, zSign;
5097 int aExp;
5098 uint32_t aSig, zSig, i;
5099
5100 a = float32_squash_input_denormal(a, status);
5101 aSig = extractFloat32Frac( a );
5102 aExp = extractFloat32Exp( a );
5103 aSign = extractFloat32Sign( a );
5104
5105 if ( aExp == 0 ) {
5106 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5107 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5108 }
5109 if ( aSign ) {
5110 float_raise(float_flag_invalid, status);
5111 return float32_default_nan(status);
5112 }
5113 if ( aExp == 0xFF ) {
5114 if (aSig) {
5115 return propagateFloat32NaN(a, float32_zero, status);
5116 }
5117 return a;
5118 }
5119
5120 aExp -= 0x7F;
5121 aSig |= 0x00800000;
5122 zSign = aExp < 0;
5123 zSig = aExp << 23;
5124
5125 for (i = 1 << 22; i > 0; i >>= 1) {
5126 aSig = ( (uint64_t)aSig * aSig ) >> 23;
5127 if ( aSig & 0x01000000 ) {
5128 aSig >>= 1;
5129 zSig |= i;
5130 }
5131 }
5132
5133 if ( zSign )
5134 zSig = -zSig;
5135
5136 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5137 }
5138
5139 /*----------------------------------------------------------------------------
5140 | Returns the result of converting the double-precision floating-point value
5141 | `a' to the extended double-precision floating-point format. The conversion
5142 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5143 | Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145
5146 floatx80 float64_to_floatx80(float64 a, float_status *status)
5147 {
5148 bool aSign;
5149 int aExp;
5150 uint64_t aSig;
5151
5152 a = float64_squash_input_denormal(a, status);
5153 aSig = extractFloat64Frac( a );
5154 aExp = extractFloat64Exp( a );
5155 aSign = extractFloat64Sign( a );
5156 if ( aExp == 0x7FF ) {
5157 if (aSig) {
5158 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5159 status);
5160 return floatx80_silence_nan(res, status);
5161 }
5162 return packFloatx80(aSign,
5163 floatx80_infinity_high,
5164 floatx80_infinity_low);
5165 }
5166 if ( aExp == 0 ) {
5167 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5168 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5169 }
5170 return
5171 packFloatx80(
5172 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5173
5174 }
5175
5176 /*----------------------------------------------------------------------------
5177 | Returns the result of converting the double-precision floating-point value
5178 | `a' to the quadruple-precision floating-point format. The conversion is
5179 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5180 | Arithmetic.
5181 *----------------------------------------------------------------------------*/
5182
5183 float128 float64_to_float128(float64 a, float_status *status)
5184 {
5185 bool aSign;
5186 int aExp;
5187 uint64_t aSig, zSig0, zSig1;
5188
5189 a = float64_squash_input_denormal(a, status);
5190 aSig = extractFloat64Frac( a );
5191 aExp = extractFloat64Exp( a );
5192 aSign = extractFloat64Sign( a );
5193 if ( aExp == 0x7FF ) {
5194 if (aSig) {
5195 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5196 }
5197 return packFloat128( aSign, 0x7FFF, 0, 0 );
5198 }
5199 if ( aExp == 0 ) {
5200 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5201 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5202 --aExp;
5203 }
5204 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5205 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5206
5207 }
5208
5209
5210 /*----------------------------------------------------------------------------
5211 | Returns the remainder of the double-precision floating-point value `a'
5212 | with respect to the corresponding value `b'. The operation is performed
5213 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5214 *----------------------------------------------------------------------------*/
5215
5216 float64 float64_rem(float64 a, float64 b, float_status *status)
5217 {
5218 bool aSign, zSign;
5219 int aExp, bExp, expDiff;
5220 uint64_t aSig, bSig;
5221 uint64_t q, alternateASig;
5222 int64_t sigMean;
5223
5224 a = float64_squash_input_denormal(a, status);
5225 b = float64_squash_input_denormal(b, status);
5226 aSig = extractFloat64Frac( a );
5227 aExp = extractFloat64Exp( a );
5228 aSign = extractFloat64Sign( a );
5229 bSig = extractFloat64Frac( b );
5230 bExp = extractFloat64Exp( b );
5231 if ( aExp == 0x7FF ) {
5232 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5233 return propagateFloat64NaN(a, b, status);
5234 }
5235 float_raise(float_flag_invalid, status);
5236 return float64_default_nan(status);
5237 }
5238 if ( bExp == 0x7FF ) {
5239 if (bSig) {
5240 return propagateFloat64NaN(a, b, status);
5241 }
5242 return a;
5243 }
5244 if ( bExp == 0 ) {
5245 if ( bSig == 0 ) {
5246 float_raise(float_flag_invalid, status);
5247 return float64_default_nan(status);
5248 }
5249 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5250 }
5251 if ( aExp == 0 ) {
5252 if ( aSig == 0 ) return a;
5253 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5254 }
5255 expDiff = aExp - bExp;
5256 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5257 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5258 if ( expDiff < 0 ) {
5259 if ( expDiff < -1 ) return a;
5260 aSig >>= 1;
5261 }
5262 q = ( bSig <= aSig );
5263 if ( q ) aSig -= bSig;
5264 expDiff -= 64;
5265 while ( 0 < expDiff ) {
5266 q = estimateDiv128To64( aSig, 0, bSig );
5267 q = ( 2 < q ) ? q - 2 : 0;
5268 aSig = - ( ( bSig>>2 ) * q );
5269 expDiff -= 62;
5270 }
5271 expDiff += 64;
5272 if ( 0 < expDiff ) {
5273 q = estimateDiv128To64( aSig, 0, bSig );
5274 q = ( 2 < q ) ? q - 2 : 0;
5275 q >>= 64 - expDiff;
5276 bSig >>= 2;
5277 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5278 }
5279 else {
5280 aSig >>= 2;
5281 bSig >>= 2;
5282 }
5283 do {
5284 alternateASig = aSig;
5285 ++q;
5286 aSig -= bSig;
5287 } while ( 0 <= (int64_t) aSig );
5288 sigMean = aSig + alternateASig;
5289 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5290 aSig = alternateASig;
5291 }
5292 zSign = ( (int64_t) aSig < 0 );
5293 if ( zSign ) aSig = - aSig;
5294 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5295
5296 }
5297
5298 /*----------------------------------------------------------------------------
5299 | Returns the binary log of the double-precision floating-point value `a'.
5300 | The operation is performed according to the IEC/IEEE Standard for Binary
5301 | Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 float64 float64_log2(float64 a, float_status *status)
5304 {
5305 bool aSign, zSign;
5306 int aExp;
5307 uint64_t aSig, aSig0, aSig1, zSig, i;
5308 a = float64_squash_input_denormal(a, status);
5309
5310 aSig = extractFloat64Frac( a );
5311 aExp = extractFloat64Exp( a );
5312 aSign = extractFloat64Sign( a );
5313
5314 if ( aExp == 0 ) {
5315 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5316 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5317 }
5318 if ( aSign ) {
5319 float_raise(float_flag_invalid, status);
5320 return float64_default_nan(status);
5321 }
5322 if ( aExp == 0x7FF ) {
5323 if (aSig) {
5324 return propagateFloat64NaN(a, float64_zero, status);
5325 }
5326 return a;
5327 }
5328
5329 aExp -= 0x3FF;
5330 aSig |= UINT64_C(0x0010000000000000);
5331 zSign = aExp < 0;
5332 zSig = (uint64_t)aExp << 52;
5333 for (i = 1LL << 51; i > 0; i >>= 1) {
5334 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5335 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5336 if ( aSig & UINT64_C(0x0020000000000000) ) {
5337 aSig >>= 1;
5338 zSig |= i;
5339 }
5340 }
5341
5342 if ( zSign )
5343 zSig = -zSig;
5344 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5345 }
5346
5347 /*----------------------------------------------------------------------------
5348 | Returns the result of converting the extended double-precision floating-
5349 | point value `a' to the 32-bit two's complement integer format. The
5350 | conversion is performed according to the IEC/IEEE Standard for Binary
5351 | Floating-Point Arithmetic---which means in particular that the conversion
5352 | is rounded according to the current rounding mode. If `a' is a NaN, the
5353 | largest positive integer is returned. Otherwise, if the conversion
5354 | overflows, the largest integer with the same sign as `a' is returned.
5355 *----------------------------------------------------------------------------*/
5356
5357 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5358 {
5359 bool aSign;
5360 int32_t aExp, shiftCount;
5361 uint64_t aSig;
5362
5363 if (floatx80_invalid_encoding(a)) {
5364 float_raise(float_flag_invalid, status);
5365 return 1 << 31;
5366 }
5367 aSig = extractFloatx80Frac( a );
5368 aExp = extractFloatx80Exp( a );
5369 aSign = extractFloatx80Sign( a );
5370 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5371 shiftCount = 0x4037 - aExp;
5372 if ( shiftCount <= 0 ) shiftCount = 1;
5373 shift64RightJamming( aSig, shiftCount, &aSig );
5374 return roundAndPackInt32(aSign, aSig, status);
5375
5376 }
5377
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format. The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic, except that the conversion is always rounded
5383 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5384 | Otherwise, if the conversion overflows, the largest integer with the same
5385 | sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5387
5388 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5389 {
5390 bool aSign;
5391 int32_t aExp, shiftCount;
5392 uint64_t aSig, savedASig;
5393 int32_t z;
5394
5395 if (floatx80_invalid_encoding(a)) {
5396 float_raise(float_flag_invalid, status);
5397 return 1 << 31;
5398 }
5399 aSig = extractFloatx80Frac( a );
5400 aExp = extractFloatx80Exp( a );
5401 aSign = extractFloatx80Sign( a );
5402 if ( 0x401E < aExp ) {
5403 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5404 goto invalid;
5405 }
5406 else if ( aExp < 0x3FFF ) {
5407 if (aExp || aSig) {
5408 status->float_exception_flags |= float_flag_inexact;
5409 }
5410 return 0;
5411 }
5412 shiftCount = 0x403E - aExp;
5413 savedASig = aSig;
5414 aSig >>= shiftCount;
5415 z = aSig;
5416 if ( aSign ) z = - z;
5417 if ( ( z < 0 ) ^ aSign ) {
5418 invalid:
5419 float_raise(float_flag_invalid, status);
5420 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5421 }
5422 if ( ( aSig<<shiftCount ) != savedASig ) {
5423 status->float_exception_flags |= float_flag_inexact;
5424 }
5425 return z;
5426
5427 }
5428
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of converting the extended double-precision floating-
5431 | point value `a' to the 64-bit two's complement integer format. The
5432 | conversion is performed according to the IEC/IEEE Standard for Binary
5433 | Floating-Point Arithmetic---which means in particular that the conversion
5434 | is rounded according to the current rounding mode. If `a' is a NaN,
5435 | the largest positive integer is returned. Otherwise, if the conversion
5436 | overflows, the largest integer with the same sign as `a' is returned.
5437 *----------------------------------------------------------------------------*/
5438
5439 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5440 {
5441 bool aSign;
5442 int32_t aExp, shiftCount;
5443 uint64_t aSig, aSigExtra;
5444
5445 if (floatx80_invalid_encoding(a)) {
5446 float_raise(float_flag_invalid, status);
5447 return 1ULL << 63;
5448 }
5449 aSig = extractFloatx80Frac( a );
5450 aExp = extractFloatx80Exp( a );
5451 aSign = extractFloatx80Sign( a );
5452 shiftCount = 0x403E - aExp;
5453 if ( shiftCount <= 0 ) {
5454 if ( shiftCount ) {
5455 float_raise(float_flag_invalid, status);
5456 if (!aSign || floatx80_is_any_nan(a)) {
5457 return INT64_MAX;
5458 }
5459 return INT64_MIN;
5460 }
5461 aSigExtra = 0;
5462 }
5463 else {
5464 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5465 }
5466 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5467
5468 }
5469
5470 /*----------------------------------------------------------------------------
5471 | Returns the result of converting the extended double-precision floating-
5472 | point value `a' to the 64-bit two's complement integer format. The
5473 | conversion is performed according to the IEC/IEEE Standard for Binary
5474 | Floating-Point Arithmetic, except that the conversion is always rounded
5475 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5476 | Otherwise, if the conversion overflows, the largest integer with the same
5477 | sign as `a' is returned.
5478 *----------------------------------------------------------------------------*/
5479
5480 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5481 {
5482 bool aSign;
5483 int32_t aExp, shiftCount;
5484 uint64_t aSig;
5485 int64_t z;
5486
5487 if (floatx80_invalid_encoding(a)) {
5488 float_raise(float_flag_invalid, status);
5489 return 1ULL << 63;
5490 }
5491 aSig = extractFloatx80Frac( a );
5492 aExp = extractFloatx80Exp( a );
5493 aSign = extractFloatx80Sign( a );
5494 shiftCount = aExp - 0x403E;
5495 if ( 0 <= shiftCount ) {
5496 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5497 if ( ( a.high != 0xC03E ) || aSig ) {
5498 float_raise(float_flag_invalid, status);
5499 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5500 return INT64_MAX;
5501 }
5502 }
5503 return INT64_MIN;
5504 }
5505 else if ( aExp < 0x3FFF ) {
5506 if (aExp | aSig) {
5507 status->float_exception_flags |= float_flag_inexact;
5508 }
5509 return 0;
5510 }
5511 z = aSig>>( - shiftCount );
5512 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5513 status->float_exception_flags |= float_flag_inexact;
5514 }
5515 if ( aSign ) z = - z;
5516 return z;
5517
5518 }
5519
5520 /*----------------------------------------------------------------------------
5521 | Returns the result of converting the extended double-precision floating-
5522 | point value `a' to the single-precision floating-point format. The
5523 | conversion is performed according to the IEC/IEEE Standard for Binary
5524 | Floating-Point Arithmetic.
5525 *----------------------------------------------------------------------------*/
5526
5527 float32 floatx80_to_float32(floatx80 a, float_status *status)
5528 {
5529 bool aSign;
5530 int32_t aExp;
5531 uint64_t aSig;
5532
5533 if (floatx80_invalid_encoding(a)) {
5534 float_raise(float_flag_invalid, status);
5535 return float32_default_nan(status);
5536 }
5537 aSig = extractFloatx80Frac( a );
5538 aExp = extractFloatx80Exp( a );
5539 aSign = extractFloatx80Sign( a );
5540 if ( aExp == 0x7FFF ) {
5541 if ( (uint64_t) ( aSig<<1 ) ) {
5542 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5543 status);
5544 return float32_silence_nan(res, status);
5545 }
5546 return packFloat32( aSign, 0xFF, 0 );
5547 }
5548 shift64RightJamming( aSig, 33, &aSig );
5549 if ( aExp || aSig ) aExp -= 0x3F81;
5550 return roundAndPackFloat32(aSign, aExp, aSig, status);
5551
5552 }
5553
5554 /*----------------------------------------------------------------------------
5555 | Returns the result of converting the extended double-precision floating-
5556 | point value `a' to the double-precision floating-point format. The
5557 | conversion is performed according to the IEC/IEEE Standard for Binary
5558 | Floating-Point Arithmetic.
5559 *----------------------------------------------------------------------------*/
5560
5561 float64 floatx80_to_float64(floatx80 a, float_status *status)
5562 {
5563 bool aSign;
5564 int32_t aExp;
5565 uint64_t aSig, zSig;
5566
5567 if (floatx80_invalid_encoding(a)) {
5568 float_raise(float_flag_invalid, status);
5569 return float64_default_nan(status);
5570 }
5571 aSig = extractFloatx80Frac( a );
5572 aExp = extractFloatx80Exp( a );
5573 aSign = extractFloatx80Sign( a );
5574 if ( aExp == 0x7FFF ) {
5575 if ( (uint64_t) ( aSig<<1 ) ) {
5576 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5577 status);
5578 return float64_silence_nan(res, status);
5579 }
5580 return packFloat64( aSign, 0x7FF, 0 );
5581 }
5582 shift64RightJamming( aSig, 1, &zSig );
5583 if ( aExp || aSig ) aExp -= 0x3C01;
5584 return roundAndPackFloat64(aSign, aExp, zSig, status);
5585
5586 }
5587
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the quadruple-precision floating-point format. The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594
5595 float128 floatx80_to_float128(floatx80 a, float_status *status)
5596 {
5597 bool aSign;
5598 int aExp;
5599 uint64_t aSig, zSig0, zSig1;
5600
5601 if (floatx80_invalid_encoding(a)) {
5602 float_raise(float_flag_invalid, status);
5603 return float128_default_nan(status);
5604 }
5605 aSig = extractFloatx80Frac( a );
5606 aExp = extractFloatx80Exp( a );
5607 aSign = extractFloatx80Sign( a );
5608 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5609 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5610 status);
5611 return float128_silence_nan(res, status);
5612 }
5613 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5614 return packFloat128( aSign, aExp, zSig0, zSig1 );
5615
5616 }
5617
5618 /*----------------------------------------------------------------------------
5619 | Rounds the extended double-precision floating-point value `a'
5620 | to the precision provided by floatx80_rounding_precision and returns the
5621 | result as an extended double-precision floating-point value.
5622 | The operation is performed according to the IEC/IEEE Standard for Binary
5623 | Floating-Point Arithmetic.
5624 *----------------------------------------------------------------------------*/
5625
5626 floatx80 floatx80_round(floatx80 a, float_status *status)
5627 {
5628 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5629 extractFloatx80Sign(a),
5630 extractFloatx80Exp(a),
5631 extractFloatx80Frac(a), 0, status);
5632 }
5633
5634 /*----------------------------------------------------------------------------
5635 | Rounds the extended double-precision floating-point value `a' to an integer,
5636 | and returns the result as an extended quadruple-precision floating-point
5637 | value. The operation is performed according to the IEC/IEEE Standard for
5638 | Binary Floating-Point Arithmetic.
5639 *----------------------------------------------------------------------------*/
5640
5641 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5642 {
5643 bool aSign;
5644 int32_t aExp;
5645 uint64_t lastBitMask, roundBitsMask;
5646 floatx80 z;
5647
5648 if (floatx80_invalid_encoding(a)) {
5649 float_raise(float_flag_invalid, status);
5650 return floatx80_default_nan(status);
5651 }
5652 aExp = extractFloatx80Exp( a );
5653 if ( 0x403E <= aExp ) {
5654 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5655 return propagateFloatx80NaN(a, a, status);
5656 }
5657 return a;
5658 }
5659 if ( aExp < 0x3FFF ) {
5660 if ( ( aExp == 0 )
5661 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5662 return a;
5663 }
5664 status->float_exception_flags |= float_flag_inexact;
5665 aSign = extractFloatx80Sign( a );
5666 switch (status->float_rounding_mode) {
5667 case float_round_nearest_even:
5668 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5669 ) {
5670 return
5671 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5672 }
5673 break;
5674 case float_round_ties_away:
5675 if (aExp == 0x3FFE) {
5676 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5677 }
5678 break;
5679 case float_round_down:
5680 return
5681 aSign ?
5682 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5683 : packFloatx80( 0, 0, 0 );
5684 case float_round_up:
5685 return
5686 aSign ? packFloatx80( 1, 0, 0 )
5687 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5688
5689 case float_round_to_zero:
5690 break;
5691 default:
5692 g_assert_not_reached();
5693 }
5694 return packFloatx80( aSign, 0, 0 );
5695 }
5696 lastBitMask = 1;
5697 lastBitMask <<= 0x403E - aExp;
5698 roundBitsMask = lastBitMask - 1;
5699 z = a;
5700 switch (status->float_rounding_mode) {
5701 case float_round_nearest_even:
5702 z.low += lastBitMask>>1;
5703 if ((z.low & roundBitsMask) == 0) {
5704 z.low &= ~lastBitMask;
5705 }
5706 break;
5707 case float_round_ties_away:
5708 z.low += lastBitMask >> 1;
5709 break;
5710 case float_round_to_zero:
5711 break;
5712 case float_round_up:
5713 if (!extractFloatx80Sign(z)) {
5714 z.low += roundBitsMask;
5715 }
5716 break;
5717 case float_round_down:
5718 if (extractFloatx80Sign(z)) {
5719 z.low += roundBitsMask;
5720 }
5721 break;
5722 default:
5723 abort();
5724 }
5725 z.low &= ~ roundBitsMask;
5726 if ( z.low == 0 ) {
5727 ++z.high;
5728 z.low = UINT64_C(0x8000000000000000);
5729 }
5730 if (z.low != a.low) {
5731 status->float_exception_flags |= float_flag_inexact;
5732 }
5733 return z;
5734
5735 }
5736
5737 /*----------------------------------------------------------------------------
5738 | Returns the result of adding the absolute values of the extended double-
5739 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5740 | negated before being returned. `zSign' is ignored if the result is a NaN.
5741 | The addition is performed according to the IEC/IEEE Standard for Binary
5742 | Floating-Point Arithmetic.
5743 *----------------------------------------------------------------------------*/
5744
5745 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5746 float_status *status)
5747 {
5748 int32_t aExp, bExp, zExp;
5749 uint64_t aSig, bSig, zSig0, zSig1;
5750 int32_t expDiff;
5751
5752 aSig = extractFloatx80Frac( a );
5753 aExp = extractFloatx80Exp( a );
5754 bSig = extractFloatx80Frac( b );
5755 bExp = extractFloatx80Exp( b );
5756 expDiff = aExp - bExp;
5757 if ( 0 < expDiff ) {
5758 if ( aExp == 0x7FFF ) {
5759 if ((uint64_t)(aSig << 1)) {
5760 return propagateFloatx80NaN(a, b, status);
5761 }
5762 return a;
5763 }
5764 if ( bExp == 0 ) --expDiff;
5765 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5766 zExp = aExp;
5767 }
5768 else if ( expDiff < 0 ) {
5769 if ( bExp == 0x7FFF ) {
5770 if ((uint64_t)(bSig << 1)) {
5771 return propagateFloatx80NaN(a, b, status);
5772 }
5773 return packFloatx80(zSign,
5774 floatx80_infinity_high,
5775 floatx80_infinity_low);
5776 }
5777 if ( aExp == 0 ) ++expDiff;
5778 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5779 zExp = bExp;
5780 }
5781 else {
5782 if ( aExp == 0x7FFF ) {
5783 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5784 return propagateFloatx80NaN(a, b, status);
5785 }
5786 return a;
5787 }
5788 zSig1 = 0;
5789 zSig0 = aSig + bSig;
5790 if ( aExp == 0 ) {
5791 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5792 /* At least one of the values is a pseudo-denormal,
5793 * and there is a carry out of the result. */
5794 zExp = 1;
5795 goto shiftRight1;
5796 }
5797 if (zSig0 == 0) {
5798 return packFloatx80(zSign, 0, 0);
5799 }
5800 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5801 goto roundAndPack;
5802 }
5803 zExp = aExp;
5804 goto shiftRight1;
5805 }
5806 zSig0 = aSig + bSig;
5807 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5808 shiftRight1:
5809 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5810 zSig0 |= UINT64_C(0x8000000000000000);
5811 ++zExp;
5812 roundAndPack:
5813 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5814 zSign, zExp, zSig0, zSig1, status);
5815 }
5816
5817 /*----------------------------------------------------------------------------
5818 | Returns the result of subtracting the absolute values of the extended
5819 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5820 | difference is negated before being returned. `zSign' is ignored if the
5821 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5822 | Standard for Binary Floating-Point Arithmetic.
5823 *----------------------------------------------------------------------------*/
5824
5825 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5826 float_status *status)
5827 {
5828 int32_t aExp, bExp, zExp;
5829 uint64_t aSig, bSig, zSig0, zSig1;
5830 int32_t expDiff;
5831
5832 aSig = extractFloatx80Frac( a );
5833 aExp = extractFloatx80Exp( a );
5834 bSig = extractFloatx80Frac( b );
5835 bExp = extractFloatx80Exp( b );
5836 expDiff = aExp - bExp;
5837 if ( 0 < expDiff ) goto aExpBigger;
5838 if ( expDiff < 0 ) goto bExpBigger;
5839 if ( aExp == 0x7FFF ) {
5840 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5841 return propagateFloatx80NaN(a, b, status);
5842 }
5843 float_raise(float_flag_invalid, status);
5844 return floatx80_default_nan(status);
5845 }
5846 if ( aExp == 0 ) {
5847 aExp = 1;
5848 bExp = 1;
5849 }
5850 zSig1 = 0;
5851 if ( bSig < aSig ) goto aBigger;
5852 if ( aSig < bSig ) goto bBigger;
5853 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5854 bExpBigger:
5855 if ( bExp == 0x7FFF ) {
5856 if ((uint64_t)(bSig << 1)) {
5857 return propagateFloatx80NaN(a, b, status);
5858 }
5859 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5860 floatx80_infinity_low);
5861 }
5862 if ( aExp == 0 ) ++expDiff;
5863 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5864 bBigger:
5865 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5866 zExp = bExp;
5867 zSign ^= 1;
5868 goto normalizeRoundAndPack;
5869 aExpBigger:
5870 if ( aExp == 0x7FFF ) {
5871 if ((uint64_t)(aSig << 1)) {
5872 return propagateFloatx80NaN(a, b, status);
5873 }
5874 return a;
5875 }
5876 if ( bExp == 0 ) --expDiff;
5877 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5878 aBigger:
5879 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5880 zExp = aExp;
5881 normalizeRoundAndPack:
5882 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5883 zSign, zExp, zSig0, zSig1, status);
5884 }
5885
5886 /*----------------------------------------------------------------------------
5887 | Returns the result of adding the extended double-precision floating-point
5888 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5889 | Standard for Binary Floating-Point Arithmetic.
5890 *----------------------------------------------------------------------------*/
5891
5892 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5893 {
5894 bool aSign, bSign;
5895
5896 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5897 float_raise(float_flag_invalid, status);
5898 return floatx80_default_nan(status);
5899 }
5900 aSign = extractFloatx80Sign( a );
5901 bSign = extractFloatx80Sign( b );
5902 if ( aSign == bSign ) {
5903 return addFloatx80Sigs(a, b, aSign, status);
5904 }
5905 else {
5906 return subFloatx80Sigs(a, b, aSign, status);
5907 }
5908
5909 }
5910
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of subtracting the extended double-precision floating-
5913 | point values `a' and `b'. The operation is performed according to the
5914 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5916
5917 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5918 {
5919 bool aSign, bSign;
5920
5921 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5922 float_raise(float_flag_invalid, status);
5923 return floatx80_default_nan(status);
5924 }
5925 aSign = extractFloatx80Sign( a );
5926 bSign = extractFloatx80Sign( b );
5927 if ( aSign == bSign ) {
5928 return subFloatx80Sigs(a, b, aSign, status);
5929 }
5930 else {
5931 return addFloatx80Sigs(a, b, aSign, status);
5932 }
5933
5934 }
5935
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of multiplying the extended double-precision floating-
5938 | point values `a' and `b'. The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5941
5942 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5943 {
5944 bool aSign, bSign, zSign;
5945 int32_t aExp, bExp, zExp;
5946 uint64_t aSig, bSig, zSig0, zSig1;
5947
5948 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5949 float_raise(float_flag_invalid, status);
5950 return floatx80_default_nan(status);
5951 }
5952 aSig = extractFloatx80Frac( a );
5953 aExp = extractFloatx80Exp( a );
5954 aSign = extractFloatx80Sign( a );
5955 bSig = extractFloatx80Frac( b );
5956 bExp = extractFloatx80Exp( b );
5957 bSign = extractFloatx80Sign( b );
5958 zSign = aSign ^ bSign;
5959 if ( aExp == 0x7FFF ) {
5960 if ( (uint64_t) ( aSig<<1 )
5961 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5962 return propagateFloatx80NaN(a, b, status);
5963 }
5964 if ( ( bExp | bSig ) == 0 ) goto invalid;
5965 return packFloatx80(zSign, floatx80_infinity_high,
5966 floatx80_infinity_low);
5967 }
5968 if ( bExp == 0x7FFF ) {
5969 if ((uint64_t)(bSig << 1)) {
5970 return propagateFloatx80NaN(a, b, status);
5971 }
5972 if ( ( aExp | aSig ) == 0 ) {
5973 invalid:
5974 float_raise(float_flag_invalid, status);
5975 return floatx80_default_nan(status);
5976 }
5977 return packFloatx80(zSign, floatx80_infinity_high,
5978 floatx80_infinity_low);
5979 }
5980 if ( aExp == 0 ) {
5981 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5982 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5983 }
5984 if ( bExp == 0 ) {
5985 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5986 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5987 }
5988 zExp = aExp + bExp - 0x3FFE;
5989 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5990 if ( 0 < (int64_t) zSig0 ) {
5991 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5992 --zExp;
5993 }
5994 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5995 zSign, zExp, zSig0, zSig1, status);
5996 }
5997
5998 /*----------------------------------------------------------------------------
5999 | Returns the result of dividing the extended double-precision floating-point
6000 | value `a' by the corresponding value `b'. The operation is performed
6001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6003
6004 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6005 {
6006 bool aSign, bSign, zSign;
6007 int32_t aExp, bExp, zExp;
6008 uint64_t aSig, bSig, zSig0, zSig1;
6009 uint64_t rem0, rem1, rem2, term0, term1, term2;
6010
6011 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6012 float_raise(float_flag_invalid, status);
6013 return floatx80_default_nan(status);
6014 }
6015 aSig = extractFloatx80Frac( a );
6016 aExp = extractFloatx80Exp( a );
6017 aSign = extractFloatx80Sign( a );
6018 bSig = extractFloatx80Frac( b );
6019 bExp = extractFloatx80Exp( b );
6020 bSign = extractFloatx80Sign( b );
6021 zSign = aSign ^ bSign;
6022 if ( aExp == 0x7FFF ) {
6023 if ((uint64_t)(aSig << 1)) {
6024 return propagateFloatx80NaN(a, b, status);
6025 }
6026 if ( bExp == 0x7FFF ) {
6027 if ((uint64_t)(bSig << 1)) {
6028 return propagateFloatx80NaN(a, b, status);
6029 }
6030 goto invalid;
6031 }
6032 return packFloatx80(zSign, floatx80_infinity_high,
6033 floatx80_infinity_low);
6034 }
6035 if ( bExp == 0x7FFF ) {
6036 if ((uint64_t)(bSig << 1)) {
6037 return propagateFloatx80NaN(a, b, status);
6038 }
6039 return packFloatx80( zSign, 0, 0 );
6040 }
6041 if ( bExp == 0 ) {
6042 if ( bSig == 0 ) {
6043 if ( ( aExp | aSig ) == 0 ) {
6044 invalid:
6045 float_raise(float_flag_invalid, status);
6046 return floatx80_default_nan(status);
6047 }
6048 float_raise(float_flag_divbyzero, status);
6049 return packFloatx80(zSign, floatx80_infinity_high,
6050 floatx80_infinity_low);
6051 }
6052 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6053 }
6054 if ( aExp == 0 ) {
6055 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6056 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6057 }
6058 zExp = aExp - bExp + 0x3FFE;
6059 rem1 = 0;
6060 if ( bSig <= aSig ) {
6061 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6062 ++zExp;
6063 }
6064 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6065 mul64To128( bSig, zSig0, &term0, &term1 );
6066 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6067 while ( (int64_t) rem0 < 0 ) {
6068 --zSig0;
6069 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6070 }
6071 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6072 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6073 mul64To128( bSig, zSig1, &term1, &term2 );
6074 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6075 while ( (int64_t) rem1 < 0 ) {
6076 --zSig1;
6077 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6078 }
6079 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6080 }
6081 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6082 zSign, zExp, zSig0, zSig1, status);
6083 }
6084
6085 /*----------------------------------------------------------------------------
6086 | Returns the remainder of the extended double-precision floating-point value
6087 | `a' with respect to the corresponding value `b'. The operation is performed
6088 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6089 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6090 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6091 | the absolute value of the integer quotient.
6092 *----------------------------------------------------------------------------*/
6093
6094 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6095 float_status *status)
6096 {
6097 bool aSign, zSign;
6098 int32_t aExp, bExp, expDiff, aExpOrig;
6099 uint64_t aSig0, aSig1, bSig;
6100 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6101
6102 *quotient = 0;
6103 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6104 float_raise(float_flag_invalid, status);
6105 return floatx80_default_nan(status);
6106 }
6107 aSig0 = extractFloatx80Frac( a );
6108 aExpOrig = aExp = extractFloatx80Exp( a );
6109 aSign = extractFloatx80Sign( a );
6110 bSig = extractFloatx80Frac( b );
6111 bExp = extractFloatx80Exp( b );
6112 if ( aExp == 0x7FFF ) {
6113 if ( (uint64_t) ( aSig0<<1 )
6114 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6115 return propagateFloatx80NaN(a, b, status);
6116 }
6117 goto invalid;
6118 }
6119 if ( bExp == 0x7FFF ) {
6120 if ((uint64_t)(bSig << 1)) {
6121 return propagateFloatx80NaN(a, b, status);
6122 }
6123 if (aExp == 0 && aSig0 >> 63) {
6124 /*
6125 * Pseudo-denormal argument must be returned in normalized
6126 * form.
6127 */
6128 return packFloatx80(aSign, 1, aSig0);
6129 }
6130 return a;
6131 }
6132 if ( bExp == 0 ) {
6133 if ( bSig == 0 ) {
6134 invalid:
6135 float_raise(float_flag_invalid, status);
6136 return floatx80_default_nan(status);
6137 }
6138 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6139 }
6140 if ( aExp == 0 ) {
6141 if ( aSig0 == 0 ) return a;
6142 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6143 }
6144 zSign = aSign;
6145 expDiff = aExp - bExp;
6146 aSig1 = 0;
6147 if ( expDiff < 0 ) {
6148 if ( mod || expDiff < -1 ) {
6149 if (aExp == 1 && aExpOrig == 0) {
6150 /*
6151 * Pseudo-denormal argument must be returned in
6152 * normalized form.
6153 */
6154 return packFloatx80(aSign, aExp, aSig0);
6155 }
6156 return a;
6157 }
6158 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6159 expDiff = 0;
6160 }
6161 *quotient = q = ( bSig <= aSig0 );
6162 if ( q ) aSig0 -= bSig;
6163 expDiff -= 64;
6164 while ( 0 < expDiff ) {
6165 q = estimateDiv128To64( aSig0, aSig1, bSig );
6166 q = ( 2 < q ) ? q - 2 : 0;
6167 mul64To128( bSig, q, &term0, &term1 );
6168 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6169 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6170 expDiff -= 62;
6171 *quotient <<= 62;
6172 *quotient += q;
6173 }
6174 expDiff += 64;
6175 if ( 0 < expDiff ) {
6176 q = estimateDiv128To64( aSig0, aSig1, bSig );
6177 q = ( 2 < q ) ? q - 2 : 0;
6178 q >>= 64 - expDiff;
6179 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6180 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6181 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6182 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6183 ++q;
6184 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6185 }
6186 if (expDiff < 64) {
6187 *quotient <<= expDiff;
6188 } else {
6189 *quotient = 0;
6190 }
6191 *quotient += q;
6192 }
6193 else {
6194 term1 = 0;
6195 term0 = bSig;
6196 }
6197 if (!mod) {
6198 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6199 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6200 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6201 && ( q & 1 ) )
6202 ) {
6203 aSig0 = alternateASig0;
6204 aSig1 = alternateASig1;
6205 zSign = ! zSign;
6206 ++*quotient;
6207 }
6208 }
6209 return
6210 normalizeRoundAndPackFloatx80(
6211 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6212
6213 }
6214
6215 /*----------------------------------------------------------------------------
6216 | Returns the remainder of the extended double-precision floating-point value
6217 | `a' with respect to the corresponding value `b'. The operation is performed
6218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6220
6221 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6222 {
6223 uint64_t quotient;
6224 return floatx80_modrem(a, b, false, &quotient, status);
6225 }
6226
6227 /*----------------------------------------------------------------------------
6228 | Returns the remainder of the extended double-precision floating-point value
6229 | `a' with respect to the corresponding value `b', with the quotient truncated
6230 | toward zero.
6231 *----------------------------------------------------------------------------*/
6232
6233 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6234 {
6235 uint64_t quotient;
6236 return floatx80_modrem(a, b, true, &quotient, status);
6237 }
6238
6239 /*----------------------------------------------------------------------------
6240 | Returns the square root of the extended double-precision floating-point
6241 | value `a'. The operation is performed according to the IEC/IEEE Standard
6242 | for Binary Floating-Point Arithmetic.
6243 *----------------------------------------------------------------------------*/
6244
6245 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6246 {
6247 bool aSign;
6248 int32_t aExp, zExp;
6249 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6250 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6251
6252 if (floatx80_invalid_encoding(a)) {
6253 float_raise(float_flag_invalid, status);
6254 return floatx80_default_nan(status);
6255 }
6256 aSig0 = extractFloatx80Frac( a );
6257 aExp = extractFloatx80Exp( a );
6258 aSign = extractFloatx80Sign( a );
6259 if ( aExp == 0x7FFF ) {
6260 if ((uint64_t)(aSig0 << 1)) {
6261 return propagateFloatx80NaN(a, a, status);
6262 }
6263 if ( ! aSign ) return a;
6264 goto invalid;
6265 }
6266 if ( aSign ) {
6267 if ( ( aExp | aSig0 ) == 0 ) return a;
6268 invalid:
6269 float_raise(float_flag_invalid, status);
6270 return floatx80_default_nan(status);
6271 }
6272 if ( aExp == 0 ) {
6273 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6274 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6275 }
6276 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6277 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6278 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6279 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6280 doubleZSig0 = zSig0<<1;
6281 mul64To128( zSig0, zSig0, &term0, &term1 );
6282 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6283 while ( (int64_t) rem0 < 0 ) {
6284 --zSig0;
6285 doubleZSig0 -= 2;
6286 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6287 }
6288 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6289 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6290 if ( zSig1 == 0 ) zSig1 = 1;
6291 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6292 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6293 mul64To128( zSig1, zSig1, &term2, &term3 );
6294 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6295 while ( (int64_t) rem1 < 0 ) {
6296 --zSig1;
6297 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6298 term3 |= 1;
6299 term2 |= doubleZSig0;
6300 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6301 }
6302 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6303 }
6304 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6305 zSig0 |= doubleZSig0;
6306 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6307 0, zExp, zSig0, zSig1, status);
6308 }
6309
6310 /*----------------------------------------------------------------------------
6311 | Returns the result of converting the quadruple-precision floating-point
6312 | value `a' to the 32-bit two's complement integer format. The conversion
6313 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6314 | Arithmetic---which means in particular that the conversion is rounded
6315 | according to the current rounding mode. If `a' is a NaN, the largest
6316 | positive integer is returned. Otherwise, if the conversion overflows, the
6317 | largest integer with the same sign as `a' is returned.
6318 *----------------------------------------------------------------------------*/
6319
6320 int32_t float128_to_int32(float128 a, float_status *status)
6321 {
6322 bool aSign;
6323 int32_t aExp, shiftCount;
6324 uint64_t aSig0, aSig1;
6325
6326 aSig1 = extractFloat128Frac1( a );
6327 aSig0 = extractFloat128Frac0( a );
6328 aExp = extractFloat128Exp( a );
6329 aSign = extractFloat128Sign( a );
6330 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6331 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6332 aSig0 |= ( aSig1 != 0 );
6333 shiftCount = 0x4028 - aExp;
6334 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6335 return roundAndPackInt32(aSign, aSig0, status);
6336
6337 }
6338
6339 /*----------------------------------------------------------------------------
6340 | Returns the result of converting the quadruple-precision floating-point
6341 | value `a' to the 32-bit two's complement integer format. The conversion
6342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6343 | Arithmetic, except that the conversion is always rounded toward zero. If
6344 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6345 | conversion overflows, the largest integer with the same sign as `a' is
6346 | returned.
6347 *----------------------------------------------------------------------------*/
6348
6349 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6350 {
6351 bool aSign;
6352 int32_t aExp, shiftCount;
6353 uint64_t aSig0, aSig1, savedASig;
6354 int32_t z;
6355
6356 aSig1 = extractFloat128Frac1( a );
6357 aSig0 = extractFloat128Frac0( a );
6358 aExp = extractFloat128Exp( a );
6359 aSign = extractFloat128Sign( a );
6360 aSig0 |= ( aSig1 != 0 );
6361 if ( 0x401E < aExp ) {
6362 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6363 goto invalid;
6364 }
6365 else if ( aExp < 0x3FFF ) {
6366 if (aExp || aSig0) {
6367 status->float_exception_flags |= float_flag_inexact;
6368 }
6369 return 0;
6370 }
6371 aSig0 |= UINT64_C(0x0001000000000000);
6372 shiftCount = 0x402F - aExp;
6373 savedASig = aSig0;
6374 aSig0 >>= shiftCount;
6375 z = aSig0;
6376 if ( aSign ) z = - z;
6377 if ( ( z < 0 ) ^ aSign ) {
6378 invalid:
6379 float_raise(float_flag_invalid, status);
6380 return aSign ? INT32_MIN : INT32_MAX;
6381 }
6382 if ( ( aSig0<<shiftCount ) != savedASig ) {
6383 status->float_exception_flags |= float_flag_inexact;
6384 }
6385 return z;
6386
6387 }
6388
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of converting the quadruple-precision floating-point
6391 | value `a' to the 64-bit two's complement integer format. The conversion
6392 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6393 | Arithmetic---which means in particular that the conversion is rounded
6394 | according to the current rounding mode. If `a' is a NaN, the largest
6395 | positive integer is returned. Otherwise, if the conversion overflows, the
6396 | largest integer with the same sign as `a' is returned.
6397 *----------------------------------------------------------------------------*/
6398
6399 int64_t float128_to_int64(float128 a, float_status *status)
6400 {
6401 bool aSign;
6402 int32_t aExp, shiftCount;
6403 uint64_t aSig0, aSig1;
6404
6405 aSig1 = extractFloat128Frac1( a );
6406 aSig0 = extractFloat128Frac0( a );
6407 aExp = extractFloat128Exp( a );
6408 aSign = extractFloat128Sign( a );
6409 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6410 shiftCount = 0x402F - aExp;
6411 if ( shiftCount <= 0 ) {
6412 if ( 0x403E < aExp ) {
6413 float_raise(float_flag_invalid, status);
6414 if ( ! aSign
6415 || ( ( aExp == 0x7FFF )
6416 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6417 )
6418 ) {
6419 return INT64_MAX;
6420 }
6421 return INT64_MIN;
6422 }
6423 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6424 }
6425 else {
6426 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6427 }
6428 return roundAndPackInt64(aSign, aSig0, aSig1, status);
6429
6430 }
6431
6432 /*----------------------------------------------------------------------------
6433 | Returns the result of converting the quadruple-precision floating-point
6434 | value `a' to the 64-bit two's complement integer format. The conversion
6435 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6436 | Arithmetic, except that the conversion is always rounded toward zero.
6437 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6438 | the conversion overflows, the largest integer with the same sign as `a' is
6439 | returned.
6440 *----------------------------------------------------------------------------*/
6441
6442 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6443 {
6444 bool aSign;
6445 int32_t aExp, shiftCount;
6446 uint64_t aSig0, aSig1;
6447 int64_t z;
6448
6449 aSig1 = extractFloat128Frac1( a );
6450 aSig0 = extractFloat128Frac0( a );
6451 aExp = extractFloat128Exp( a );
6452 aSign = extractFloat128Sign( a );
6453 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6454 shiftCount = aExp - 0x402F;
6455 if ( 0 < shiftCount ) {
6456 if ( 0x403E <= aExp ) {
6457 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6458 if ( ( a.high == UINT64_C(0xC03E000000000000) )
6459 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6460 if (aSig1) {
6461 status->float_exception_flags |= float_flag_inexact;
6462 }
6463 }
6464 else {
6465 float_raise(float_flag_invalid, status);
6466 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6467 return INT64_MAX;
6468 }
6469 }
6470 return INT64_MIN;
6471 }
6472 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6473 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6474 status->float_exception_flags |= float_flag_inexact;
6475 }
6476 }
6477 else {
6478 if ( aExp < 0x3FFF ) {
6479 if ( aExp | aSig0 | aSig1 ) {
6480 status->float_exception_flags |= float_flag_inexact;
6481 }
6482 return 0;
6483 }
6484 z = aSig0>>( - shiftCount );
6485 if ( aSig1
6486 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6487 status->float_exception_flags |= float_flag_inexact;
6488 }
6489 }
6490 if ( aSign ) z = - z;
6491 return z;
6492
6493 }
6494
6495 /*----------------------------------------------------------------------------
6496 | Returns the result of converting the quadruple-precision floating-point value
6497 | `a' to the 64-bit unsigned integer format. The conversion is
6498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6499 | Arithmetic---which means in particular that the conversion is rounded
6500 | according to the current rounding mode. If `a' is a NaN, the largest
6501 | positive integer is returned. If the conversion overflows, the
6502 | largest unsigned integer is returned. If 'a' is negative, the value is
6503 | rounded and zero is returned; negative values that do not round to zero
6504 | will raise the inexact exception.
6505 *----------------------------------------------------------------------------*/
6506
6507 uint64_t float128_to_uint64(float128 a, float_status *status)
6508 {
6509 bool aSign;
6510 int aExp;
6511 int shiftCount;
6512 uint64_t aSig0, aSig1;
6513
6514 aSig0 = extractFloat128Frac0(a);
6515 aSig1 = extractFloat128Frac1(a);
6516 aExp = extractFloat128Exp(a);
6517 aSign = extractFloat128Sign(a);
6518 if (aSign && (aExp > 0x3FFE)) {
6519 float_raise(float_flag_invalid, status);
6520 if (float128_is_any_nan(a)) {
6521 return UINT64_MAX;
6522 } else {
6523 return 0;
6524 }
6525 }
6526 if (aExp) {
6527 aSig0 |= UINT64_C(0x0001000000000000);
6528 }
6529 shiftCount = 0x402F - aExp;
6530 if (shiftCount <= 0) {
6531 if (0x403E < aExp) {
6532 float_raise(float_flag_invalid, status);
6533 return UINT64_MAX;
6534 }
6535 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6536 } else {
6537 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6538 }
6539 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6540 }
6541
6542 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6543 {
6544 uint64_t v;
6545 signed char current_rounding_mode = status->float_rounding_mode;
6546
6547 set_float_rounding_mode(float_round_to_zero, status);
6548 v = float128_to_uint64(a, status);
6549 set_float_rounding_mode(current_rounding_mode, status);
6550
6551 return v;
6552 }
6553
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit unsigned integer format. The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic except that the conversion is always rounded toward zero.
6559 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6560 | if the conversion overflows, the largest unsigned integer is returned.
6561 | If 'a' is negative, the value is rounded and zero is returned; negative
6562 | values that do not round to zero will raise the inexact exception.
6563 *----------------------------------------------------------------------------*/
6564
6565 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6566 {
6567 uint64_t v;
6568 uint32_t res;
6569 int old_exc_flags = get_float_exception_flags(status);
6570
6571 v = float128_to_uint64_round_to_zero(a, status);
6572 if (v > 0xffffffff) {
6573 res = 0xffffffff;
6574 } else {
6575 return v;
6576 }
6577 set_float_exception_flags(old_exc_flags, status);
6578 float_raise(float_flag_invalid, status);
6579 return res;
6580 }
6581
6582 /*----------------------------------------------------------------------------
6583 | Returns the result of converting the quadruple-precision floating-point value
6584 | `a' to the 32-bit unsigned integer format. The conversion is
6585 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6586 | Arithmetic---which means in particular that the conversion is rounded
6587 | according to the current rounding mode. If `a' is a NaN, the largest
6588 | positive integer is returned. If the conversion overflows, the
6589 | largest unsigned integer is returned. If 'a' is negative, the value is
6590 | rounded and zero is returned; negative values that do not round to zero
6591 | will raise the inexact exception.
6592 *----------------------------------------------------------------------------*/
6593
6594 uint32_t float128_to_uint32(float128 a, float_status *status)
6595 {
6596 uint64_t v;
6597 uint32_t res;
6598 int old_exc_flags = get_float_exception_flags(status);
6599
6600 v = float128_to_uint64(a, status);
6601 if (v > 0xffffffff) {
6602 res = 0xffffffff;
6603 } else {
6604 return v;
6605 }
6606 set_float_exception_flags(old_exc_flags, status);
6607 float_raise(float_flag_invalid, status);
6608 return res;
6609 }
6610
6611 /*----------------------------------------------------------------------------
6612 | Returns the result of converting the quadruple-precision floating-point
6613 | value `a' to the single-precision floating-point format. The conversion
6614 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6615 | Arithmetic.
6616 *----------------------------------------------------------------------------*/
6617
6618 float32 float128_to_float32(float128 a, float_status *status)
6619 {
6620 bool aSign;
6621 int32_t aExp;
6622 uint64_t aSig0, aSig1;
6623 uint32_t zSig;
6624
6625 aSig1 = extractFloat128Frac1( a );
6626 aSig0 = extractFloat128Frac0( a );
6627 aExp = extractFloat128Exp( a );
6628 aSign = extractFloat128Sign( a );
6629 if ( aExp == 0x7FFF ) {
6630 if ( aSig0 | aSig1 ) {
6631 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6632 }
6633 return packFloat32( aSign, 0xFF, 0 );
6634 }
6635 aSig0 |= ( aSig1 != 0 );
6636 shift64RightJamming( aSig0, 18, &aSig0 );
6637 zSig = aSig0;
6638 if ( aExp || zSig ) {
6639 zSig |= 0x40000000;
6640 aExp -= 0x3F81;
6641 }
6642 return roundAndPackFloat32(aSign, aExp, zSig, status);
6643
6644 }
6645
6646 /*----------------------------------------------------------------------------
6647 | Returns the result of converting the quadruple-precision floating-point
6648 | value `a' to the double-precision floating-point format. The conversion
6649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6650 | Arithmetic.
6651 *----------------------------------------------------------------------------*/
6652
6653 float64 float128_to_float64(float128 a, float_status *status)
6654 {
6655 bool aSign;
6656 int32_t aExp;
6657 uint64_t aSig0, aSig1;
6658
6659 aSig1 = extractFloat128Frac1( a );
6660 aSig0 = extractFloat128Frac0( a );
6661 aExp = extractFloat128Exp( a );
6662 aSign = extractFloat128Sign( a );
6663 if ( aExp == 0x7FFF ) {
6664 if ( aSig0 | aSig1 ) {
6665 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6666 }
6667 return packFloat64( aSign, 0x7FF, 0 );
6668 }
6669 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6670 aSig0 |= ( aSig1 != 0 );
6671 if ( aExp || aSig0 ) {
6672 aSig0 |= UINT64_C(0x4000000000000000);
6673 aExp -= 0x3C01;
6674 }
6675 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6676
6677 }
6678
6679 /*----------------------------------------------------------------------------
6680 | Returns the result of converting the quadruple-precision floating-point
6681 | value `a' to the extended double-precision floating-point format. The
6682 | conversion is performed according to the IEC/IEEE Standard for Binary
6683 | Floating-Point Arithmetic.
6684 *----------------------------------------------------------------------------*/
6685
6686 floatx80 float128_to_floatx80(float128 a, float_status *status)
6687 {
6688 bool aSign;
6689 int32_t aExp;
6690 uint64_t aSig0, aSig1;
6691
6692 aSig1 = extractFloat128Frac1( a );
6693 aSig0 = extractFloat128Frac0( a );
6694 aExp = extractFloat128Exp( a );
6695 aSign = extractFloat128Sign( a );
6696 if ( aExp == 0x7FFF ) {
6697 if ( aSig0 | aSig1 ) {
6698 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6699 status);
6700 return floatx80_silence_nan(res, status);
6701 }
6702 return packFloatx80(aSign, floatx80_infinity_high,
6703 floatx80_infinity_low);
6704 }
6705 if ( aExp == 0 ) {
6706 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6707 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6708 }
6709 else {
6710 aSig0 |= UINT64_C(0x0001000000000000);
6711 }
6712 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6713 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6714
6715 }
6716
6717 /*----------------------------------------------------------------------------
6718 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6719 | returns the result as a quadruple-precision floating-point value. The
6720 | operation is performed according to the IEC/IEEE Standard for Binary
6721 | Floating-Point Arithmetic.
6722 *----------------------------------------------------------------------------*/
6723
6724 float128 float128_round_to_int(float128 a, float_status *status)
6725 {
6726 bool aSign;
6727 int32_t aExp;
6728 uint64_t lastBitMask, roundBitsMask;
6729 float128 z;
6730
6731 aExp = extractFloat128Exp( a );
6732 if ( 0x402F <= aExp ) {
6733 if ( 0x406F <= aExp ) {
6734 if ( ( aExp == 0x7FFF )
6735 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6736 ) {
6737 return propagateFloat128NaN(a, a, status);
6738 }
6739 return a;
6740 }
6741 lastBitMask = 1;
6742 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6743 roundBitsMask = lastBitMask - 1;
6744 z = a;
6745 switch (status->float_rounding_mode) {
6746 case float_round_nearest_even:
6747 if ( lastBitMask ) {
6748 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6749 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6750 }
6751 else {
6752 if ( (int64_t) z.low < 0 ) {
6753 ++z.high;
6754 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6755 }
6756 }
6757 break;
6758 case float_round_ties_away:
6759 if (lastBitMask) {
6760 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6761 } else {
6762 if ((int64_t) z.low < 0) {
6763 ++z.high;
6764 }
6765 }
6766 break;
6767 case float_round_to_zero:
6768 break;
6769 case float_round_up:
6770 if (!extractFloat128Sign(z)) {
6771 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6772 }
6773 break;
6774 case float_round_down:
6775 if (extractFloat128Sign(z)) {
6776 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6777 }
6778 break;
6779 case float_round_to_odd:
6780 /*
6781 * Note that if lastBitMask == 0, the last bit is the lsb
6782 * of high, and roundBitsMask == -1.
6783 */
6784 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6785 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6786 }
6787 break;
6788 default:
6789 abort();
6790 }
6791 z.low &= ~ roundBitsMask;
6792 }
6793 else {
6794 if ( aExp < 0x3FFF ) {
6795 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6796 status->float_exception_flags |= float_flag_inexact;
6797 aSign = extractFloat128Sign( a );
6798 switch (status->float_rounding_mode) {
6799 case float_round_nearest_even:
6800 if ( ( aExp == 0x3FFE )
6801 && ( extractFloat128Frac0( a )
6802 | extractFloat128Frac1( a ) )
6803 ) {
6804 return packFloat128( aSign, 0x3FFF, 0, 0 );
6805 }
6806 break;
6807 case float_round_ties_away:
6808 if (aExp == 0x3FFE) {
6809 return packFloat128(aSign, 0x3FFF, 0, 0);
6810 }
6811 break;
6812 case float_round_down:
6813 return
6814 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6815 : packFloat128( 0, 0, 0, 0 );
6816 case float_round_up:
6817 return
6818 aSign ? packFloat128( 1, 0, 0, 0 )
6819 : packFloat128( 0, 0x3FFF, 0, 0 );
6820
6821 case float_round_to_odd:
6822 return packFloat128(aSign, 0x3FFF, 0, 0);
6823
6824 case float_round_to_zero:
6825 break;
6826 }
6827 return packFloat128( aSign, 0, 0, 0 );
6828 }
6829 lastBitMask = 1;
6830 lastBitMask <<= 0x402F - aExp;
6831 roundBitsMask = lastBitMask - 1;
6832 z.low = 0;
6833 z.high = a.high;
6834 switch (status->float_rounding_mode) {
6835 case float_round_nearest_even:
6836 z.high += lastBitMask>>1;
6837 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6838 z.high &= ~ lastBitMask;
6839 }
6840 break;
6841 case float_round_ties_away:
6842 z.high += lastBitMask>>1;
6843 break;
6844 case float_round_to_zero:
6845 break;
6846 case float_round_up:
6847 if (!extractFloat128Sign(z)) {
6848 z.high |= ( a.low != 0 );
6849 z.high += roundBitsMask;
6850 }
6851 break;
6852 case float_round_down:
6853 if (extractFloat128Sign(z)) {
6854 z.high |= (a.low != 0);
6855 z.high += roundBitsMask;
6856 }
6857 break;
6858 case float_round_to_odd:
6859 if ((z.high & lastBitMask) == 0) {
6860 z.high |= (a.low != 0);
6861 z.high += roundBitsMask;
6862 }
6863 break;
6864 default:
6865 abort();
6866 }
6867 z.high &= ~ roundBitsMask;
6868 }
6869 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6870 status->float_exception_flags |= float_flag_inexact;
6871 }
6872 return z;
6873
6874 }
6875
6876 /*----------------------------------------------------------------------------
6877 | Returns the result of adding the absolute values of the quadruple-precision
6878 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6879 | before being returned. `zSign' is ignored if the result is a NaN.
6880 | The addition is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6883
6884 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6885 float_status *status)
6886 {
6887 int32_t aExp, bExp, zExp;
6888 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6889 int32_t expDiff;
6890
6891 aSig1 = extractFloat128Frac1( a );
6892 aSig0 = extractFloat128Frac0( a );
6893 aExp = extractFloat128Exp( a );
6894 bSig1 = extractFloat128Frac1( b );
6895 bSig0 = extractFloat128Frac0( b );
6896 bExp = extractFloat128Exp( b );
6897 expDiff = aExp - bExp;
6898 if ( 0 < expDiff ) {
6899 if ( aExp == 0x7FFF ) {
6900 if (aSig0 | aSig1) {
6901 return propagateFloat128NaN(a, b, status);
6902 }
6903 return a;
6904 }
6905 if ( bExp == 0 ) {
6906 --expDiff;
6907 }
6908 else {
6909 bSig0 |= UINT64_C(0x0001000000000000);
6910 }
6911 shift128ExtraRightJamming(
6912 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6913 zExp = aExp;
6914 }
6915 else if ( expDiff < 0 ) {
6916 if ( bExp == 0x7FFF ) {
6917 if (bSig0 | bSig1) {
6918 return propagateFloat128NaN(a, b, status);
6919 }
6920 return packFloat128( zSign, 0x7FFF, 0, 0 );
6921 }
6922 if ( aExp == 0 ) {
6923 ++expDiff;
6924 }
6925 else {
6926 aSig0 |= UINT64_C(0x0001000000000000);
6927 }
6928 shift128ExtraRightJamming(
6929 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6930 zExp = bExp;
6931 }
6932 else {
6933 if ( aExp == 0x7FFF ) {
6934 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6935 return propagateFloat128NaN(a, b, status);
6936 }
6937 return a;
6938 }
6939 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6940 if ( aExp == 0 ) {
6941 if (status->flush_to_zero) {
6942 if (zSig0 | zSig1) {
6943 float_raise(float_flag_output_denormal, status);
6944 }
6945 return packFloat128(zSign, 0, 0, 0);
6946 }
6947 return packFloat128( zSign, 0, zSig0, zSig1 );
6948 }
6949 zSig2 = 0;
6950 zSig0 |= UINT64_C(0x0002000000000000);
6951 zExp = aExp;
6952 goto shiftRight1;
6953 }
6954 aSig0 |= UINT64_C(0x0001000000000000);
6955 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6956 --zExp;
6957 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6958 ++zExp;
6959 shiftRight1:
6960 shift128ExtraRightJamming(
6961 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6962 roundAndPack:
6963 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6964
6965 }
6966
6967 /*----------------------------------------------------------------------------
6968 | Returns the result of subtracting the absolute values of the quadruple-
6969 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6970 | difference is negated before being returned. `zSign' is ignored if the
6971 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6972 | Standard for Binary Floating-Point Arithmetic.
6973 *----------------------------------------------------------------------------*/
6974
6975 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
6976 float_status *status)
6977 {
6978 int32_t aExp, bExp, zExp;
6979 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6980 int32_t expDiff;
6981
6982 aSig1 = extractFloat128Frac1( a );
6983 aSig0 = extractFloat128Frac0( a );
6984 aExp = extractFloat128Exp( a );
6985 bSig1 = extractFloat128Frac1( b );
6986 bSig0 = extractFloat128Frac0( b );
6987 bExp = extractFloat128Exp( b );
6988 expDiff = aExp - bExp;
6989 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6990 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6991 if ( 0 < expDiff ) goto aExpBigger;
6992 if ( expDiff < 0 ) goto bExpBigger;
6993 if ( aExp == 0x7FFF ) {
6994 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6995 return propagateFloat128NaN(a, b, status);
6996 }
6997 float_raise(float_flag_invalid, status);
6998 return float128_default_nan(status);
6999 }
7000 if ( aExp == 0 ) {
7001 aExp = 1;
7002 bExp = 1;
7003 }
7004 if ( bSig0 < aSig0 ) goto aBigger;
7005 if ( aSig0 < bSig0 ) goto bBigger;
7006 if ( bSig1 < aSig1 ) goto aBigger;
7007 if ( aSig1 < bSig1 ) goto bBigger;
7008 return packFloat128(status->float_rounding_mode == float_round_down,
7009 0, 0, 0);
7010 bExpBigger:
7011 if ( bExp == 0x7FFF ) {
7012 if (bSig0 | bSig1) {
7013 return propagateFloat128NaN(a, b, status);
7014 }
7015 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7016 }
7017 if ( aExp == 0 ) {
7018 ++expDiff;
7019 }
7020 else {
7021 aSig0 |= UINT64_C(0x4000000000000000);
7022 }
7023 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7024 bSig0 |= UINT64_C(0x4000000000000000);
7025 bBigger:
7026 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7027 zExp = bExp;
7028 zSign ^= 1;
7029 goto normalizeRoundAndPack;
7030 aExpBigger:
7031 if ( aExp == 0x7FFF ) {
7032 if (aSig0 | aSig1) {
7033 return propagateFloat128NaN(a, b, status);
7034 }
7035 return a;
7036 }
7037 if ( bExp == 0 ) {
7038 --expDiff;
7039 }
7040 else {
7041 bSig0 |= UINT64_C(0x4000000000000000);
7042 }
7043 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7044 aSig0 |= UINT64_C(0x4000000000000000);
7045 aBigger:
7046 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7047 zExp = aExp;
7048 normalizeRoundAndPack:
7049 --zExp;
7050 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7051 status);
7052
7053 }
7054
7055 /*----------------------------------------------------------------------------
7056 | Returns the result of adding the quadruple-precision floating-point values
7057 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7058 | for Binary Floating-Point Arithmetic.
7059 *----------------------------------------------------------------------------*/
7060
7061 float128 float128_add(float128 a, float128 b, float_status *status)
7062 {
7063 bool aSign, bSign;
7064
7065 aSign = extractFloat128Sign( a );
7066 bSign = extractFloat128Sign( b );
7067 if ( aSign == bSign ) {
7068 return addFloat128Sigs(a, b, aSign, status);
7069 }
7070 else {
7071 return subFloat128Sigs(a, b, aSign, status);
7072 }
7073
7074 }
7075
7076 /*----------------------------------------------------------------------------
7077 | Returns the result of subtracting the quadruple-precision floating-point
7078 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7079 | Standard for Binary Floating-Point Arithmetic.
7080 *----------------------------------------------------------------------------*/
7081
7082 float128 float128_sub(float128 a, float128 b, float_status *status)
7083 {
7084 bool aSign, bSign;
7085
7086 aSign = extractFloat128Sign( a );
7087 bSign = extractFloat128Sign( b );
7088 if ( aSign == bSign ) {
7089 return subFloat128Sigs(a, b, aSign, status);
7090 }
7091 else {
7092 return addFloat128Sigs(a, b, aSign, status);
7093 }
7094
7095 }
7096
7097 /*----------------------------------------------------------------------------
7098 | Returns the result of multiplying the quadruple-precision floating-point
7099 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7100 | Standard for Binary Floating-Point Arithmetic.
7101 *----------------------------------------------------------------------------*/
7102
7103 float128 float128_mul(float128 a, float128 b, float_status *status)
7104 {
7105 bool aSign, bSign, zSign;
7106 int32_t aExp, bExp, zExp;
7107 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7108
7109 aSig1 = extractFloat128Frac1( a );
7110 aSig0 = extractFloat128Frac0( a );
7111 aExp = extractFloat128Exp( a );
7112 aSign = extractFloat128Sign( a );
7113 bSig1 = extractFloat128Frac1( b );
7114 bSig0 = extractFloat128Frac0( b );
7115 bExp = extractFloat128Exp( b );
7116 bSign = extractFloat128Sign( b );
7117 zSign = aSign ^ bSign;
7118 if ( aExp == 0x7FFF ) {
7119 if ( ( aSig0 | aSig1 )
7120 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7121 return propagateFloat128NaN(a, b, status);
7122 }
7123 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7124 return packFloat128( zSign, 0x7FFF, 0, 0 );
7125 }
7126 if ( bExp == 0x7FFF ) {
7127 if (bSig0 | bSig1) {
7128 return propagateFloat128NaN(a, b, status);
7129 }
7130 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7131 invalid:
7132 float_raise(float_flag_invalid, status);
7133 return float128_default_nan(status);
7134 }
7135 return packFloat128( zSign, 0x7FFF, 0, 0 );
7136 }
7137 if ( aExp == 0 ) {
7138 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7139 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7140 }
7141 if ( bExp == 0 ) {
7142 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7143 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7144 }
7145 zExp = aExp + bExp - 0x4000;
7146 aSig0 |= UINT64_C(0x0001000000000000);
7147 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7148 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7149 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7150 zSig2 |= ( zSig3 != 0 );
7151 if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7152 shift128ExtraRightJamming(
7153 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7154 ++zExp;
7155 }
7156 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7157
7158 }
7159
7160 /*----------------------------------------------------------------------------
7161 | Returns the result of dividing the quadruple-precision floating-point value
7162 | `a' by the corresponding value `b'. The operation is performed according to
7163 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7164 *----------------------------------------------------------------------------*/
7165
7166 float128 float128_div(float128 a, float128 b, float_status *status)
7167 {
7168 bool aSign, bSign, zSign;
7169 int32_t aExp, bExp, zExp;
7170 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7171 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7172
7173 aSig1 = extractFloat128Frac1( a );
7174 aSig0 = extractFloat128Frac0( a );
7175 aExp = extractFloat128Exp( a );
7176 aSign = extractFloat128Sign( a );
7177 bSig1 = extractFloat128Frac1( b );
7178 bSig0 = extractFloat128Frac0( b );
7179 bExp = extractFloat128Exp( b );
7180 bSign = extractFloat128Sign( b );
7181 zSign = aSign ^ bSign;
7182 if ( aExp == 0x7FFF ) {
7183 if (aSig0 | aSig1) {
7184 return propagateFloat128NaN(a, b, status);
7185 }
7186 if ( bExp == 0x7FFF ) {
7187 if (bSig0 | bSig1) {
7188 return propagateFloat128NaN(a, b, status);
7189 }
7190 goto invalid;
7191 }
7192 return packFloat128( zSign, 0x7FFF, 0, 0 );
7193 }
7194 if ( bExp == 0x7FFF ) {
7195 if (bSig0 | bSig1) {
7196 return propagateFloat128NaN(a, b, status);
7197 }
7198 return packFloat128( zSign, 0, 0, 0 );
7199 }
7200 if ( bExp == 0 ) {
7201 if ( ( bSig0 | bSig1 ) == 0 ) {
7202 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7203 invalid:
7204 float_raise(float_flag_invalid, status);
7205 return float128_default_nan(status);
7206 }
7207 float_raise(float_flag_divbyzero, status);
7208 return packFloat128( zSign, 0x7FFF, 0, 0 );
7209 }
7210 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7211 }
7212 if ( aExp == 0 ) {
7213 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7214 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7215 }
7216 zExp = aExp - bExp + 0x3FFD;
7217 shortShift128Left(
7218 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7219 shortShift128Left(
7220 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7221 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7222 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7223 ++zExp;
7224 }
7225 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7226 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7227 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7228 while ( (int64_t) rem0 < 0 ) {
7229 --zSig0;
7230 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7231 }
7232 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7233 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7234 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7235 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7236 while ( (int64_t) rem1 < 0 ) {
7237 --zSig1;
7238 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7239 }
7240 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7241 }
7242 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7243 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7244
7245 }
7246
7247 /*----------------------------------------------------------------------------
7248 | Returns the remainder of the quadruple-precision floating-point value `a'
7249 | with respect to the corresponding value `b'. The operation is performed
7250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7251 *----------------------------------------------------------------------------*/
7252
7253 float128 float128_rem(float128 a, float128 b, float_status *status)
7254 {
7255 bool aSign, zSign;
7256 int32_t aExp, bExp, expDiff;
7257 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7258 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7259 int64_t sigMean0;
7260
7261 aSig1 = extractFloat128Frac1( a );
7262 aSig0 = extractFloat128Frac0( a );
7263 aExp = extractFloat128Exp( a );
7264 aSign = extractFloat128Sign( a );
7265 bSig1 = extractFloat128Frac1( b );
7266 bSig0 = extractFloat128Frac0( b );
7267 bExp = extractFloat128Exp( b );
7268 if ( aExp == 0x7FFF ) {
7269 if ( ( aSig0 | aSig1 )
7270 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7271 return propagateFloat128NaN(a, b, status);
7272 }
7273 goto invalid;
7274 }
7275 if ( bExp == 0x7FFF ) {
7276 if (bSig0 | bSig1) {
7277 return propagateFloat128NaN(a, b, status);
7278 }
7279 return a;
7280 }
7281 if ( bExp == 0 ) {
7282 if ( ( bSig0 | bSig1 ) == 0 ) {
7283 invalid:
7284 float_raise(float_flag_invalid, status);
7285 return float128_default_nan(status);
7286 }
7287 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7288 }
7289 if ( aExp == 0 ) {
7290 if ( ( aSig0 | aSig1 ) == 0 ) return a;
7291 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7292 }
7293 expDiff = aExp - bExp;
7294 if ( expDiff < -1 ) return a;
7295 shortShift128Left(
7296 aSig0 | UINT64_C(0x0001000000000000),
7297 aSig1,
7298 15 - ( expDiff < 0 ),
7299 &aSig0,
7300 &aSig1
7301 );
7302 shortShift128Left(
7303 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7304 q = le128( bSig0, bSig1, aSig0, aSig1 );
7305 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7306 expDiff -= 64;
7307 while ( 0 < expDiff ) {
7308 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7309 q = ( 4 < q ) ? q - 4 : 0;
7310 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7311 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7312 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7313 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7314 expDiff -= 61;
7315 }
7316 if ( -64 < expDiff ) {
7317 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7318 q = ( 4 < q ) ? q - 4 : 0;
7319 q >>= - expDiff;
7320 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7321 expDiff += 52;
7322 if ( expDiff < 0 ) {
7323 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7324 }
7325 else {
7326 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7327 }
7328 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7329 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7330 }
7331 else {
7332 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7333 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7334 }
7335 do {
7336 alternateASig0 = aSig0;
7337 alternateASig1 = aSig1;
7338 ++q;
7339 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7340 } while ( 0 <= (int64_t) aSig0 );
7341 add128(
7342 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7343 if ( ( sigMean0 < 0 )
7344 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7345 aSig0 = alternateASig0;
7346 aSig1 = alternateASig1;
7347 }
7348 zSign = ( (int64_t) aSig0 < 0 );
7349 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7350 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7351 status);
7352 }
7353
7354 /*----------------------------------------------------------------------------
7355 | Returns the square root of the quadruple-precision floating-point value `a'.
7356 | The operation is performed according to the IEC/IEEE Standard for Binary
7357 | Floating-Point Arithmetic.
7358 *----------------------------------------------------------------------------*/
7359
7360 float128 float128_sqrt(float128 a, float_status *status)
7361 {
7362 bool aSign;
7363 int32_t aExp, zExp;
7364 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7365 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7366
7367 aSig1 = extractFloat128Frac1( a );
7368 aSig0 = extractFloat128Frac0( a );
7369 aExp = extractFloat128Exp( a );
7370 aSign = extractFloat128Sign( a );
7371 if ( aExp == 0x7FFF ) {
7372 if (aSig0 | aSig1) {
7373 return propagateFloat128NaN(a, a, status);
7374 }
7375 if ( ! aSign ) return a;
7376 goto invalid;
7377 }
7378 if ( aSign ) {
7379 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7380 invalid:
7381 float_raise(float_flag_invalid, status);
7382 return float128_default_nan(status);
7383 }
7384 if ( aExp == 0 ) {
7385 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7386 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7387 }
7388 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7389 aSig0 |= UINT64_C(0x0001000000000000);
7390 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7391 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7392 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7393 doubleZSig0 = zSig0<<1;
7394 mul64To128( zSig0, zSig0, &term0, &term1 );
7395 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7396 while ( (int64_t) rem0 < 0 ) {
7397 --zSig0;
7398 doubleZSig0 -= 2;
7399 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7400 }
7401 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7402 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7403 if ( zSig1 == 0 ) zSig1 = 1;
7404 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7405 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7406 mul64To128( zSig1, zSig1, &term2, &term3 );
7407 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7408 while ( (int64_t) rem1 < 0 ) {
7409 --zSig1;
7410 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7411 term3 |= 1;
7412 term2 |= doubleZSig0;
7413 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7414 }
7415 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7416 }
7417 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7418 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7419
7420 }
7421
7422 static inline FloatRelation
7423 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7424 float_status *status)
7425 {
7426 bool aSign, bSign;
7427
7428 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7429 float_raise(float_flag_invalid, status);
7430 return float_relation_unordered;
7431 }
7432 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7433 ( extractFloatx80Frac( a )<<1 ) ) ||
7434 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7435 ( extractFloatx80Frac( b )<<1 ) )) {
7436 if (!is_quiet ||
7437 floatx80_is_signaling_nan(a, status) ||
7438 floatx80_is_signaling_nan(b, status)) {
7439 float_raise(float_flag_invalid, status);
7440 }
7441 return float_relation_unordered;
7442 }
7443 aSign = extractFloatx80Sign( a );
7444 bSign = extractFloatx80Sign( b );
7445 if ( aSign != bSign ) {
7446
7447 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7448 ( ( a.low | b.low ) == 0 ) ) {
7449 /* zero case */
7450 return float_relation_equal;
7451 } else {
7452 return 1 - (2 * aSign);
7453 }
7454 } else {
7455 /* Normalize pseudo-denormals before comparison. */
7456 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7457 ++a.high;
7458 }
7459 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7460 ++b.high;
7461 }
7462 if (a.low == b.low && a.high == b.high) {
7463 return float_relation_equal;
7464 } else {
7465 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7466 }
7467 }
7468 }
7469
7470 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7471 {
7472 return floatx80_compare_internal(a, b, 0, status);
7473 }
7474
7475 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7476 float_status *status)
7477 {
7478 return floatx80_compare_internal(a, b, 1, status);
7479 }
7480
7481 static inline FloatRelation
7482 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7483 float_status *status)
7484 {
7485 bool aSign, bSign;
7486
7487 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7488 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7489 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7490 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7491 if (!is_quiet ||
7492 float128_is_signaling_nan(a, status) ||
7493 float128_is_signaling_nan(b, status)) {
7494 float_raise(float_flag_invalid, status);
7495 }
7496 return float_relation_unordered;
7497 }
7498 aSign = extractFloat128Sign( a );
7499 bSign = extractFloat128Sign( b );
7500 if ( aSign != bSign ) {
7501 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7502 /* zero case */
7503 return float_relation_equal;
7504 } else {
7505 return 1 - (2 * aSign);
7506 }
7507 } else {
7508 if (a.low == b.low && a.high == b.high) {
7509 return float_relation_equal;
7510 } else {
7511 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7512 }
7513 }
7514 }
7515
7516 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7517 {
7518 return float128_compare_internal(a, b, 0, status);
7519 }
7520
7521 FloatRelation float128_compare_quiet(float128 a, float128 b,
7522 float_status *status)
7523 {
7524 return float128_compare_internal(a, b, 1, status);
7525 }
7526
7527 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7528 {
7529 bool aSign;
7530 int32_t aExp;
7531 uint64_t aSig;
7532
7533 if (floatx80_invalid_encoding(a)) {
7534 float_raise(float_flag_invalid, status);
7535 return floatx80_default_nan(status);
7536 }
7537 aSig = extractFloatx80Frac( a );
7538 aExp = extractFloatx80Exp( a );
7539 aSign = extractFloatx80Sign( a );
7540
7541 if ( aExp == 0x7FFF ) {
7542 if ( aSig<<1 ) {
7543 return propagateFloatx80NaN(a, a, status);
7544 }
7545 return a;
7546 }
7547
7548 if (aExp == 0) {
7549 if (aSig == 0) {
7550 return a;
7551 }
7552 aExp++;
7553 }
7554
7555 if (n > 0x10000) {
7556 n = 0x10000;
7557 } else if (n < -0x10000) {
7558 n = -0x10000;
7559 }
7560
7561 aExp += n;
7562 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7563 aSign, aExp, aSig, 0, status);
7564 }
7565
7566 float128 float128_scalbn(float128 a, int n, float_status *status)
7567 {
7568 bool aSign;
7569 int32_t aExp;
7570 uint64_t aSig0, aSig1;
7571
7572 aSig1 = extractFloat128Frac1( a );
7573 aSig0 = extractFloat128Frac0( a );
7574 aExp = extractFloat128Exp( a );
7575 aSign = extractFloat128Sign( a );
7576 if ( aExp == 0x7FFF ) {
7577 if ( aSig0 | aSig1 ) {
7578 return propagateFloat128NaN(a, a, status);
7579 }
7580 return a;
7581 }
7582 if (aExp != 0) {
7583 aSig0 |= UINT64_C(0x0001000000000000);
7584 } else if (aSig0 == 0 && aSig1 == 0) {
7585 return a;
7586 } else {
7587 aExp++;
7588 }
7589
7590 if (n > 0x10000) {
7591 n = 0x10000;
7592 } else if (n < -0x10000) {
7593 n = -0x10000;
7594 }
7595
7596 aExp += n - 1;
7597 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7598 , status);
7599
7600 }
7601
7602 static void __attribute__((constructor)) softfloat_init(void)
7603 {
7604 union_float64 ua, ub, uc, ur;
7605
7606 if (QEMU_NO_HARDFLOAT) {
7607 return;
7608 }
7609 /*
7610 * Test that the host's FMA is not obviously broken. For example,
7611 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7612 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7613 */
7614 ua.s = 0x0020000000000001ULL;
7615 ub.s = 0x3ca0000000000000ULL;
7616 uc.s = 0x0020000000000000ULL;
7617 ur.h = fma(ua.h, ub.h, uc.h);
7618 if (ur.s != 0x0020000000000001ULL) {
7619 force_soft_fma = true;
7620 }
7621 }