]> git.proxmox.com Git - mirror_qemu.git/blob - fpu/softfloat.c
softfloat: Use pointers with parts_default_nan
[mirror_qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Generate a helper that flushes a denormal input to a signed zero and
 * raises the input-denormal flag.  "__NOCHECK" because the caller must
 * already have verified that flush_inputs_to_zero is enabled.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/* Generate a one-input flush helper that first checks flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/* As GEN_INPUT_FLUSH1, but for two inputs. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/* As GEN_INPUT_FLUSH1, but for three inputs (e.g. fused multiply-add). */
#define GEN_INPUT_FLUSH3(name, soft_t)                                       \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                        \
        if (likely(!s->flush_inputs_to_zero)) {                              \
            return;                                                          \
        }                                                                    \
        soft_t ## _input_flush__nocheck(a, s);                               \
        soft_t ## _input_flush__nocheck(b, s);                               \
        soft_t ## _input_flush__nocheck(c, s);                               \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
/* Default: softfloat primitives everywhere. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/* noinline keeps the cold softfloat path out of the hardfloat fast path */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/*
 * Two views of the same bits: 's' is the softfloat bit-pattern type,
 * 'h' is the host's native floating-point type of the same width.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Pre/post-condition predicates used by the hardfloat generators. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat (emulated) and hardfloat (host FPU) two-input operations. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic two-input float32 operation: try the host FPU first, and fall
 * back to the softfloat implementation whenever exception detection on
 * the host would be unreliable.
 *
 * hard/soft: host and softfloat implementations of the same operation.
 * pre:  predicate the (flushed) inputs must satisfy to use the host FPU.
 * post: predicate consulted when the host result is suspiciously small.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs before the pre-check sees them. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Finite inputs producing inf means the host op overflowed. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result might be denormal: redo in softfloat to get flags right. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/*
 * Generic two-input float64 operation; see float32_gen2 for the scheme.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs before the pre-check sees them. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Finite inputs producing inf means the host op overflowed. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result might be denormal: redo in softfloat to get flags right. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified, /* raw, not yet canonicalized */
    float_class_zero,
    float_class_normal,       /* includes canonicalized denormals */
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471
/* Convert a FloatClass into a one-hot mask, for testing several at once. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* Combined masks for common multi-class tests. */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan/snan being the last (highest) enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;  /* fraction, normalized to DECOMPOSED_BINARY_POINT */
    int32_t exp;    /* unbiased exponent */
    FloatClass cls;
    bool sign;
} FloatParts64;

/* Bit position of the implicit leading one in the 64-bit fraction. */
#define DECOMPOSED_BINARY_POINT  63
#define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)
522
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
548
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
560
/* IEEE half precision: 1 sign, 5 exponent, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* ARM Alternative Half Precision: same layout, no Inf/NaN encodings. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* bfloat16: float32 with the low 16 fraction bits dropped. */
static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

/* IEEE single precision. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* IEEE double precision. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
581
582 /* Unpack a float to parts, but do not canonicalize. */
583 static inline FloatParts64 unpack_raw(FloatFmt fmt, uint64_t raw)
584 {
585 const int sign_pos = fmt.frac_size + fmt.exp_size;
586
587 return (FloatParts64) {
588 .cls = float_class_unclassified,
589 .sign = extract64(raw, sign_pos, 1),
590 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
591 .frac = extract64(raw, 0, fmt.frac_size),
592 };
593 }
594
/* Raw (non-canonicalizing) unpack for each supported format. */
static inline FloatParts64 float16_unpack_raw(float16 f)
{
    return unpack_raw(float16_params, f);
}

static inline FloatParts64 bfloat16_unpack_raw(bfloat16 f)
{
    return unpack_raw(bfloat16_params, f);
}

static inline FloatParts64 float32_unpack_raw(float32 f)
{
    return unpack_raw(float32_params, f);
}

static inline FloatParts64 float64_unpack_raw(float64 f)
{
    return unpack_raw(float64_params, f);
}
614
615 /* Pack a float from parts, but do not canonicalize. */
616 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts64 p)
617 {
618 const int sign_pos = fmt.frac_size + fmt.exp_size;
619 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
620 return deposit64(ret, sign_pos, 1, p.sign);
621 }
622
/* Raw (non-canonicalizing) pack for each supported format. */
static inline float16 float16_pack_raw(FloatParts64 p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline bfloat16 bfloat16_pack_raw(FloatParts64 p)
{
    return pack_raw(bfloat16_params, p);
}

static inline float32 float32_pack_raw(FloatParts64 p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts64 p)
{
    return make_float64(pack_raw(float64_params, p));
}
642
643 /*----------------------------------------------------------------------------
644 | Functions and definitions to determine: (1) whether tininess for underflow
645 | is detected before or after rounding by default, (2) what (if anything)
646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
648 | are propagated from function inputs to output. These details are target-
649 | specific.
650 *----------------------------------------------------------------------------*/
651 #include "softfloat-specialize.c.inc"
652
653 #define parts_default_nan parts64_default_nan
654
/* Canonicalize EXP and FRAC, setting CLS.
 *
 * On entry the parts are raw-format fields; on exit exp is unbiased,
 * frac is normalized to DECOMPOSED_BINARY_POINT, and cls is set.
 */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum exponent encodes Inf (frac == 0) or NaN. */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        /* Zero exponent: zero, or a denormal to flush or normalize. */
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Normalize the denormal, adjusting the exponent to match. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias exponent, restore the implicit bit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}
687
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Choose the rounding increment and whether overflow produces
         * the maximum normal (overflow_norm) or infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                /* Rounding up may carry out of the fraction entirely. */
                if (uadd64_overflow(frac, inc, &frac)) {
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value of this format. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Result is denormal in this format. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                /* Tininess-after-rounding: tiny unless rounding carries out. */
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have promoted the denormal back to normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN payload back into format position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
841
842 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
843 {
844 g_assert(is_nan(a.cls));
845 if (is_snan(a.cls)) {
846 float_raise(float_flag_invalid, s);
847 if (!s->default_nan_mode) {
848 return parts_silence_nan(a, s);
849 }
850 } else if (!s->default_nan_mode) {
851 return a;
852 }
853 parts_default_nan(&a, s);
854 return a;
855 }
856
857 static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
858 {
859 if (is_snan(a.cls) || is_snan(b.cls)) {
860 float_raise(float_flag_invalid, s);
861 }
862
863 if (s->default_nan_mode) {
864 parts_default_nan(&a, s);
865 } else {
866 if (pickNaN(a.cls, b.cls,
867 a.frac > b.frac ||
868 (a.frac == b.frac && a.sign < b.sign), s)) {
869 a = b;
870 }
871 if (is_snan(a.cls)) {
872 return parts_silence_nan(a, s);
873 }
874 }
875 return a;
876 }
877
878 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
879 bool inf_zero, float_status *s)
880 {
881 int which;
882
883 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
884 float_raise(float_flag_invalid, s);
885 }
886
887 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
888
889 if (s->default_nan_mode) {
890 /* Note that this check is after pickNaNMulAdd so that function
891 * has an opportunity to set the Invalid flag.
892 */
893 which = 3;
894 }
895
896 switch (which) {
897 case 0:
898 break;
899 case 1:
900 a = b;
901 break;
902 case 2:
903 a = c;
904 break;
905 case 3:
906 parts_default_nan(&a, s);
907 break;
908 default:
909 g_assert_not_reached();
910 }
911
912 if (is_snan(a.cls)) {
913 return parts_silence_nan(a, s);
914 }
915 return a;
916 }
917
/*
 * Pack/unpack routines with a specific FloatFmt.
 */

/* Unpack with an explicit format, for float16 variants (e.g. AHP). */
static FloatParts64 float16a_unpack_canonical(float16 f, float_status *s,
                                              const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

static FloatParts64 float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

static FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s)
{
    return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
}

/* Round and pack with an explicit format, for float16 variants. */
static float16 float16a_round_pack_canonical(FloatParts64 p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

static float16 float16_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
}

static FloatParts64 float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

static float32 float32_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

static FloatParts64 float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

static float64 float64_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}
973
974 /*
975 * Returns the result of adding or subtracting the values of the
976 * floating-point values `a' and `b'. The operation is performed
977 * according to the IEC/IEEE Standard for Binary Floating-Point
978 * Arithmetic.
979 */
980
/*
 * Core add/subtract on canonicalized parts.  @subtract is folded into
 * b's sign, so "a - b" and "a + (-b)" share one implementation; whether
 * a magnitude addition or subtraction happens is then determined by
 * comparing the effective signs.
 */
static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align exponents and subtract the smaller magnitude from
             * the larger, flipping the result sign when |b| > |a|.
             * Shifted-out bits are jammed into the lsb so later
             * rounding sees them.  */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: the zero is negative only when
                 * rounding toward minus infinity.  */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the msb sits at the binary point.  */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            /* Inf - Inf is invalid; Inf minus anything finite is Inf.  */
            if (b.cls == float_class_inf) {
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* (+0) + (-0): sign again depends on the rounding mode.  */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with the opposite of a's effective sign.  */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align to the larger exponent, jamming shifted-out bits.  */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the fraction: renormalize by one bit.  */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination is handled above.  */
    g_assert_not_reached();
}
1063
1064 /*
1065 * Returns the result of adding or subtracting the floating-point
1066 * values `a' and `b'. The operation is performed according to the
1067 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1068 */
1069
1070 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1071 {
1072 FloatParts64 pa = float16_unpack_canonical(a, status);
1073 FloatParts64 pb = float16_unpack_canonical(b, status);
1074 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1075
1076 return float16_round_pack_canonical(pr, status);
1077 }
1078
1079 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1080 {
1081 FloatParts64 pa = float16_unpack_canonical(a, status);
1082 FloatParts64 pb = float16_unpack_canonical(b, status);
1083 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1084
1085 return float16_round_pack_canonical(pr, status);
1086 }
1087
1088 static float32 QEMU_SOFTFLOAT_ATTR
1089 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1090 {
1091 FloatParts64 pa = float32_unpack_canonical(a, status);
1092 FloatParts64 pb = float32_unpack_canonical(b, status);
1093 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1094
1095 return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1099 {
1100 return soft_f32_addsub(a, b, false, status);
1101 }
1102
1103 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1104 {
1105 return soft_f32_addsub(a, b, true, status);
1106 }
1107
1108 static float64 QEMU_SOFTFLOAT_ATTR
1109 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1110 {
1111 FloatParts64 pa = float64_unpack_canonical(a, status);
1112 FloatParts64 pb = float64_unpack_canonical(b, status);
1113 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1114
1115 return float64_round_pack_canonical(pr, status);
1116 }
1117
1118 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1119 {
1120 return soft_f64_addsub(a, b, false, status);
1121 }
1122
1123 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1124 {
1125 return soft_f64_addsub(a, b, true, status);
1126 }
1127
/* Host-FPU single-precision addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;
    return sum;
}
1132
/* Host-FPU single-precision subtraction for the hardfloat fast path. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;
    return diff;
}
1137
/* Host-FPU double-precision addition for the hardfloat fast path. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;
    return sum;
}
1142
/* Host-FPU double-precision subtraction for the hardfloat fast path. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;
    return diff;
}
1147
1148 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1149 {
1150 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1151 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1152 }
1153 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1154 }
1155
1156 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1157 {
1158 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1159 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1160 } else {
1161 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1162 }
1163 }
1164
1165 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1166 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1167 {
1168 return float32_gen2(a, b, s, hard, soft,
1169 f32_is_zon2, f32_addsubmul_post);
1170 }
1171
1172 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1173 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1174 {
1175 return float64_gen2(a, b, s, hard, soft,
1176 f64_is_zon2, f64_addsubmul_post);
1177 }
1178
1179 float32 QEMU_FLATTEN
1180 float32_add(float32 a, float32 b, float_status *s)
1181 {
1182 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1183 }
1184
1185 float32 QEMU_FLATTEN
1186 float32_sub(float32 a, float32 b, float_status *s)
1187 {
1188 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1189 }
1190
1191 float64 QEMU_FLATTEN
1192 float64_add(float64 a, float64 b, float_status *s)
1193 {
1194 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1195 }
1196
1197 float64 QEMU_FLATTEN
1198 float64_sub(float64 a, float64 b, float_status *s)
1199 {
1200 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1201 }
1202
1203 /*
1204 * Returns the result of adding or subtracting the bfloat16
1205 * values `a' and `b'.
1206 */
1207 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1208 {
1209 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1210 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1211 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1212
1213 return bfloat16_round_pack_canonical(pr, status);
1214 }
1215
1216 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1217 {
1218 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1219 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1220 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1221
1222 return bfloat16_round_pack_canonical(pr, status);
1223 }
1224
1225 /*
1226 * Returns the result of multiplying the floating-point values `a' and
1227 * `b'. The operation is performed according to the IEC/IEEE Standard
1228 * for Binary Floating-Point Arithmetic.
1229 */
1230
/*
 * Core multiply on canonicalized parts.  The 64x64->128-bit fraction
 * product is truncated back to 64 bits with the low word folded into
 * the sticky lsb, so the final rounding step still sees inexactness.
 */
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        mul64To128(a.frac, b.frac, &hi, &lo);
        /* The product of two normalized fractions needs at most a
         * one-bit renormalization.  */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low word into the sticky bit.  */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    /* Every class combination is handled above.  */
    g_assert_not_reached();
}
1275
1276 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1277 {
1278 FloatParts64 pa = float16_unpack_canonical(a, status);
1279 FloatParts64 pb = float16_unpack_canonical(b, status);
1280 FloatParts64 pr = mul_floats(pa, pb, status);
1281
1282 return float16_round_pack_canonical(pr, status);
1283 }
1284
1285 static float32 QEMU_SOFTFLOAT_ATTR
1286 soft_f32_mul(float32 a, float32 b, float_status *status)
1287 {
1288 FloatParts64 pa = float32_unpack_canonical(a, status);
1289 FloatParts64 pb = float32_unpack_canonical(b, status);
1290 FloatParts64 pr = mul_floats(pa, pb, status);
1291
1292 return float32_round_pack_canonical(pr, status);
1293 }
1294
1295 static float64 QEMU_SOFTFLOAT_ATTR
1296 soft_f64_mul(float64 a, float64 b, float_status *status)
1297 {
1298 FloatParts64 pa = float64_unpack_canonical(a, status);
1299 FloatParts64 pb = float64_unpack_canonical(b, status);
1300 FloatParts64 pr = mul_floats(pa, pb, status);
1301
1302 return float64_round_pack_canonical(pr, status);
1303 }
1304
/* Host-FPU single-precision multiplication for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;
    return prod;
}
1309
/* Host-FPU double-precision multiplication for the hardfloat fast path. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;
    return prod;
}
1314
1315 float32 QEMU_FLATTEN
1316 float32_mul(float32 a, float32 b, float_status *s)
1317 {
1318 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1319 f32_is_zon2, f32_addsubmul_post);
1320 }
1321
1322 float64 QEMU_FLATTEN
1323 float64_mul(float64 a, float64 b, float_status *s)
1324 {
1325 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1326 f64_is_zon2, f64_addsubmul_post);
1327 }
1328
1329 /*
1330 * Returns the result of multiplying the bfloat16
1331 * values `a' and `b'.
1332 */
1333
1334 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1335 {
1336 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1337 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1338 FloatParts64 pr = mul_floats(pa, pb, status);
1339
1340 return bfloat16_round_pack_canonical(pr, status);
1341 }
1342
1343 /*
1344 * Returns the result of multiplying the floating-point values `a' and
1345 * `b' then adding 'c', with no intermediate rounding step after the
1346 * multiplication. The operation is performed according to the
1347 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1348 * The flags argument allows the caller to select negation of the
1349 * addend, the intermediate product, or the final result. (The
1350 * difference between this and having the caller do a separate
1351 * negation is that negating externally will flip the sign bit on
1352 * NaNs.)
1353 */
1354
/*
 * Core fused multiply-add on canonicalized parts: (a * b) + c with a
 * single rounding at the end.  @flags may negate the addend, the
 * intermediate product, or the final result (float_muladd_* bits), and
 * may halve the result before rounding.
 */
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN inputs: Inf * 0 is unconditionally invalid.  */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    /* Effective sign of the a*b product.  */
    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product without computing it.  */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        /* Inf - Inf is invalid; otherwise the Inf addend dominates.  */
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 of opposite signs: sign follows the rounding mode.  */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two 64-bit fractions.  */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend is at least as large: align the product to it.  */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product is larger: align the addend, keep 128 bits.  */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller 128-bit magnitude from the
                 * larger, flipping the sign when the addend wins.  */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero sign per rounding mode.  */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the remaining low bits into the sticky lsb.  */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1528
1529 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1530 int flags, float_status *status)
1531 {
1532 FloatParts64 pa = float16_unpack_canonical(a, status);
1533 FloatParts64 pb = float16_unpack_canonical(b, status);
1534 FloatParts64 pc = float16_unpack_canonical(c, status);
1535 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1536
1537 return float16_round_pack_canonical(pr, status);
1538 }
1539
1540 static float32 QEMU_SOFTFLOAT_ATTR
1541 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1542 float_status *status)
1543 {
1544 FloatParts64 pa = float32_unpack_canonical(a, status);
1545 FloatParts64 pb = float32_unpack_canonical(b, status);
1546 FloatParts64 pc = float32_unpack_canonical(c, status);
1547 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1548
1549 return float32_round_pack_canonical(pr, status);
1550 }
1551
1552 static float64 QEMU_SOFTFLOAT_ATTR
1553 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1554 float_status *status)
1555 {
1556 FloatParts64 pa = float64_unpack_canonical(a, status);
1557 FloatParts64 pb = float64_unpack_canonical(b, status);
1558 FloatParts64 pc = float64_unpack_canonical(c, status);
1559 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1560
1561 return float64_round_pack_canonical(pr, status);
1562 }
1563
/* When true, *_muladd always takes the softfloat path.  Not set in this
 * file; NOTE(review): presumably toggled by a test harness -- confirm. */
static bool force_soft_fma;
1565
/*
 * IEEE single-precision fused multiply-add.  Uses the host FPU's fmaf()
 * when the status flags permit and the inputs are zero-or-normal;
 * otherwise falls back to the softfloat implementation.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* can_use_fpu() gates the host-FPU fast path for this status.  */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Halving is not expressible with a single host fmaf().  */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path requires all three inputs to be zero or normal.  */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product explicitly so that 0 + c
         * follows the host's signed-zero addition rules.  */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may be subnormal/underflowing: redo in softfloat
             * with the original inputs for exact flag behaviour.  */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1636
/*
 * IEEE double-precision fused multiply-add.  Uses the host FPU's fma()
 * when the status flags permit and the inputs are zero-or-normal;
 * otherwise falls back to the softfloat implementation.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* can_use_fpu() gates the host-FPU fast path for this status.  */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Halving is not expressible with a single host fma().  */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path requires all three inputs to be zero or normal.  */
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build the signed-zero product explicitly so that 0 + c
         * follows the host's signed-zero addition rules.  */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the bound is FLT_MIN, not DBL_MIN, so any
             * |result| <= ~1.18e-38 is redone in softfloat even though
             * it is a perfectly normal double.  Safe but conservative
             * -- confirm this is intentional.  */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1707
1708 /*
1709 * Returns the result of multiplying the bfloat16 values `a'
1710 * and `b' then adding 'c', with no intermediate rounding step after the
1711 * multiplication.
1712 */
1713
1714 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1715 int flags, float_status *status)
1716 {
1717 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1718 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1719 FloatParts64 pc = bfloat16_unpack_canonical(c, status);
1720 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1721
1722 return bfloat16_round_pack_canonical(pr, status);
1723 }
1724
1725 /*
1726 * Returns the result of dividing the floating-point value `a' by the
1727 * corresponding value `b'. The operation is performed according to
1728 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1729 */
1730
/*
 * Core divide on canonicalized parts, using a 128/64-bit integer
 * division of the fractions; a nonzero remainder is folded into the
 * sticky lsb so the final rounding sees inexactness.
 */
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* Every class combination is handled above.  */
    g_assert_not_reached();
}
1796
1797 float16 float16_div(float16 a, float16 b, float_status *status)
1798 {
1799 FloatParts64 pa = float16_unpack_canonical(a, status);
1800 FloatParts64 pb = float16_unpack_canonical(b, status);
1801 FloatParts64 pr = div_floats(pa, pb, status);
1802
1803 return float16_round_pack_canonical(pr, status);
1804 }
1805
1806 static float32 QEMU_SOFTFLOAT_ATTR
1807 soft_f32_div(float32 a, float32 b, float_status *status)
1808 {
1809 FloatParts64 pa = float32_unpack_canonical(a, status);
1810 FloatParts64 pb = float32_unpack_canonical(b, status);
1811 FloatParts64 pr = div_floats(pa, pb, status);
1812
1813 return float32_round_pack_canonical(pr, status);
1814 }
1815
1816 static float64 QEMU_SOFTFLOAT_ATTR
1817 soft_f64_div(float64 a, float64 b, float_status *status)
1818 {
1819 FloatParts64 pa = float64_unpack_canonical(a, status);
1820 FloatParts64 pb = float64_unpack_canonical(b, status);
1821 FloatParts64 pr = div_floats(pa, pb, status);
1822
1823 return float64_round_pack_canonical(pr, status);
1824 }
1825
/* Host-FPU single-precision division for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    float quot = a / b;
    return quot;
}
1830
/* Host-FPU double-precision division for the hardfloat fast path. */
static double hard_f64_div(double a, double b)
{
    double quot = a / b;
    return quot;
}
1835
1836 static bool f32_div_pre(union_float32 a, union_float32 b)
1837 {
1838 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1839 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1840 fpclassify(b.h) == FP_NORMAL;
1841 }
1842 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1843 }
1844
1845 static bool f64_div_pre(union_float64 a, union_float64 b)
1846 {
1847 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1848 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1849 fpclassify(b.h) == FP_NORMAL;
1850 }
1851 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1852 }
1853
1854 static bool f32_div_post(union_float32 a, union_float32 b)
1855 {
1856 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1857 return fpclassify(a.h) != FP_ZERO;
1858 }
1859 return !float32_is_zero(a.s);
1860 }
1861
1862 static bool f64_div_post(union_float64 a, union_float64 b)
1863 {
1864 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1865 return fpclassify(a.h) != FP_ZERO;
1866 }
1867 return !float64_is_zero(a.s);
1868 }
1869
1870 float32 QEMU_FLATTEN
1871 float32_div(float32 a, float32 b, float_status *s)
1872 {
1873 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1874 f32_div_pre, f32_div_post);
1875 }
1876
1877 float64 QEMU_FLATTEN
1878 float64_div(float64 a, float64 b, float_status *s)
1879 {
1880 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1881 f64_div_pre, f64_div_post);
1882 }
1883
1884 /*
1885 * Returns the result of dividing the bfloat16
1886 * value `a' by the corresponding value `b'.
1887 */
1888
1889 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1890 {
1891 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1892 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1893 FloatParts64 pr = div_floats(pa, pb, status);
1894
1895 return bfloat16_round_pack_canonical(pr, status);
1896 }
1897
1898 /*
1899 * Float to Float conversions
1900 *
1901 * Returns the result of converting one float format to another. The
1902 * conversion is performed according to the IEC/IEEE Standard for
1903 * Binary Floating-Point Arithmetic.
1904 *
1905 * The float_to_float helper only needs to take care of raising
1906 * invalid exceptions and handling the conversion on NaNs.
1907 */
1908
/*
 * Adjust canonical parts for conversion into destination format @dstf.
 * Handles NaN conversion and, for ARM alternative half-precision
 * destinations (which have no NaN or Inf encodings), the mandated
 * substitute results; finite values pass through for repacking.
 */
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            /* Finite values convert without adjustment.  */
            break;
        }
    } else if (is_nan(a.cls)) {
        /* Standard IEEE destination: normal NaN conversion rules.  */
        return return_nan(a, s);
    }
    return a;
}
1943
1944 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1945 {
1946 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1947 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1948 FloatParts64 pr = float_to_float(p, &float32_params, s);
1949 return float32_round_pack_canonical(pr, s);
1950 }
1951
1952 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1953 {
1954 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1955 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1956 FloatParts64 pr = float_to_float(p, &float64_params, s);
1957 return float64_round_pack_canonical(pr, s);
1958 }
1959
1960 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1961 {
1962 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1963 FloatParts64 p = float32_unpack_canonical(a, s);
1964 FloatParts64 pr = float_to_float(p, fmt16, s);
1965 return float16a_round_pack_canonical(pr, s, fmt16);
1966 }
1967
1968 static float64 QEMU_SOFTFLOAT_ATTR
1969 soft_float32_to_float64(float32 a, float_status *s)
1970 {
1971 FloatParts64 p = float32_unpack_canonical(a, s);
1972 FloatParts64 pr = float_to_float(p, &float64_params, s);
1973 return float64_round_pack_canonical(pr, s);
1974 }
1975
1976 float64 float32_to_float64(float32 a, float_status *s)
1977 {
1978 if (likely(float32_is_normal(a))) {
1979 /* Widening conversion can never produce inexact results. */
1980 union_float32 uf;
1981 union_float64 ud;
1982 uf.s = a;
1983 ud.h = uf.h;
1984 return ud.s;
1985 } else if (float32_is_zero(a)) {
1986 return float64_set_sign(float64_zero, float32_is_neg(a));
1987 } else {
1988 return soft_float32_to_float64(a, s);
1989 }
1990 }
1991
1992 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1993 {
1994 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1995 FloatParts64 p = float64_unpack_canonical(a, s);
1996 FloatParts64 pr = float_to_float(p, fmt16, s);
1997 return float16a_round_pack_canonical(pr, s, fmt16);
1998 }
1999
2000 float32 float64_to_float32(float64 a, float_status *s)
2001 {
2002 FloatParts64 p = float64_unpack_canonical(a, s);
2003 FloatParts64 pr = float_to_float(p, &float32_params, s);
2004 return float32_round_pack_canonical(pr, s);
2005 }
2006
2007 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2008 {
2009 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2010 FloatParts64 pr = float_to_float(p, &float32_params, s);
2011 return float32_round_pack_canonical(pr, s);
2012 }
2013
2014 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2015 {
2016 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2017 FloatParts64 pr = float_to_float(p, &float64_params, s);
2018 return float64_round_pack_canonical(pr, s);
2019 }
2020
2021 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2022 {
2023 FloatParts64 p = float32_unpack_canonical(a, s);
2024 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2025 return bfloat16_round_pack_canonical(pr, s);
2026 }
2027
2028 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2029 {
2030 FloatParts64 p = float64_unpack_canonical(a, s);
2031 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2032 return bfloat16_round_pack_canonical(pr, s);
2033 }
2034
2035 /*
2036 * Rounds the floating-point value `a' to an integer, and returns the
2037 * result as a floating-point value. The operation is performed
2038 * according to the IEC/IEEE Standard for Binary Floating-Point
2039 * Arithmetic.
2040 */
2041
/*
 * Round canonical parts to an integral value per @rmode, after scaling
 * the exponent by @scale.  A value entirely below 1.0 (a.exp < 0)
 * collapses to 0 or +/-1 by mode; otherwise the fraction bits below
 * the integer lsb are rounded away with masks.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so a.exp cannot overflow int range.  */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            /* Decide whether |a| rounds to 1 or to 0.  */
            switch (rmode) {
            case float_round_nearest_even:
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb is the integer-lsb position in the fraction;
             * everything under rnd_mask is the fractional part.  */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            /* Pick the increment that implements @rmode when added
             * to the fraction before masking.  */
            switch (rmode) {
            case float_round_nearest_even:
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Rounding carried out of the fraction.  */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2142
2143 float16 float16_round_to_int(float16 a, float_status *s)
2144 {
2145 FloatParts64 pa = float16_unpack_canonical(a, s);
2146 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2147 return float16_round_pack_canonical(pr, s);
2148 }
2149
2150 float32 float32_round_to_int(float32 a, float_status *s)
2151 {
2152 FloatParts64 pa = float32_unpack_canonical(a, s);
2153 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2154 return float32_round_pack_canonical(pr, s);
2155 }
2156
2157 float64 float64_round_to_int(float64 a, float_status *s)
2158 {
2159 FloatParts64 pa = float64_unpack_canonical(a, s);
2160 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2161 return float64_round_pack_canonical(pr, s);
2162 }
2163
2164 /*
2165 * Rounds the bfloat16 value `a' to an integer, and returns the
2166 * result as a bfloat16 value.
2167 */
2168
2169 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2170 {
2171 FloatParts64 pa = bfloat16_unpack_canonical(a, s);
2172 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2173 return bfloat16_round_pack_canonical(pr, s);
2174 }
2175
2176 /*
2177 * Returns the result of converting the floating-point value `a' to
2178 * the two's complement integer format. The conversion is performed
2179 * according to the IEC/IEEE Standard for Binary Floating-Point
2180 * Arithmetic---which means in particular that the conversion is
2181 * rounded according to the current rounding mode. If `a' is a NaN,
2182 * the largest positive integer is returned. Otherwise, if the
2183 * conversion overflows, the largest integer with the same sign as `a'
2184 * is returned.
2185 */
2186
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* saved so a later "invalid" result can discard the inexact flag
     * that round_to_int may have raised for the same operation */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* too large for the fraction: force saturation below */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min is the magnitude of the most negative
             * representable value, computed without signed overflow */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2230
/* Conversions to a signed integer with explicit rounding mode and scale
 * (the input is effectively multiplied by 2**scale before conversion). */

int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Convenience wrappers using the current rounding mode and no scaling. */

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers that always truncate toward zero. */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2451
2452 /*
2453 * Returns the result of converting the floating-point value `a' to
2454 * the unsigned integer format. The conversion is performed according
2455 * to the IEC/IEEE Standard for Binary Floating-Point
2456 * Arithmetic---which means in particular that the conversion is
2457 * rounded according to the current rounding mode. If `a' is a NaN,
2458 * the largest unsigned integer is returned. Otherwise, if the
2459 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
2461 * values that do not round to zero will raise the inexact exception
2462 * flag.
2463 */
2464
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* saved so a later "invalid" result can discard the inexact flag
     * that round_to_int may have raised for the same operation */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* negative values round toward zero; anything still negative
         * after rounding saturates to 0 and raises invalid */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2509
/* Conversions to an unsigned integer with explicit rounding mode and
 * scale (the input is effectively multiplied by 2**scale first). */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

/* Convenience wrappers using the current rounding mode and no scaling. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers that always truncate toward zero. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2730
2731 /*
2732 * Integer to float conversions
2733 *
2734 * Returns the result of converting the two's complement integer `a'
2735 * to the floating-point format. The conversion is performed according
2736 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2737 */
2738
2739 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2740 {
2741 FloatParts64 r = { .sign = false };
2742
2743 if (a == 0) {
2744 r.cls = float_class_zero;
2745 } else {
2746 uint64_t f = a;
2747 int shift;
2748
2749 r.cls = float_class_normal;
2750 if (a < 0) {
2751 f = -f;
2752 r.sign = true;
2753 }
2754 shift = clz64(f);
2755 scale = MIN(MAX(scale, -0x10000), 0x10000);
2756
2757 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2758 r.frac = f << shift;
2759 }
2760
2761 return r;
2762 }
2763
/* Signed integer to float16, with optional scaling by 2**scale; the
 * narrower variants sign-extend into the int64 path. */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed integer to float32. */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed integer to float64. */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
2897
2898 /*
2899 * Unsigned Integer to float conversions
2900 *
2901 * Returns the result of converting the unsigned integer `a' to the
2902 * floating-point format. The conversion is performed according to the
2903 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2904 */
2905
2906 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
2907 {
2908 FloatParts64 r = { .sign = false };
2909 int shift;
2910
2911 if (a == 0) {
2912 r.cls = float_class_zero;
2913 } else {
2914 scale = MIN(MAX(scale, -0x10000), 0x10000);
2915 shift = clz64(a);
2916 r.cls = float_class_normal;
2917 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2918 r.frac = a << shift;
2919 }
2920
2921 return r;
2922 }
2923
/* Unsigned integer to float16, with optional scaling by 2**scale; the
 * narrower variants zero-extend into the uint64 path. */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

/* Unsigned integer to float32. */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

/* Unsigned integer to float64. */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3057
3058 /* Float Min/Max */
3059 /* min() and max() functions. These can't be implemented as
3060 * 'compare and pick one input' because that would mishandle
3061 * NaNs and +0 vs -0.
3062 *
3063 * minnum() and maxnum() functions. These are similar to the min()
3064 * and max() functions but if one of the arguments is a QNaN and
3065 * the other is numerical then the numerical argument is returned.
3066 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3073 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* minNum/maxNum: a single quiet NaN loses to a number */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* both operands are NaN, or non-IEEE semantics requested */
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Map each class onto an exponent so that plain integer
         * comparison of (exp, frac) orders the magnitudes. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            /* magnitudes differ: signs are irrelevant for *nummag */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* same sign: xor with the sign flips the magnitude
             * comparison for negative values */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* differing signs: the negative operand is the smaller;
             * this also orders -0 below +0 */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3144
3145 #define MINMAX(sz, name, ismin, isiee, ismag) \
3146 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3147 float_status *s) \
3148 { \
3149 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3150 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3151 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3152 \
3153 return float ## sz ## _round_pack_canonical(pr, s); \
3154 }
3155
3156 MINMAX(16, min, true, false, false)
3157 MINMAX(16, minnum, true, true, false)
3158 MINMAX(16, minnummag, true, true, true)
3159 MINMAX(16, max, false, false, false)
3160 MINMAX(16, maxnum, false, true, false)
3161 MINMAX(16, maxnummag, false, true, true)
3162
3163 MINMAX(32, min, true, false, false)
3164 MINMAX(32, minnum, true, true, false)
3165 MINMAX(32, minnummag, true, true, true)
3166 MINMAX(32, max, false, false, false)
3167 MINMAX(32, maxnum, false, true, false)
3168 MINMAX(32, maxnummag, false, true, true)
3169
3170 MINMAX(64, min, true, false, false)
3171 MINMAX(64, minnum, true, true, false)
3172 MINMAX(64, minnummag, true, true, true)
3173 MINMAX(64, max, false, false, false)
3174 MINMAX(64, maxnum, false, true, false)
3175 MINMAX(64, maxnummag, false, true, true)
3176
3177 #undef MINMAX
3178
3179 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3180 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3181 { \
3182 FloatParts64 pa = bfloat16_unpack_canonical(a, s); \
3183 FloatParts64 pb = bfloat16_unpack_canonical(b, s); \
3184 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3185 \
3186 return bfloat16_round_pack_canonical(pr, s); \
3187 }
3188
3189 BF16_MINMAX(min, true, false, false)
3190 BF16_MINMAX(minnum, true, true, false)
3191 BF16_MINMAX(minnummag, true, true, true)
3192 BF16_MINMAX(max, false, false, false)
3193 BF16_MINMAX(maxnum, false, true, false)
3194 BF16_MINMAX(maxnummag, false, true, true)
3195
3196 #undef BF16_MINMAX
3197
3198 /* Floating point compare */
/*
 * Totally order two decomposed floating-point values.
 *
 * Returns float_relation_{less,equal,greater,unordered}.  With
 * @is_quiet false, any NaN operand raises the invalid flag; with
 * @is_quiet true, only a signalling NaN raises it.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both finite and non-zero: differing signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponents, then fractions.  A larger
     * magnitude means "greater" for positive values but "less" for
     * negative ones, hence the sign-dependent branches.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3255
/*
 * Instantiate the softfloat compare helpers soft_f{16,32,64}_compare.
 * The f16 variant is flattened; f32/f64 get the softfloat attribute
 * since they are only the slow path behind the hardfloat versions.
 */
#define COMPARE(name, attr, sz)                                         \
    static int attr                                                     \
    name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)  \
    {                                                                   \
        FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);       \
        FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);       \
        return compare_floats(pa, pb, is_quiet, s);                     \
    }

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3270
3271 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3272 {
3273 return soft_f16_compare(a, b, false, s);
3274 }
3275
3276 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3277 {
3278 return soft_f16_compare(a, b, true, s);
3279 }
3280
/*
 * Hardfloat-accelerated float32 compare.  Uses the host FPU's
 * quiet comparison macros for the ordered cases and falls back to
 * the softfloat path for unordered operands (so the correct guest
 * exception flags are raised there).
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Flush denormal inputs first so hard and soft paths agree. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3309
3310 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3311 {
3312 return f32_compare(a, b, false, s);
3313 }
3314
3315 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3316 {
3317 return f32_compare(a, b, true, s);
3318 }
3319
/*
 * Hardfloat-accelerated float64 compare; mirrors f32_compare above.
 * Ordered results come straight from the host FPU; unordered operands
 * fall through to the softfloat path for correct flag raising.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    /* Flush denormal inputs first so hard and soft paths agree. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3348
3349 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3350 {
3351 return f64_compare(a, b, false, s);
3352 }
3353
3354 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3355 {
3356 return f64_compare(a, b, true, s);
3357 }
3358
3359 static FloatRelation QEMU_FLATTEN
3360 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3361 {
3362 FloatParts64 pa = bfloat16_unpack_canonical(a, s);
3363 FloatParts64 pb = bfloat16_unpack_canonical(b, s);
3364 return compare_floats(pa, pb, is_quiet, s);
3365 }
3366
3367 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3368 {
3369 return soft_bf16_compare(a, b, false, s);
3370 }
3371
3372 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3373 {
3374 return soft_bf16_compare(a, b, true, s);
3375 }
3376
3377 /* Multiply A by 2 raised to the power N. */
3378 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3379 {
3380 if (unlikely(is_nan(a.cls))) {
3381 return return_nan(a, s);
3382 }
3383 if (a.cls == float_class_normal) {
3384 /* The largest float type (even though not supported by FloatParts64)
3385 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3386 * still allows rounding to infinity, without allowing overflow
3387 * within the int32_t that backs FloatParts64.exp.
3388 */
3389 n = MIN(MAX(n, -0x10000), 0x10000);
3390 a.exp += n;
3391 }
3392 return a;
3393 }
3394
3395 float16 float16_scalbn(float16 a, int n, float_status *status)
3396 {
3397 FloatParts64 pa = float16_unpack_canonical(a, status);
3398 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3399 return float16_round_pack_canonical(pr, status);
3400 }
3401
3402 float32 float32_scalbn(float32 a, int n, float_status *status)
3403 {
3404 FloatParts64 pa = float32_unpack_canonical(a, status);
3405 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3406 return float32_round_pack_canonical(pr, status);
3407 }
3408
3409 float64 float64_scalbn(float64 a, int n, float_status *status)
3410 {
3411 FloatParts64 pa = float64_unpack_canonical(a, status);
3412 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3413 return float64_round_pack_canonical(pr, status);
3414 }
3415
3416 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3417 {
3418 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3419 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3420 return bfloat16_round_pack_canonical(pr, status);
3421 }
3422
3423 /*
3424 * Square Root
3425 *
3426 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3428 * square root by iterating down from the implicit bit to enough extra
3429 * bits to ensure we get a correctly rounded result.
3430 *
3431 * This does mean however the calculation is slower than before,
3432 * especially for 64 bit floats.
3433 */
3434
/*
 * Square root of a decomposed value, computed bit by bit down to
 * enough extra precision (3 bits below the format's fraction) that the
 * caller's rounding step produces a correctly rounded result.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* Negative non-zero input: invalid operation, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        /* Classic restoring sqrt step: r_frac accumulates result bits,
         * s_frac tracks 2*r_frac, a_frac holds the remainder (shifted
         * left each iteration instead of shifting the trial term right).
         */
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3493
3494 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3495 {
3496 FloatParts64 pa = float16_unpack_canonical(a, status);
3497 FloatParts64 pr = sqrt_float(pa, status, &float16_params);
3498 return float16_round_pack_canonical(pr, status);
3499 }
3500
3501 static float32 QEMU_SOFTFLOAT_ATTR
3502 soft_f32_sqrt(float32 a, float_status *status)
3503 {
3504 FloatParts64 pa = float32_unpack_canonical(a, status);
3505 FloatParts64 pr = sqrt_float(pa, status, &float32_params);
3506 return float32_round_pack_canonical(pr, status);
3507 }
3508
3509 static float64 QEMU_SOFTFLOAT_ATTR
3510 soft_f64_sqrt(float64 a, float_status *status)
3511 {
3512 FloatParts64 pa = float64_unpack_canonical(a, status);
3513 FloatParts64 pr = sqrt_float(pa, status, &float64_params);
3514 return float64_round_pack_canonical(pr, status);
3515 }
3516
/*
 * float32 square root with a hardfloat fast path.  The host sqrtf()
 * is used only for non-negative zero or normal inputs; NaNs,
 * infinities, denormals and negative values take the softfloat path
 * so guest flags and NaN propagation stay correct.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU when that is the cheaper option. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3543
/*
 * float64 square root with a hardfloat fast path; mirrors
 * float32_sqrt above, using the host sqrt() for non-negative zero or
 * normal inputs only.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU when that is the cheaper option. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3570
3571 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3572 {
3573 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3574 FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
3575 return bfloat16_round_pack_canonical(pr, status);
3576 }
3577
3578 /*----------------------------------------------------------------------------
3579 | The pattern for a default generated NaN.
3580 *----------------------------------------------------------------------------*/
3581
3582 float16 float16_default_nan(float_status *status)
3583 {
3584 FloatParts64 p;
3585
3586 parts_default_nan(&p, status);
3587 p.frac >>= float16_params.frac_shift;
3588 return float16_pack_raw(p);
3589 }
3590
3591 float32 float32_default_nan(float_status *status)
3592 {
3593 FloatParts64 p;
3594
3595 parts_default_nan(&p, status);
3596 p.frac >>= float32_params.frac_shift;
3597 return float32_pack_raw(p);
3598 }
3599
3600 float64 float64_default_nan(float_status *status)
3601 {
3602 FloatParts64 p;
3603
3604 parts_default_nan(&p, status);
3605 p.frac >>= float64_params.frac_shift;
3606 return float64_pack_raw(p);
3607 }
3608
/* Return the canonical default NaN bit pattern for float128. */
float128 float128_default_nan(float_status *status)
{
    FloatParts64 p;
    float128 r;

    parts_default_nan(&p, status);
    /* Extrapolate from the choices made by parts_default_nan to fill
     * in the quad-floating format. If the low bit is set, assume we
     * want to set all non-snan bits.
     */
    r.low = -(p.frac & 1);                /* all-ones if low bit set, else 0 */
    /* Keep the top 48 fraction bits (including the quiet bit). */
    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
    r.high |= UINT64_C(0x7FFF000000000000);   /* all-ones exponent */
    r.high |= (uint64_t)p.sign << 63;

    return r;
}
3626
3627 bfloat16 bfloat16_default_nan(float_status *status)
3628 {
3629 FloatParts64 p;
3630
3631 parts_default_nan(&p, status);
3632 p.frac >>= bfloat16_params.frac_shift;
3633 return bfloat16_pack_raw(p);
3634 }
3635
3636 /*----------------------------------------------------------------------------
3637 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3638 *----------------------------------------------------------------------------*/
3639
3640 float16 float16_silence_nan(float16 a, float_status *status)
3641 {
3642 FloatParts64 p = float16_unpack_raw(a);
3643 p.frac <<= float16_params.frac_shift;
3644 p = parts_silence_nan(p, status);
3645 p.frac >>= float16_params.frac_shift;
3646 return float16_pack_raw(p);
3647 }
3648
3649 float32 float32_silence_nan(float32 a, float_status *status)
3650 {
3651 FloatParts64 p = float32_unpack_raw(a);
3652 p.frac <<= float32_params.frac_shift;
3653 p = parts_silence_nan(p, status);
3654 p.frac >>= float32_params.frac_shift;
3655 return float32_pack_raw(p);
3656 }
3657
3658 float64 float64_silence_nan(float64 a, float_status *status)
3659 {
3660 FloatParts64 p = float64_unpack_raw(a);
3661 p.frac <<= float64_params.frac_shift;
3662 p = parts_silence_nan(p, status);
3663 p.frac >>= float64_params.frac_shift;
3664 return float64_pack_raw(p);
3665 }
3666
3667 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3668 {
3669 FloatParts64 p = bfloat16_unpack_raw(a);
3670 p.frac <<= bfloat16_params.frac_shift;
3671 p = parts_silence_nan(p, status);
3672 p.frac >>= bfloat16_params.frac_shift;
3673 return bfloat16_pack_raw(p);
3674 }
3675
3676 /*----------------------------------------------------------------------------
3677 | If `a' is denormal and we are in flush-to-zero mode then set the
3678 | input-denormal exception and return zero. Otherwise just return the value.
3679 *----------------------------------------------------------------------------*/
3680
3681 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3682 {
3683 if (p.exp == 0 && p.frac != 0) {
3684 float_raise(float_flag_input_denormal, status);
3685 return true;
3686 }
3687
3688 return false;
3689 }
3690
3691 float16 float16_squash_input_denormal(float16 a, float_status *status)
3692 {
3693 if (status->flush_inputs_to_zero) {
3694 FloatParts64 p = float16_unpack_raw(a);
3695 if (parts_squash_denormal(p, status)) {
3696 return float16_set_sign(float16_zero, p.sign);
3697 }
3698 }
3699 return a;
3700 }
3701
3702 float32 float32_squash_input_denormal(float32 a, float_status *status)
3703 {
3704 if (status->flush_inputs_to_zero) {
3705 FloatParts64 p = float32_unpack_raw(a);
3706 if (parts_squash_denormal(p, status)) {
3707 return float32_set_sign(float32_zero, p.sign);
3708 }
3709 }
3710 return a;
3711 }
3712
3713 float64 float64_squash_input_denormal(float64 a, float_status *status)
3714 {
3715 if (status->flush_inputs_to_zero) {
3716 FloatParts64 p = float64_unpack_raw(a);
3717 if (parts_squash_denormal(p, status)) {
3718 return float64_set_sign(float64_zero, p.sign);
3719 }
3720 }
3721 return a;
3722 }
3723
3724 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3725 {
3726 if (status->flush_inputs_to_zero) {
3727 FloatParts64 p = bfloat16_unpack_raw(a);
3728 if (parts_squash_denormal(p, status)) {
3729 return bfloat16_set_sign(bfloat16_zero, p.sign);
3730 }
3731 }
3732 return a;
3733 }
3734
3735 /*----------------------------------------------------------------------------
3736 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3737 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3738 | input. If `zSign' is 1, the input is negated before being converted to an
3739 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3740 | is simply rounded to an integer, with the inexact exception raised if the
3741 | input cannot be represented exactly as an integer. However, if the fixed-
3742 | point input is too large, the invalid exception is raised and the largest
3743 | positive or negative integer is returned.
3744 *----------------------------------------------------------------------------*/
3745
/*
 * Round the fixed-point value @absZ (binary point between bits 6 and 7)
 * to int32 per the current rounding mode, negating first if @zSign.
 * Raises invalid and returns INT32_MIN/INT32_MAX on overflow, inexact
 * if any fraction bits were discarded.
 */
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the pre-shift increment that implements each mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result's low bit would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie in nearest-even mode: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if bits above 32 survive or the sign flipped. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3793
3794 /*----------------------------------------------------------------------------
3795 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3796 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3797 | and returns the properly rounded 64-bit integer corresponding to the input.
3798 | If `zSign' is 1, the input is negated before being converted to an integer.
3799 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3800 | the inexact exception raised if the input cannot be represented exactly as
3801 | an integer. However, if the fixed-point input is too large, the invalid
3802 | exception is raised and the largest positive or negative integer is
3803 | returned.
3804 *----------------------------------------------------------------------------*/
3805
/*
 * Round the 128-bit fixed-point value absZ0:absZ1 (binary point between
 * the words) to int64 per the current rounding mode, negating first if
 * @zSign.  Raises invalid and returns INT64_MIN/INT64_MAX on overflow,
 * inexact if the discarded low word was non-zero.
 */
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether the discarded low word rounds absZ0 up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);   /* top bit of absZ1 set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;     /* carried out of 64 bits */
        /* Exact tie in nearest-even mode: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Overflow if the sign of the result disagrees with zSign. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3855
3856 /*----------------------------------------------------------------------------
3857 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3858 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3859 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3860 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3861 | with the inexact exception raised if the input cannot be represented exactly
3862 | as an integer. However, if the fixed-point input is too large, the invalid
3863 | exception is raised and the largest unsigned integer is returned.
3864 *----------------------------------------------------------------------------*/
3865
/*
 * Round the 128-bit fixed-point value absZ0:absZ1 (binary point between
 * the words) to uint64 per the current rounding mode.  A negative input
 * (@zSign with non-zero magnitude) is invalid and returns 0; overflow
 * is invalid and returns UINT64_MAX.
 *
 * NOTE(review): the declared return type is int64_t although the value
 * produced is the unsigned absZ0 — presumably historical; callers appear
 * to reinterpret it as unsigned.  Confirm before changing.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* Decide whether the discarded low word rounds absZ0 up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);    /* top bit of absZ1 set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Carried out of 64 bits: unsigned overflow. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie in nearest-even mode: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* Negative non-zero values cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
3915
3916 /*----------------------------------------------------------------------------
3917 | Normalizes the subnormal single-precision floating-point value represented
3918 | by the denormalized significand `aSig'. The normalized exponent and
3919 | significand are stored at the locations pointed to by `zExpPtr' and
3920 | `zSigPtr', respectively.
3921 *----------------------------------------------------------------------------*/
3922
3923 static void
3924 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3925 {
3926 int8_t shiftCount;
3927
3928 shiftCount = clz32(aSig) - 8;
3929 *zSigPtr = aSig<<shiftCount;
3930 *zExpPtr = 1 - shiftCount;
3931
3932 }
3933
3934 /*----------------------------------------------------------------------------
3935 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3936 | and significand `zSig', and returns the proper single-precision floating-
3937 | point value corresponding to the abstract input. Ordinarily, the abstract
3938 | value is simply rounded and packed into the single-precision format, with
3939 | the inexact exception raised if the abstract input cannot be represented
3940 | exactly. However, if the abstract value is too large, the overflow and
3941 | inexact exceptions are raised and an infinity or maximal finite value is
3942 | returned. If the abstract value is too small, the input value is rounded to
3943 | a subnormal number, and the underflow and inexact exceptions are raised if
3944 | the abstract input cannot be represented exactly as a subnormal single-
3945 | precision floating-point number.
3946 | The input significand `zSig' has its binary point between bits 30
3947 | and 29, which is 7 bits to the left of the usual location. This shifted
3948 | significand must be normalized or smaller. If `zSig' is not normalized,
3949 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3950 | and it must not require rounding. In the usual case that `zSig' is
3951 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3952 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pre-shift increment implementing the rounding mode; the low
     * 7 bits of zSig are the bits about to be discarded.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result's low bit would be even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Exponent out of the normal range (negative values wrap high in
     * the uint16_t cast, so this catches both overflow and underflow).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: to infinity, or to the maximal finite value
             * when the mode rounds toward zero (or to-odd).
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie in nearest-even mode: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4032
4033 /*----------------------------------------------------------------------------
4034 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4035 | and significand `zSig', and returns the proper single-precision floating-
4036 | point value corresponding to the abstract input. This routine is just like
4037 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4038 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4039 | floating-point exponent.
4040 *----------------------------------------------------------------------------*/
4041
4042 static float32
4043 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4044 float_status *status)
4045 {
4046 int8_t shiftCount;
4047
4048 shiftCount = clz32(zSig) - 1;
4049 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4050 status);
4051
4052 }
4053
4054 /*----------------------------------------------------------------------------
4055 | Normalizes the subnormal double-precision floating-point value represented
4056 | by the denormalized significand `aSig'. The normalized exponent and
4057 | significand are stored at the locations pointed to by `zExpPtr' and
4058 | `zSigPtr', respectively.
4059 *----------------------------------------------------------------------------*/
4060
4061 static void
4062 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4063 {
4064 int8_t shiftCount;
4065
4066 shiftCount = clz64(aSig) - 11;
4067 *zSigPtr = aSig<<shiftCount;
4068 *zExpPtr = 1 - shiftCount;
4069
4070 }
4071
4072 /*----------------------------------------------------------------------------
4073 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4074 | double-precision floating-point value, returning the result. After being
4075 | shifted into the proper positions, the three fields are simply added
4076 | together to form the result. This means that any integer portion of `zSig'
4077 | will be added into the exponent. Since a properly normalized significand
4078 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4079 | than the desired result exponent whenever `zSig' is a complete, normalized
4080 | significand.
4081 *----------------------------------------------------------------------------*/
4082
4083 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4084 {
4085
4086 return make_float64(
4087 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4088
4089 }
4090
4091 /*----------------------------------------------------------------------------
4092 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4093 | and significand `zSig', and returns the proper double-precision floating-
4094 | point value corresponding to the abstract input. Ordinarily, the abstract
4095 | value is simply rounded and packed into the double-precision format, with
4096 | the inexact exception raised if the abstract input cannot be represented
4097 | exactly. However, if the abstract value is too large, the overflow and
4098 | inexact exceptions are raised and an infinity or maximal finite value is
4099 | returned. If the abstract value is too small, the input value is rounded to
4100 | a subnormal number, and the underflow and inexact exceptions are raised if
4101 | the abstract input cannot be represented exactly as a subnormal double-
4102 | precision floating-point number.
4103 | The input significand `zSig' has its binary point between bits 62
4104 | and 61, which is 10 bits to the left of the usual location. This shifted
4105 | significand must be normalized or smaller. If `zSig' is not normalized,
4106 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4107 | and it must not require rounding. In the usual case that `zSig' is
4108 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4109 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4110 | Binary Floating-Point Arithmetic.
4111 *----------------------------------------------------------------------------*/
4112
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * The low 10 bits of zSig sit below the rounding point.  Choose the
     * value added there so the truncating shift at the end rounds in the
     * requested direction (0x200 = half-ulp for nearest modes, 0x3ff =
     * almost-ulp for directed rounding toward the result's sign).
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the ulp bit (0x400) is already even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * Unsigned compare catches both ends at once: zExp >= 0x7FD (possible
     * overflow) and negative zExp (possible underflow, huge as uint16_t).
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow: return infinity, except that round-to-odd and the
             * truncating directed modes deliver the maximal finite value
             * (packFloat64 with an all-ones fraction via -1).
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tininess is detected either before rounding (architecture
             * option) or because rounding cannot bring the value back up
             * to the smallest normal.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Add the increment, drop the 10 guard bits... */
    zSig = ( zSig + roundIncrement )>>10;
    /* ...and clear the LSB on an exact tie for round-to-nearest-even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4188
4189 /*----------------------------------------------------------------------------
4190 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4191 | and significand `zSig', and returns the proper double-precision floating-
4192 | point value corresponding to the abstract input. This routine is just like
4193 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4194 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4195 | floating-point exponent.
4196 *----------------------------------------------------------------------------*/
4197
4198 static float64
4199 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4200 float_status *status)
4201 {
4202 int8_t shiftCount;
4203
4204 shiftCount = clz64(zSig) - 1;
4205 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4206 status);
4207
4208 }
4209
4210 /*----------------------------------------------------------------------------
4211 | Normalizes the subnormal extended double-precision floating-point value
4212 | represented by the denormalized significand `aSig'. The normalized exponent
4213 | and significand are stored at the locations pointed to by `zExpPtr' and
4214 | `zSigPtr', respectively.
4215 *----------------------------------------------------------------------------*/
4216
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /*
     * floatx80 keeps its integer bit explicit in bit 63, so a subnormal
     * significand is normalized by shifting its leading 1 all the way to
     * the top; the exponent compensates for the shift.
     */
    int shift = clz64(aSig);

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4226
4227 /*----------------------------------------------------------------------------
4228 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4229 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4230 | and returns the proper extended double-precision floating-point value
4231 | corresponding to the abstract input. Ordinarily, the abstract value is
4232 | rounded and packed into the extended double-precision format, with the
4233 | inexact exception raised if the abstract input cannot be represented
4234 | exactly. However, if the abstract value is too large, the overflow and
4235 | inexact exceptions are raised and an infinity or maximal finite value is
4236 | returned. If the abstract value is too small, the input value is rounded to
4237 | a subnormal number, and the underflow and inexact exceptions are raised if
4238 | the abstract input cannot be represented exactly as a subnormal extended
4239 | double-precision floating-point number.
4240 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4241 | number of bits as single or double precision, respectively. Otherwise, the
4242 | result is rounded to the full precision of the extended double-precision
4243 | format.
4244 | The input significand must be normalized or smaller. If the input
4245 | significand is not normalized, `zExp' must be 0; in that case, the result
4246 | returned is a subnormal number, and it must not require rounding. The
4247 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4250
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced-precision path (x87 precision control): round within the
     * 64-bit zSig0 at the single- or double-precision bit position.
     * roundIncrement is the half-ulp for nearest modes; roundMask covers
     * the bits that will be discarded.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into zSig0's sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Single unsigned compare catches zExp > 0x7FFE and zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny if detected before rounding, already below the
             * denormal boundary, or if the increment does not carry out
             * of zSig0 (no promotion back to normal).
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Carry into bit 63 means the result became normal again. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: also clear the new LSB on an exact half. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Unsigned wrap-around means the significand carried out: renormalize. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full 80-bit path: zSig1 holds the bits below the rounding point;
     * decide whether they round zSig0 up by one ulp.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            /* Shared by both paths; roundMask selects the max-finite sig. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* The shift changed zSig1, so re-derive the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie: force the LSB even for nearest-even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand overflowed one ulp past all-ones: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4438
4439 /*----------------------------------------------------------------------------
4440 | Takes an abstract floating-point value having sign `zSign', exponent
4441 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4442 | and returns the proper extended double-precision floating-point value
4443 | corresponding to the abstract input. This routine is just like
4444 | `roundAndPackFloatx80' except that the input significand does not have to be
4445 | normalized.
4446 *----------------------------------------------------------------------------*/
4447
4448 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4449 bool zSign, int32_t zExp,
4450 uint64_t zSig0, uint64_t zSig1,
4451 float_status *status)
4452 {
4453 int8_t shiftCount;
4454
4455 if ( zSig0 == 0 ) {
4456 zSig0 = zSig1;
4457 zSig1 = 0;
4458 zExp -= 64;
4459 }
4460 shiftCount = clz64(zSig0);
4461 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4462 zExp -= shiftCount;
4463 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4464 zSig0, zSig1, status);
4465
4466 }
4467
4468 /*----------------------------------------------------------------------------
4469 | Returns the least-significant 64 fraction bits of the quadruple-precision
4470 | floating-point value `a'.
4471 *----------------------------------------------------------------------------*/
4472
4473 static inline uint64_t extractFloat128Frac1( float128 a )
4474 {
4475
4476 return a.low;
4477
4478 }
4479
4480 /*----------------------------------------------------------------------------
4481 | Returns the most-significant 48 fraction bits of the quadruple-precision
4482 | floating-point value `a'.
4483 *----------------------------------------------------------------------------*/
4484
4485 static inline uint64_t extractFloat128Frac0( float128 a )
4486 {
4487
4488 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4489
4490 }
4491
4492 /*----------------------------------------------------------------------------
4493 | Returns the exponent bits of the quadruple-precision floating-point value
4494 | `a'.
4495 *----------------------------------------------------------------------------*/
4496
4497 static inline int32_t extractFloat128Exp( float128 a )
4498 {
4499
4500 return ( a.high>>48 ) & 0x7FFF;
4501
4502 }
4503
4504 /*----------------------------------------------------------------------------
4505 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4506 *----------------------------------------------------------------------------*/
4507
4508 static inline bool extractFloat128Sign(float128 a)
4509 {
4510 return a.high >> 63;
4511 }
4512
4513 /*----------------------------------------------------------------------------
4514 | Normalizes the subnormal quadruple-precision floating-point value
4515 | represented by the denormalized significand formed by the concatenation of
4516 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4517 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4518 | significand are stored at the location pointed to by `zSig0Ptr', and the
4519 | least significant 64 bits of the normalized significand are stored at the
4520 | location pointed to by `zSig1Ptr'.
4521 *----------------------------------------------------------------------------*/
4522
static void
normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1, int32_t *zExpPtr,
                           uint64_t *zSig0Ptr, uint64_t *zSig1Ptr)
{
    int shift;

    if (aSig0) {
        /*
         * Leading 1 is in the high word: shift it up into the integer-bit
         * position (bit 48 of zSig0; clz64 - 15 places it there).
         */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }
    /*
     * High word is zero: normalize from the low word alone.  A negative
     * shift means the leading 1 of aSig1 is above bit 48, so part of
     * aSig1 lands in zSig0 and the remainder stays in zSig1.
     */
    shift = clz64(aSig1) - 15;
    if (shift < 0) {
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
4553
4554 /*----------------------------------------------------------------------------
4555 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4556 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4557 | floating-point value, returning the result. After being shifted into the
4558 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4559 | added together to form the most significant 32 bits of the result. This
4560 | means that any integer portion of `zSig0' will be added into the exponent.
4561 | Since a properly normalized significand will have an integer portion equal
4562 | to 1, the `zExp' input should be 1 less than the desired result exponent
4563 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4564 | significand.
4565 *----------------------------------------------------------------------------*/
4566
4567 static inline float128
4568 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4569 {
4570 float128 z;
4571
4572 z.low = zSig1;
4573 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4574 return z;
4575 }
4576
4577 /*----------------------------------------------------------------------------
4578 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4579 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4580 | and `zSig2', and returns the proper quadruple-precision floating-point value
4581 | corresponding to the abstract input. Ordinarily, the abstract value is
4582 | simply rounded and packed into the quadruple-precision format, with the
4583 | inexact exception raised if the abstract input cannot be represented
4584 | exactly. However, if the abstract value is too large, the overflow and
4585 | inexact exceptions are raised and an infinity or maximal finite value is
4586 | returned. If the abstract value is too small, the input value is rounded to
4587 | a subnormal number, and the underflow and inexact exceptions are raised if
4588 | the abstract input cannot be represented exactly as a subnormal quadruple-
4589 | precision floating-point number.
4590 | The input significand must be normalized or smaller. If the input
4591 | significand is not normalized, `zExp' must be 0; in that case, the result
4592 | returned is a subnormal number, and it must not require rounding. In the
4593 | usual case that the input significand is normalized, `zExp' must be 1 less
4594 | than the ``true'' floating-point exponent. The handling of underflow and
4595 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4597
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the rounding point; decide whether they
     * bump the 128-bit significand zSig0:zSig1 up by one ulp.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the result's LSB is currently even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches zExp >= 0x7FFD and negative zExp at once. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /*
             * Overflow: truncating modes (and round-to-odd) deliver the
             * maximal finite value; everything else gives infinity.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /*
             * Tiny if detected before rounding, clearly below the
             * denormal boundary, or if rounding cannot lift the value
             * back up to the smallest normal.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so re-derive the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie: force the LSB even for round-to-nearest-even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4709
4710 /*----------------------------------------------------------------------------
4711 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4712 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4713 | returns the proper quadruple-precision floating-point value corresponding
4714 | to the abstract input. This routine is just like `roundAndPackFloat128'
4715 | except that the input significand has fewer bits and does not have to be
4716 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4717 | point exponent.
4718 *----------------------------------------------------------------------------*/
4719
4720 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4721 uint64_t zSig0, uint64_t zSig1,
4722 float_status *status)
4723 {
4724 int8_t shiftCount;
4725 uint64_t zSig2;
4726
4727 if ( zSig0 == 0 ) {
4728 zSig0 = zSig1;
4729 zSig1 = 0;
4730 zExp -= 64;
4731 }
4732 shiftCount = clz64(zSig0) - 15;
4733 if ( 0 <= shiftCount ) {
4734 zSig2 = 0;
4735 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4736 }
4737 else {
4738 shift128ExtraRightJamming(
4739 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4740 }
4741 zExp -= shiftCount;
4742 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4743
4744 }
4745
4746
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of converting the 32-bit two's complement integer `a'
4749 | to the extended double-precision floating-point format. The conversion
4750 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4751 | Arithmetic.
4752 *----------------------------------------------------------------------------*/
4753
4754 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4755 {
4756 bool zSign;
4757 uint32_t absA;
4758 int8_t shiftCount;
4759 uint64_t zSig;
4760
4761 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4762 zSign = ( a < 0 );
4763 absA = zSign ? - a : a;
4764 shiftCount = clz32(absA) + 32;
4765 zSig = absA;
4766 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4767
4768 }
4769
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the 32-bit two's complement integer `a' to
4772 | the quadruple-precision floating-point format. The conversion is performed
4773 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4774 *----------------------------------------------------------------------------*/
4775
4776 float128 int32_to_float128(int32_t a, float_status *status)
4777 {
4778 bool zSign;
4779 uint32_t absA;
4780 int8_t shiftCount;
4781 uint64_t zSig0;
4782
4783 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4784 zSign = ( a < 0 );
4785 absA = zSign ? - a : a;
4786 shiftCount = clz32(absA) + 17;
4787 zSig0 = absA;
4788 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4789
4790 }
4791
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of converting the 64-bit two's complement integer `a'
4794 | to the extended double-precision floating-point format. The conversion
4795 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4796 | Arithmetic.
4797 *----------------------------------------------------------------------------*/
4798
4799 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4800 {
4801 bool zSign;
4802 uint64_t absA;
4803 int8_t shiftCount;
4804
4805 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4806 zSign = ( a < 0 );
4807 absA = zSign ? - a : a;
4808 shiftCount = clz64(absA);
4809 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4810
4811 }
4812
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the 64-bit two's complement integer `a' to
4815 | the quadruple-precision floating-point format. The conversion is performed
4816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4818
4819 float128 int64_to_float128(int64_t a, float_status *status)
4820 {
4821 bool zSign;
4822 uint64_t absA;
4823 int8_t shiftCount;
4824 int32_t zExp;
4825 uint64_t zSig0, zSig1;
4826
4827 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4828 zSign = ( a < 0 );
4829 absA = zSign ? - a : a;
4830 shiftCount = clz64(absA) + 49;
4831 zExp = 0x406E - shiftCount;
4832 if ( 64 <= shiftCount ) {
4833 zSig1 = 0;
4834 zSig0 = absA;
4835 shiftCount -= 64;
4836 }
4837 else {
4838 zSig1 = absA;
4839 zSig0 = 0;
4840 }
4841 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4842 return packFloat128( zSign, zExp, zSig0, zSig1 );
4843
4844 }
4845
4846 /*----------------------------------------------------------------------------
4847 | Returns the result of converting the 64-bit unsigned integer `a'
4848 | to the quadruple-precision floating-point format. The conversion is performed
4849 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4850 *----------------------------------------------------------------------------*/
4851
4852 float128 uint64_to_float128(uint64_t a, float_status *status)
4853 {
4854 if (a == 0) {
4855 return float128_zero;
4856 }
4857 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4858 }
4859
4860 /*----------------------------------------------------------------------------
4861 | Returns the result of converting the single-precision floating-point value
4862 | `a' to the extended double-precision floating-point format. The conversion
4863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4864 | Arithmetic.
4865 *----------------------------------------------------------------------------*/
4866
4867 floatx80 float32_to_floatx80(float32 a, float_status *status)
4868 {
4869 bool aSign;
4870 int aExp;
4871 uint32_t aSig;
4872
4873 a = float32_squash_input_denormal(a, status);
4874 aSig = extractFloat32Frac( a );
4875 aExp = extractFloat32Exp( a );
4876 aSign = extractFloat32Sign( a );
4877 if ( aExp == 0xFF ) {
4878 if (aSig) {
4879 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4880 status);
4881 return floatx80_silence_nan(res, status);
4882 }
4883 return packFloatx80(aSign,
4884 floatx80_infinity_high,
4885 floatx80_infinity_low);
4886 }
4887 if ( aExp == 0 ) {
4888 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4889 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4890 }
4891 aSig |= 0x00800000;
4892 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4893
4894 }
4895
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
4899 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4900 | Arithmetic.
4901 *----------------------------------------------------------------------------*/
4902
4903 float128 float32_to_float128(float32 a, float_status *status)
4904 {
4905 bool aSign;
4906 int aExp;
4907 uint32_t aSig;
4908
4909 a = float32_squash_input_denormal(a, status);
4910 aSig = extractFloat32Frac( a );
4911 aExp = extractFloat32Exp( a );
4912 aSign = extractFloat32Sign( a );
4913 if ( aExp == 0xFF ) {
4914 if (aSig) {
4915 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4916 }
4917 return packFloat128( aSign, 0x7FFF, 0, 0 );
4918 }
4919 if ( aExp == 0 ) {
4920 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4921 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4922 --aExp;
4923 }
4924 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4925
4926 }
4927
4928 /*----------------------------------------------------------------------------
4929 | Returns the remainder of the single-precision floating-point value `a'
4930 | with respect to the corresponding value `b'. The operation is performed
4931 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4933
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* rem(NaN, x), rem(x, NaN), rem(inf, x) are invalid / NaN cases. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* rem(x, inf) == x for finite x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the hidden integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 64/32 division step suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: iterate 62 quotient bits at a time using
         * the estimated wide division, keeping only the running partial
         * remainder in aSig64.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Bias the estimate low so the remainder stays nonnegative. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial step for the remaining 2..64 quotient bits. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Round the quotient to nearest-even by stepping past the remainder's
     * sign change, then choosing between the last two candidates.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    /* IEEE remainder takes its sign from a, flipped if we overshot. */
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5029
5030
5031
5032 /*----------------------------------------------------------------------------
5033 | Returns the binary exponential of the single-precision floating-point value
5034 | `a'. The operation is performed according to the IEC/IEEE Standard for
5035 | Binary Floating-Point Arithmetic.
5036 |
5037 | Uses the following identities:
5038 |
5039 | 1. -------------------------------------------------------------------------
5040 | x x*ln(2)
5041 | 2 = e
5042 |
5043 | 2. -------------------------------------------------------------------------
5044 | 2 3 4 5 n
5045 | x x x x x x x
5046 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5047 | 1! 2! 3! 4! 5! n!
5048 *----------------------------------------------------------------------------*/
5049
/* Maclaurin-series coefficients 1/n! for e**x (see the identity block in
 * the comment above), stored as raw double-precision bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5068
float32 float32_exp2(float32 a, float_status *status)
{
    /* Compute 2**a by evaluating e**(a*ln2) with a 15-term Maclaurin
     * series in double precision, then rounding back to single. */
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2**+Inf == +Inf, 2**-Inf == +0. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2**(+/-0) == 1.  A nonzero denormal falls through to the
         * approximation below, which handles it via float32_to_float64. */
        if (aSig == 0) return float32_one;
    }

    /* For all remaining inputs the result is inexact. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* Evaluate e**x = 1 + sum_{n>=1} x**n / n!, term by term. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5113
5114 /*----------------------------------------------------------------------------
5115 | Returns the binary log of the single-precision floating-point value `a'.
5116 | The operation is performed according to the IEC/IEEE Standard for Binary
5117 | Floating-Point Arithmetic.
5118 *----------------------------------------------------------------------------*/
5119 float32 float32_log2(float32 a, float_status *status)
5120 {
5121 bool aSign, zSign;
5122 int aExp;
5123 uint32_t aSig, zSig, i;
5124
5125 a = float32_squash_input_denormal(a, status);
5126 aSig = extractFloat32Frac( a );
5127 aExp = extractFloat32Exp( a );
5128 aSign = extractFloat32Sign( a );
5129
5130 if ( aExp == 0 ) {
5131 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5132 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5133 }
5134 if ( aSign ) {
5135 float_raise(float_flag_invalid, status);
5136 return float32_default_nan(status);
5137 }
5138 if ( aExp == 0xFF ) {
5139 if (aSig) {
5140 return propagateFloat32NaN(a, float32_zero, status);
5141 }
5142 return a;
5143 }
5144
5145 aExp -= 0x7F;
5146 aSig |= 0x00800000;
5147 zSign = aExp < 0;
5148 zSig = aExp << 23;
5149
5150 for (i = 1 << 22; i > 0; i >>= 1) {
5151 aSig = ( (uint64_t)aSig * aSig ) >> 23;
5152 if ( aSig & 0x01000000 ) {
5153 aSig >>= 1;
5154 zSig |= i;
5155 }
5156 }
5157
5158 if ( zSign )
5159 zSig = -zSig;
5160
5161 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5162 }
5163
5164 /*----------------------------------------------------------------------------
5165 | Returns the result of converting the double-precision floating-point value
5166 | `a' to the extended double-precision floating-point format. The conversion
5167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5168 | Arithmetic.
5169 *----------------------------------------------------------------------------*/
5170
5171 floatx80 float64_to_floatx80(float64 a, float_status *status)
5172 {
5173 bool aSign;
5174 int aExp;
5175 uint64_t aSig;
5176
5177 a = float64_squash_input_denormal(a, status);
5178 aSig = extractFloat64Frac( a );
5179 aExp = extractFloat64Exp( a );
5180 aSign = extractFloat64Sign( a );
5181 if ( aExp == 0x7FF ) {
5182 if (aSig) {
5183 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5184 status);
5185 return floatx80_silence_nan(res, status);
5186 }
5187 return packFloatx80(aSign,
5188 floatx80_infinity_high,
5189 floatx80_infinity_low);
5190 }
5191 if ( aExp == 0 ) {
5192 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5193 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5194 }
5195 return
5196 packFloatx80(
5197 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5198
5199 }
5200
5201 /*----------------------------------------------------------------------------
5202 | Returns the result of converting the double-precision floating-point value
5203 | `a' to the quadruple-precision floating-point format. The conversion is
5204 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5205 | Arithmetic.
5206 *----------------------------------------------------------------------------*/
5207
5208 float128 float64_to_float128(float64 a, float_status *status)
5209 {
5210 bool aSign;
5211 int aExp;
5212 uint64_t aSig, zSig0, zSig1;
5213
5214 a = float64_squash_input_denormal(a, status);
5215 aSig = extractFloat64Frac( a );
5216 aExp = extractFloat64Exp( a );
5217 aSign = extractFloat64Sign( a );
5218 if ( aExp == 0x7FF ) {
5219 if (aSig) {
5220 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5221 }
5222 return packFloat128( aSign, 0x7FFF, 0, 0 );
5223 }
5224 if ( aExp == 0 ) {
5225 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5226 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5227 --aExp;
5228 }
5229 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5230 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5231
5232 }
5233
5234
5235 /*----------------------------------------------------------------------------
5236 | Returns the remainder of the double-precision floating-point value `a'
5237 | with respect to the corresponding value `b'. The operation is performed
5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 *----------------------------------------------------------------------------*/
5240
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    /* IEEE remainder a REM b: a - n*b, where n is the integer nearest
     * the exact quotient a/b (ties to the even n).  NaNs propagate;
     * Inf REM x and x REM 0 raise invalid and return the default NaN;
     * x REM Inf == x. */
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* `a' is NaN or infinity. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Inf REM anything is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;   /* finite REM Inf == the finite operand, exactly */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x REM 0 is an invalid operation. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;   /* 0 REM x == 0, sign preserved */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-align. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;   /* |a| < |b|/2: remainder is a */
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce with estimated quotients, retiring up to 62 bits per pass;
     * the estimate is biased low so the partial remainder stays
     * nonnegative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Final correction: step the quotient until the remainder goes
     * negative, then choose between the last two candidates so the
     * result is nearest zero (ties go to the even quotient). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5322
5323 /*----------------------------------------------------------------------------
5324 | Returns the binary log of the double-precision floating-point value `a'.
5325 | The operation is performed according to the IEC/IEEE Standard for Binary
5326 | Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    /* Binary logarithm, digit by digit: with the significand held as a
     * fixed-point value m in [1, 2), squaring m yields one fraction bit
     * of log2(m) per iteration; the unbiased exponent supplies the
     * integer part of the result. */
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );  /* log2(0) = -Inf */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;   /* log2(+Inf) = +Inf */
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);   /* explicit integer bit */
    zSign = aExp < 0;
    /* Two's-complement fixed-point accumulator: integer part in the top
     * bits, fraction bits filled in below by the loop. */
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            /* Square reached [2, 4): emit a 1 bit and renormalize. */
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5371
5372 /*----------------------------------------------------------------------------
5373 | Returns the result of converting the extended double-precision floating-
5374 | point value `a' to the 32-bit two's complement integer format. The
5375 | conversion is performed according to the IEC/IEEE Standard for Binary
5376 | Floating-Point Arithmetic---which means in particular that the conversion
5377 | is rounded according to the current rounding mode. If `a' is a NaN, the
5378 | largest positive integer is returned. Otherwise, if the conversion
5379 | overflows, the largest integer with the same sign as `a' is returned.
5380 *----------------------------------------------------------------------------*/
5381
5382 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5383 {
5384 bool aSign;
5385 int32_t aExp, shiftCount;
5386 uint64_t aSig;
5387
5388 if (floatx80_invalid_encoding(a)) {
5389 float_raise(float_flag_invalid, status);
5390 return 1 << 31;
5391 }
5392 aSig = extractFloatx80Frac( a );
5393 aExp = extractFloatx80Exp( a );
5394 aSign = extractFloatx80Sign( a );
5395 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5396 shiftCount = 0x4037 - aExp;
5397 if ( shiftCount <= 0 ) shiftCount = 1;
5398 shift64RightJamming( aSig, shiftCount, &aSig );
5399 return roundAndPackInt32(aSign, aSig, status);
5400
5401 }
5402
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of converting the extended double-precision floating-
5405 | point value `a' to the 32-bit two's complement integer format. The
5406 | conversion is performed according to the IEC/IEEE Standard for Binary
5407 | Floating-Point Arithmetic, except that the conversion is always rounded
5408 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5409 | Otherwise, if the conversion overflows, the largest integer with the same
5410 | sign as `a' is returned.
5411 *----------------------------------------------------------------------------*/
5412
5413 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5414 {
5415 bool aSign;
5416 int32_t aExp, shiftCount;
5417 uint64_t aSig, savedASig;
5418 int32_t z;
5419
5420 if (floatx80_invalid_encoding(a)) {
5421 float_raise(float_flag_invalid, status);
5422 return 1 << 31;
5423 }
5424 aSig = extractFloatx80Frac( a );
5425 aExp = extractFloatx80Exp( a );
5426 aSign = extractFloatx80Sign( a );
5427 if ( 0x401E < aExp ) {
5428 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5429 goto invalid;
5430 }
5431 else if ( aExp < 0x3FFF ) {
5432 if (aExp || aSig) {
5433 float_raise(float_flag_inexact, status);
5434 }
5435 return 0;
5436 }
5437 shiftCount = 0x403E - aExp;
5438 savedASig = aSig;
5439 aSig >>= shiftCount;
5440 z = aSig;
5441 if ( aSign ) z = - z;
5442 if ( ( z < 0 ) ^ aSign ) {
5443 invalid:
5444 float_raise(float_flag_invalid, status);
5445 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5446 }
5447 if ( ( aSig<<shiftCount ) != savedASig ) {
5448 float_raise(float_flag_inexact, status);
5449 }
5450 return z;
5451
5452 }
5453
5454 /*----------------------------------------------------------------------------
5455 | Returns the result of converting the extended double-precision floating-
5456 | point value `a' to the 64-bit two's complement integer format. The
5457 | conversion is performed according to the IEC/IEEE Standard for Binary
5458 | Floating-Point Arithmetic---which means in particular that the conversion
5459 | is rounded according to the current rounding mode. If `a' is a NaN,
5460 | the largest positive integer is returned. Otherwise, if the conversion
5461 | overflows, the largest integer with the same sign as `a' is returned.
5462 *----------------------------------------------------------------------------*/
5463
5464 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5465 {
5466 bool aSign;
5467 int32_t aExp, shiftCount;
5468 uint64_t aSig, aSigExtra;
5469
5470 if (floatx80_invalid_encoding(a)) {
5471 float_raise(float_flag_invalid, status);
5472 return 1ULL << 63;
5473 }
5474 aSig = extractFloatx80Frac( a );
5475 aExp = extractFloatx80Exp( a );
5476 aSign = extractFloatx80Sign( a );
5477 shiftCount = 0x403E - aExp;
5478 if ( shiftCount <= 0 ) {
5479 if ( shiftCount ) {
5480 float_raise(float_flag_invalid, status);
5481 if (!aSign || floatx80_is_any_nan(a)) {
5482 return INT64_MAX;
5483 }
5484 return INT64_MIN;
5485 }
5486 aSigExtra = 0;
5487 }
5488 else {
5489 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5490 }
5491 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5492
5493 }
5494
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of converting the extended double-precision floating-
5497 | point value `a' to the 64-bit two's complement integer format. The
5498 | conversion is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic, except that the conversion is always rounded
5500 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5501 | Otherwise, if the conversion overflows, the largest integer with the same
5502 | sign as `a' is returned.
5503 *----------------------------------------------------------------------------*/
5504
int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    /* Convert to a 64-bit signed integer, always truncating toward
     * zero.  NaNs return the largest positive integer; overflows
     * saturate with the invalid flag raised. */
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* |a| >= 2**63.  The one representable value is exactly -2**63
         * (high == 0xC03E with all explicit fraction bits clear), which
         * falls through to INT64_MIN without raising any flag. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless `a' is exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    /* Any bits shifted out mean the conversion was inexact. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}
5544
5545 /*----------------------------------------------------------------------------
5546 | Returns the result of converting the extended double-precision floating-
5547 | point value `a' to the single-precision floating-point format. The
5548 | conversion is performed according to the IEC/IEEE Standard for Binary
5549 | Floating-Point Arithmetic.
5550 *----------------------------------------------------------------------------*/
5551
5552 float32 floatx80_to_float32(floatx80 a, float_status *status)
5553 {
5554 bool aSign;
5555 int32_t aExp;
5556 uint64_t aSig;
5557
5558 if (floatx80_invalid_encoding(a)) {
5559 float_raise(float_flag_invalid, status);
5560 return float32_default_nan(status);
5561 }
5562 aSig = extractFloatx80Frac( a );
5563 aExp = extractFloatx80Exp( a );
5564 aSign = extractFloatx80Sign( a );
5565 if ( aExp == 0x7FFF ) {
5566 if ( (uint64_t) ( aSig<<1 ) ) {
5567 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5568 status);
5569 return float32_silence_nan(res, status);
5570 }
5571 return packFloat32( aSign, 0xFF, 0 );
5572 }
5573 shift64RightJamming( aSig, 33, &aSig );
5574 if ( aExp || aSig ) aExp -= 0x3F81;
5575 return roundAndPackFloat32(aSign, aExp, aSig, status);
5576
5577 }
5578
5579 /*----------------------------------------------------------------------------
5580 | Returns the result of converting the extended double-precision floating-
5581 | point value `a' to the double-precision floating-point format. The
5582 | conversion is performed according to the IEC/IEEE Standard for Binary
5583 | Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585
5586 float64 floatx80_to_float64(floatx80 a, float_status *status)
5587 {
5588 bool aSign;
5589 int32_t aExp;
5590 uint64_t aSig, zSig;
5591
5592 if (floatx80_invalid_encoding(a)) {
5593 float_raise(float_flag_invalid, status);
5594 return float64_default_nan(status);
5595 }
5596 aSig = extractFloatx80Frac( a );
5597 aExp = extractFloatx80Exp( a );
5598 aSign = extractFloatx80Sign( a );
5599 if ( aExp == 0x7FFF ) {
5600 if ( (uint64_t) ( aSig<<1 ) ) {
5601 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5602 status);
5603 return float64_silence_nan(res, status);
5604 }
5605 return packFloat64( aSign, 0x7FF, 0 );
5606 }
5607 shift64RightJamming( aSig, 1, &zSig );
5608 if ( aExp || aSig ) aExp -= 0x3C01;
5609 return roundAndPackFloat64(aSign, aExp, zSig, status);
5610
5611 }
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the extended double-precision floating-
5615 | point value `a' to the quadruple-precision floating-point format. The
5616 | conversion is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619
5620 float128 floatx80_to_float128(floatx80 a, float_status *status)
5621 {
5622 bool aSign;
5623 int aExp;
5624 uint64_t aSig, zSig0, zSig1;
5625
5626 if (floatx80_invalid_encoding(a)) {
5627 float_raise(float_flag_invalid, status);
5628 return float128_default_nan(status);
5629 }
5630 aSig = extractFloatx80Frac( a );
5631 aExp = extractFloatx80Exp( a );
5632 aSign = extractFloatx80Sign( a );
5633 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5634 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5635 status);
5636 return float128_silence_nan(res, status);
5637 }
5638 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5639 return packFloat128( aSign, aExp, zSig0, zSig1 );
5640
5641 }
5642
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5650
5651 floatx80 floatx80_round(floatx80 a, float_status *status)
5652 {
5653 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5654 extractFloatx80Sign(a),
5655 extractFloatx80Exp(a),
5656 extractFloatx80Frac(a), 0, status);
5657 }
5658
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended quadruple-precision floating-point
5662 | value. The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5665
floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    /* Round `a' to an integral value in the extended format, honouring
     * the current rounding mode; raises inexact whenever the value
     * changes. */
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Already integral (no fraction bits remain), or NaN/Inf. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: the result is 0 or +/-1, chosen by rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Strictly above one half rounds to 1; exactly one half
             * (fraction bits clear) rounds to the even value 0. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* Anything at or above one half rounds away from zero. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2**63: round by clearing the fraction bits below the
     * units place, with a mode-dependent increment first. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* If the discarded part was exactly one half, tie to even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        /* NOTE(review): this arm uses abort() while the earlier switch
         * uses g_assert_not_reached() -- consider unifying. */
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* The significand carried all the way out: bump the exponent
         * and restore the explicit integer bit. */
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
5761
5762 /*----------------------------------------------------------------------------
5763 | Returns the result of adding the absolute values of the extended double-
5764 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5765 | negated before being returned. `zSign' is ignored if the result is a NaN.
5766 | The addition is performed according to the IEC/IEEE Standard for Binary
5767 | Floating-Point Arithmetic.
5768 *----------------------------------------------------------------------------*/
5769
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    /* Add the magnitudes of `a' and `b'; `zSign' is applied to the sum
     * (and ignored if the result is a NaN). */
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' rightwards. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;   /* Inf + finite == Inf */
        }
        /* A subnormal operand has no implicit integer bit, so it needs
         * one shift fewer to align. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' rightwards. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* The top bit set means no carry out of the 64-bit sum. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* The sum carried out: shift right one place (jamming the lost bit
     * into the extra word) and restore the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of subtracting the absolute values of the extended
5844 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5845 | difference is negated before being returned. `zSign' is ignored if the
5846 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5847 | Standard for Binary Floating-Point Arithmetic.
5848 *----------------------------------------------------------------------------*/
5849
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    /* Subtract the magnitudes of `a' and `b'; if the difference is
     * negated (|b| > |a|) the result sign `zSign' is flipped.  `zSign'
     * is ignored if the result is a NaN. */
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero's sign depends on the rounding
     * direction (negative only when rounding down). */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* A subnormal `a' has no implicit integer bit: one shift fewer. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* the difference was negated */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;   /* Inf - finite == Inf */
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5910
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of adding the extended double-precision floating-point
5913 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5914 | Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5916
5917 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5918 {
5919 bool aSign, bSign;
5920
5921 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5922 float_raise(float_flag_invalid, status);
5923 return floatx80_default_nan(status);
5924 }
5925 aSign = extractFloatx80Sign( a );
5926 bSign = extractFloatx80Sign( b );
5927 if ( aSign == bSign ) {
5928 return addFloatx80Sigs(a, b, aSign, status);
5929 }
5930 else {
5931 return subFloatx80Sigs(a, b, aSign, status);
5932 }
5933
5934 }
5935
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of subtracting the extended double-precision floating-
5938 | point values `a' and `b'. The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5941
5942 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5943 {
5944 bool aSign, bSign;
5945
5946 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5947 float_raise(float_flag_invalid, status);
5948 return floatx80_default_nan(status);
5949 }
5950 aSign = extractFloatx80Sign( a );
5951 bSign = extractFloatx80Sign( b );
5952 if ( aSign == bSign ) {
5953 return subFloatx80Sigs(a, b, aSign, status);
5954 }
5955 else {
5956 return addFloatx80Sigs(a, b, aSign, status);
5957 }
5958
5959 }
5960
5961 /*----------------------------------------------------------------------------
5962 | Returns the result of multiplying the extended double-precision floating-
5963 | point values `a' and `b'. The operation is performed according to the
5964 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5965 *----------------------------------------------------------------------------*/
5966
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Non-canonical 80-bit encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* 'a' is Inf or NaN.  NaN operands propagate; Inf * 0 is invalid. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        /* 'b' is Inf or NaN; 0 * Inf is invalid. */
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Zeros produce a signed zero; subnormals are normalized first. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 64x64 -> 128-bit significand product. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* The product of two normalized significands may need one bit of
       left-normalization (top bit of zSig0 clear). */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6022
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of dividing the extended double-precision floating-point
6025 | value `a' by the corresponding value `b'. The operation is performed
6026 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6027 *----------------------------------------------------------------------------*/
6028
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Non-canonical 80-bit encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        /* Inf / finite = Inf. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0 is invalid; any other x / 0 is divide-by-zero. */
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-shift so the dividend significand is below the divisor; this
       keeps the 64-bit quotient estimate in range. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward until the
       partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word: only refine when it is close to a rounding
       boundary, where the extra precision can matter. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Sticky bit: a nonzero final remainder makes the result inexact. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6109
6110 /*----------------------------------------------------------------------------
6111 | Returns the remainder of the extended double-precision floating-point value
6112 | `a' with respect to the corresponding value `b'. The operation is performed
6113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6114 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6115 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6116 | the absolute value of the integer quotient.
6117 *----------------------------------------------------------------------------*/
6118
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Non-canonical 80-bit encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; Inf % x is invalid. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite % Inf is the finite operand unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x % 0 is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|: for truncating mode (or |a| < |b|/2) the remainder
           is 'a' itself and the quotient is zero. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* Long division producing 62 quotient bits per iteration, with the
       integer quotient accumulated into *quotient. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* Deliberately under-estimate (the estimate can be up to 2 too
           high) so the partial remainder never goes negative. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: fewer than 64 quotient bits remain, so the
           under-estimated quotient is corrected upward by comparison. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /*
         * IEEE remainder rounds the quotient to nearest (ties to even),
         * which may select the complementary remainder and flip its sign.
         */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6239
6240 /*----------------------------------------------------------------------------
6241 | Returns the remainder of the extended double-precision floating-point value
6242 | `a' with respect to the corresponding value `b'. The operation is performed
6243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6244 *----------------------------------------------------------------------------*/
6245
6246 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6247 {
6248 uint64_t quotient;
6249 return floatx80_modrem(a, b, false, &quotient, status);
6250 }
6251
6252 /*----------------------------------------------------------------------------
6253 | Returns the remainder of the extended double-precision floating-point value
6254 | `a' with respect to the corresponding value `b', with the quotient truncated
6255 | toward zero.
6256 *----------------------------------------------------------------------------*/
6257
6258 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6259 {
6260 uint64_t quotient;
6261 return floatx80_modrem(a, b, true, &quotient, status);
6262 }
6263
6264 /*----------------------------------------------------------------------------
6265 | Returns the square root of the extended double-precision floating-point
6266 | value `a'. The operation is performed according to the IEC/IEEE Standard
6267 | for Binary Floating-Point Arithmetic.
6268 *----------------------------------------------------------------------------*/
6269
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Non-canonical 80-bit encodings are invalid operands. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial 32-bit root estimate, refined to 64 bits by dividing the
       (parity-aligned) operand by the estimate. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word downward until zSig0^2 <= operand. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word; refine only near a rounding boundary, where the
       extra precision can change the rounded result. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Sticky bit: a nonzero remainder means the root is inexact. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Assemble the full 128-bit significand of the root. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6334
6335 /*----------------------------------------------------------------------------
6336 | Returns the result of converting the quadruple-precision floating-point
6337 | value `a' to the 32-bit two's complement integer format. The conversion
6338 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6339 | Arithmetic---which means in particular that the conversion is rounded
6340 | according to the current rounding mode. If `a' is a NaN, the largest
6341 | positive integer is returned. Otherwise, if the conversion overflows, the
6342 | largest integer with the same sign as `a' is returned.
6343 *----------------------------------------------------------------------------*/
6344
6345 int32_t float128_to_int32(float128 a, float_status *status)
6346 {
6347 bool aSign;
6348 int32_t aExp, shiftCount;
6349 uint64_t aSig0, aSig1;
6350
6351 aSig1 = extractFloat128Frac1( a );
6352 aSig0 = extractFloat128Frac0( a );
6353 aExp = extractFloat128Exp( a );
6354 aSign = extractFloat128Sign( a );
6355 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6356 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6357 aSig0 |= ( aSig1 != 0 );
6358 shiftCount = 0x4028 - aExp;
6359 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6360 return roundAndPackInt32(aSign, aSig0, status);
6361
6362 }
6363
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of converting the quadruple-precision floating-point
6366 | value `a' to the 32-bit two's complement integer format. The conversion
6367 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6368 | Arithmetic, except that the conversion is always rounded toward zero. If
6369 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6370 | conversion overflows, the largest integer with the same sign as `a' is
6371 | returned.
6372 *----------------------------------------------------------------------------*/
6373
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude too large for int32; NaNs are treated as positive
           so they saturate to INT32_MAX. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);   /* explicit integer bit */
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* If the sign of the result disagrees with the operand's sign, the
       value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits lost in the truncating shift make the result inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6413
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of converting the quadruple-precision floating-point
6416 | value `a' to the 64-bit two's complement integer format. The conversion
6417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6418 | Arithmetic---which means in particular that the conversion is rounded
6419 | according to the current rounding mode. If `a' is a NaN, the largest
6420 | positive integer is returned. Otherwise, if the conversion overflows, the
6421 | largest integer with the same sign as `a' is returned.
6422 *----------------------------------------------------------------------------*/
6423
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* integer bit */
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Overflow.  Positive values and NaNs saturate to INT64_MAX;
               negative overflow (including -Inf) gives INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        /* Integer part spills into the low word: align left. */
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6456
6457 /*----------------------------------------------------------------------------
6458 | Returns the result of converting the quadruple-precision floating-point
6459 | value `a' to the 64-bit two's complement integer format. The conversion
6460 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6461 | Arithmetic, except that the conversion is always rounded toward zero.
6462 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6463 | the conversion overflows, the largest integer with the same sign as `a' is
6464 | returned.
6465 *----------------------------------------------------------------------------*/
6466
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);   /* integer bit */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* -2^63 (possibly with inexact trailing fraction bits) is the
               only value of this magnitude representable in int64. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Overflow: positive values and NaNs give INT64_MAX. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Integer part straddles both significand words: splice them. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Any discarded bits from either word make the result inexact. */
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6519
6520 /*----------------------------------------------------------------------------
6521 | Returns the result of converting the quadruple-precision floating-point value
6522 | `a' to the 64-bit unsigned integer format. The conversion is
6523 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6524 | Arithmetic---which means in particular that the conversion is rounded
6525 | according to the current rounding mode. If `a' is a NaN, the largest
6526 | positive integer is returned. If the conversion overflows, the
6527 | largest unsigned integer is returned. If 'a' is negative, the value is
6528 | rounded and zero is returned; negative values that do not round to zero
6529 | will raise the inexact exception.
6530 *----------------------------------------------------------------------------*/
6531
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative input with exponent > 0x3FFE (|a| >= 0.5) is out of range
       for an unsigned result: invalid, NaN -> UINT64_MAX, else 0. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);   /* explicit integer bit */
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Overflow (including +Inf) saturates to UINT64_MAX. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift right, jamming lost bits into the sticky position. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6566
6567 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6568 {
6569 uint64_t v;
6570 signed char current_rounding_mode = status->float_rounding_mode;
6571
6572 set_float_rounding_mode(float_round_to_zero, status);
6573 v = float128_to_uint64(a, status);
6574 set_float_rounding_mode(current_rounding_mode, status);
6575
6576 return v;
6577 }
6578
6579 /*----------------------------------------------------------------------------
6580 | Returns the result of converting the quadruple-precision floating-point
6581 | value `a' to the 32-bit unsigned integer format. The conversion
6582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6583 | Arithmetic except that the conversion is always rounded toward zero.
6584 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6585 | if the conversion overflows, the largest unsigned integer is returned.
6586 | If 'a' is negative, the value is rounded and zero is returned; negative
6587 | values that do not round to zero will raise the inexact exception.
6588 *----------------------------------------------------------------------------*/
6589
6590 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6591 {
6592 uint64_t v;
6593 uint32_t res;
6594 int old_exc_flags = get_float_exception_flags(status);
6595
6596 v = float128_to_uint64_round_to_zero(a, status);
6597 if (v > 0xffffffff) {
6598 res = 0xffffffff;
6599 } else {
6600 return v;
6601 }
6602 set_float_exception_flags(old_exc_flags, status);
6603 float_raise(float_flag_invalid, status);
6604 return res;
6605 }
6606
6607 /*----------------------------------------------------------------------------
6608 | Returns the result of converting the quadruple-precision floating-point value
6609 | `a' to the 32-bit unsigned integer format. The conversion is
6610 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6611 | Arithmetic---which means in particular that the conversion is rounded
6612 | according to the current rounding mode. If `a' is a NaN, the largest
6613 | positive integer is returned. If the conversion overflows, the
6614 | largest unsigned integer is returned. If 'a' is negative, the value is
6615 | rounded and zero is returned; negative values that do not round to zero
6616 | will raise the inexact exception.
6617 *----------------------------------------------------------------------------*/
6618
6619 uint32_t float128_to_uint32(float128 a, float_status *status)
6620 {
6621 uint64_t v;
6622 uint32_t res;
6623 int old_exc_flags = get_float_exception_flags(status);
6624
6625 v = float128_to_uint64(a, status);
6626 if (v > 0xffffffff) {
6627 res = 0xffffffff;
6628 } else {
6629 return v;
6630 }
6631 set_float_exception_flags(old_exc_flags, status);
6632 float_raise(float_flag_invalid, status);
6633 return res;
6634 }
6635
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of converting the quadruple-precision floating-point
6638 | value `a' to the single-precision floating-point format. The conversion
6639 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6640 | Arithmetic.
6641 *----------------------------------------------------------------------------*/
6642
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert via the canonical common NaN representation. */
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        /* Infinity. */
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low word into a sticky bit, then narrow the significand
       to single-precision width with a jamming shift. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        zSig |= 0x40000000;   /* make the integer bit explicit */
        aExp -= 0x3F81;       /* adjust bias for the target format */
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6670
6671 /*----------------------------------------------------------------------------
6672 | Returns the result of converting the quadruple-precision floating-point
6673 | value `a' to the double-precision floating-point format. The conversion
6674 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6675 | Arithmetic.
6676 *----------------------------------------------------------------------------*/
6677
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert via the canonical common NaN representation. */
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        /* Infinity. */
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Left-align the significand, then fold the low word into a sticky
       bit for double-precision rounding. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);   /* explicit integer bit */
        aExp -= 0x3C01;                          /* adjust bias for target */
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6703
6704 /*----------------------------------------------------------------------------
6705 | Returns the result of converting the quadruple-precision floating-point
6706 | value `a' to the extended double-precision floating-point format. The
6707 | conversion is performed according to the IEC/IEEE Standard for Binary
6708 | Floating-Point Arithmetic.
6709 *----------------------------------------------------------------------------*/
6710
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert via the common representation, then quiet it
               for the target format. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        aSig0 |= UINT64_C(0x0001000000000000);   /* explicit integer bit */
    }
    /* Left-align the significand for the extended format and round at
       the full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6741
6742 /*----------------------------------------------------------------------------
6743 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6744 | returns the result as a quadruple-precision floating-point value. The
6745 | operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6748
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Fraction bits, if any remain, live entirely in the low word. */
        if ( 0x406F <= aExp ) {
            /* Already integral (or Inf); NaN operands propagate. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                /* On a tie (round bits all cleared by the add), clear the
                   last kept bit to round to even. */
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the rounding bit is the top bit of
                   z.low and the integer lsb is the lsb of z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            /* Truncation: just mask off the round bits below. */
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is 0 or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Exactly at or above one half rounds away to +/-1. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                    aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                    aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: all fraction bits are in the high word. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Tie (round bits clear and low word zero): round to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* a.low != 0 acts as a sticky bit for the carry. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Inexact whenever rounding changed the value. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6900
6901 /*----------------------------------------------------------------------------
6902 | Returns the result of adding the absolute values of the quadruple-precision
6903 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6904 | before being returned. `zSign' is ignored if the result is a NaN.
6905 | The addition is performed according to the IEC/IEEE Standard for Binary
6906 | Floating-Point Arithmetic.
6907 *----------------------------------------------------------------------------*/
6908
/*
 * Add the magnitudes of the two quad-precision operands; `zSign' gives
 * the sign of the result (ignored if the result is a NaN).
 */
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            /* a is Inf or NaN; nonzero fraction means NaN. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b subnormal: no hidden bit, effective exponent is 1. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);  /* hidden integer bit */
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror case: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* finite + Inf = Inf with the requested sign. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;   /* Inf + Inf (same sign) = Inf */
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal (or zero): any carry out of the fraction
               lands in the exponent field of the packed result, which
               correctly yields a normal number. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* The two hidden integer bits sum to bit 49 (2 * 2^48). */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the hidden bit: already normalized. */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6991
6992 /*----------------------------------------------------------------------------
6993 | Returns the result of subtracting the absolute values of the quadruple-
6994 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6995 | difference is negated before being returned. `zSign' is ignored if the
6996 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6997 | Standard for Binary Floating-Point Arithmetic.
6998 *----------------------------------------------------------------------------*/
6999
/*
 * Subtract the magnitudes of the two quad-precision operands; `zSign'
 * is the sign of `a' (flipped below if |b| > |a|), ignored for NaNs.
 */
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 bits for extra precision;
       undone by the "zExp - 14" in the final normalize-and-round call. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Subnormals: no hidden bit, effective exponent is 1. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exactly equal: zero result, negative only under round-down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite - Inf = Inf with flipped sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Hidden bit, at bit 62 after the 14-bit pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* |b| > |a| flips the result's sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7079
7080 /*----------------------------------------------------------------------------
7081 | Returns the result of adding the quadruple-precision floating-point values
7082 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7083 | for Binary Floating-Point Arithmetic.
7084 *----------------------------------------------------------------------------*/
7085
7086 float128 float128_add(float128 a, float128 b, float_status *status)
7087 {
7088 bool aSign, bSign;
7089
7090 aSign = extractFloat128Sign( a );
7091 bSign = extractFloat128Sign( b );
7092 if ( aSign == bSign ) {
7093 return addFloat128Sigs(a, b, aSign, status);
7094 }
7095 else {
7096 return subFloat128Sigs(a, b, aSign, status);
7097 }
7098
7099 }
7100
7101 /*----------------------------------------------------------------------------
7102 | Returns the result of subtracting the quadruple-precision floating-point
7103 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7104 | Standard for Binary Floating-Point Arithmetic.
7105 *----------------------------------------------------------------------------*/
7106
7107 float128 float128_sub(float128 a, float128 b, float_status *status)
7108 {
7109 bool aSign, bSign;
7110
7111 aSign = extractFloat128Sign( a );
7112 bSign = extractFloat128Sign( b );
7113 if ( aSign == bSign ) {
7114 return subFloat128Sigs(a, b, aSign, status);
7115 }
7116 else {
7117 return addFloat128Sigs(a, b, aSign, status);
7118 }
7119
7120 }
7121
7122 /*----------------------------------------------------------------------------
7123 | Returns the result of multiplying the quadruple-precision floating-point
7124 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7125 | Standard for Binary Floating-Point Arithmetic.
7126 *----------------------------------------------------------------------------*/
7127
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;   /* Inf * 0 */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* 0 * Inf is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);   /* restore a's hidden bit */
    /* Shift b left 16 so the 256-bit product's high bits align for
       rounding; b's hidden bit moves to bit 64 and is handled below. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* b's hidden bit (now bit 64) contributes aSig << 64 to the
       product: add aSig into the top 128 bits. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );   /* sticky bit from the lowest 64 bits */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product carried past the hidden-bit position: renormalize. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7184
7185 /*----------------------------------------------------------------------------
7186 | Returns the result of dividing the quadruple-precision floating-point value
7187 | `a' by the corresponding value `b'. The operation is performed according to
7188 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7189 *----------------------------------------------------------------------------*/
7190
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            goto invalid;   /* Inf / Inf */
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );   /* Inf / finite */
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );   /* finite / Inf = 0 */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / 0: division by zero. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Left-align both significands (hidden bit restored) by 15 bits;
       the final shift128ExtraRightJamming by 15 undoes this. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure aSig < bSig so the quotient fits in the estimate. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then step the quotient down
       while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low 64 quotient bits; the exact correction (and sticky bit) is
       only needed when the estimate is near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7271
7272 /*----------------------------------------------------------------------------
7273 | Returns the remainder of the quadruple-precision floating-point value `a'
7274 | with respect to the corresponding value `b'. The operation is performed
7275 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7276 *----------------------------------------------------------------------------*/
7277
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;   /* rem(Inf, x) is invalid */
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;   /* rem(x, Inf) = x */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(x, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* aExp <= bExp - 2: |a| is well below |b|, remainder is a itself. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Peel off 61 quotient bits per iteration until exponents align. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        /* Back q off by 4 so it cannot over-estimate the quotient. */
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial-width quotient step, scaled to the residual
       exponent difference. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the running remainder goes negative, keeping the
       last non-negative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = negative + alternate remainder: its sign picks the
       nearer multiple of b; an exact tie goes to the even quotient. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7378
7379 /*----------------------------------------------------------------------------
7380 | Returns the square root of the quadruple-precision floating-point value `a'.
7381 | The operation is performed according to the IEC/IEEE Standard for Binary
7382 | Floating-Point Arithmetic.
7383 *----------------------------------------------------------------------------*/
7384
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;   /* sqrt(+Inf) = +Inf */
        goto invalid;              /* sqrt(-Inf) */
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;   /* sqrt(-0) = -0 */
        /* sqrt of a negative number is invalid. */
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);   /* restore hidden bit */
    /* 32-bit initial root estimate, widened to 64 bits below. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the radicand; the shift depends on exponent parity. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Step zSig0 down while zSig0^2 exceeds the radicand. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 result bits; the exact correction (and sticky bit) is
       only needed when the estimate is near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7446
/*
 * Compare two extended-precision values. `is_quiet' selects the quiet
 * predicate: then only signaling NaNs (or invalid encodings) raise the
 * invalid flag; a signaling compare raises it for any NaN operand.
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        /* Invalid encodings always signal invalid and are unordered. */
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN detection: exp all-ones and any fraction bit set below the
       explicit integer bit (the <<1 discards the integer bit). */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* The <<1 drops the sign bits so +0 and -0 compare equal. */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Different signs, not both zero: positive one is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison. */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: magnitude order, inverted when negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7494
7495 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7496 {
7497 return floatx80_compare_internal(a, b, 0, status);
7498 }
7499
7500 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7501 float_status *status)
7502 {
7503 return floatx80_compare_internal(a, b, 1, status);
7504 }
7505
/*
 * Compare two quad-precision values. `is_quiet' selects the quiet
 * predicate: then only signaling NaNs raise the invalid flag; a
 * signaling compare raises it for any NaN operand.
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN operands (exp all-ones, nonzero fraction) are unordered. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* The <<1 drops the sign bits so +0 and -0 compare equal. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Different signs, not both zero: positive one is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: magnitude order, inverted when negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7540
7541 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7542 {
7543 return float128_compare_internal(a, b, 0, status);
7544 }
7545
7546 FloatRelation float128_compare_quiet(float128 a, float128 b,
7547 float_status *status)
7548 {
7549 return float128_compare_internal(a, b, 1, status);
7550 }
7551
/* Scale a by 2^n (ldexp-style) with correct rounding and flags. */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* Inf is unchanged by scaling; NaNs propagate. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;   /* +/-0 is unchanged */
        }
        aExp++;   /* subnormal: effective exponent is 1 */
    }

    /* Clamp n so aExp + n cannot overflow; the clamped values are
       already far outside the representable exponent range, so the
       result (overflow/underflow) is unaffected. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}
7590
/* Scale a by 2^n (ldexp-style) with correct rounding and flags. */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Inf is unchanged by scaling; NaNs propagate. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        aSig0 |= UINT64_C(0x0001000000000000);   /* restore hidden bit */
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;   /* +/-0 is unchanged */
    } else {
        aExp++;   /* subnormal: effective exponent is 1 */
    }

    /* Clamp n so aExp + n cannot overflow; the clamped values are
       already far outside the representable exponent range. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* The extra -1 matches normalizeRoundAndPackFloat128's exponent
       convention (cf. the --zExp before the same call in
       subFloat128Sigs). */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}
7626
/*
 * Runs automatically at load time (GCC/Clang constructor attribute) to
 * probe the host's fma(); sets force_soft_fma if it is broken.
 */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        /* Hardfloat paths disabled at build time: nothing to probe. */
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;   /* fall back to software muladd */
    }
}