/*
 * QEMU softfloat implementation (fpu/softfloat.c), as of commit
 * "softfloat: Rename FloatParts to FloatParts64".
 */
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Generate a helper that flushes a denormal input to a (sign-preserving)
 * zero and raises float_flag_input_denormal.  "__nocheck" because the
 * flush_inputs_to_zero test is left to the caller (see GEN_INPUT_FLUSH1/2/3).
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/*
 * Generate a 1-operand input flush: a no-op unless the status requests
 * flush_inputs_to_zero, in which case the operand is flushed in place.
 */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/*
 * Generate a 2-operand input flush; both operands are flushed in place
 * when flush_inputs_to_zero is set.
 */
#define GEN_INPUT_FLUSH2(name, soft_t)                              \
    static inline void name(soft_t *a, soft_t *b, float_status *s)  \
    {                                                               \
        if (likely(!s->flush_inputs_to_zero)) {                     \
            return;                                                 \
        }                                                           \
        soft_t ## _input_flush__nocheck(a, s);                      \
        soft_t ## _input_flush__nocheck(b, s);                      \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/*
 * Generate a 3-operand input flush (for fused multiply-add style callers);
 * all three operands are flushed in place when flush_inputs_to_zero is set.
 */
#define GEN_INPUT_FLUSH3(name, soft_t)                                      \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                       \
        if (likely(!s->flush_inputs_to_zero)) {                             \
            return;                                                         \
        }                                                                   \
        soft_t ## _input_flush__nocheck(a, s);                              \
        soft_t ## _input_flush__nocheck(b, s);                              \
        soft_t ## _input_flush__nocheck(c, s);                              \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
/* On x86_64, the native classification is selected only for f64 operands. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
/* All other hosts: always use the softfloat bit-test primitives. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
/* Consumed by f32_is_inf()/f64_is_inf() below. */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
217
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
  IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/* NOTE(review): noinline presumably keeps the softfloat slow path from
 * being inlined into hardfloat callers — confirm against git history. */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/*
 * Union views of one value: `s' is the softfloat type handed around by
 * the API, `h' is the host's native FP type used on the hardfloat path.
 */
typedef union {
    float32 s;   /* softfloat representation */
    float h;     /* host representation */
} union_float32;

typedef union {
    float64 s;   /* softfloat representation */
    double h;    /* host representation */
} union_float64;

/* Predicates over operand pairs (pre/post checks for float*_gen2). */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat and hardfloat implementations of a 2-input operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Compute a 2-operand float32 operation, using the host FPU (`hard')
 * when the fast path applies and falling back to softfloat (`soft')
 * otherwise.  `pre' validates the (flushed) inputs before the hard op;
 * `post' decides whether a tiny result must be redone in softfloat.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    /* Wrong rounding mode or inexact not yet set: softfloat only. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* An infinite result is reported as overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /*
         * Result at or below the smallest normal: redo in softfloat so
         * underflow/inexact flags and denormal handling come out right.
         */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/*
 * Compute a 2-operand float64 operation, using the host FPU (`hard')
 * when the fast path applies and falling back to softfloat (`soft')
 * otherwise.  Mirrors float32_gen2 above.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    /* Wrong rounding mode or inexact not yet set: softfloat only. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* An infinite result is reported as overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /*
         * Result at or below the smallest normal: redo in softfloat so
         * underflow/inexact flags and denormal handling come out right.
         */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 *
 * Packed so that FloatParts64 stays small enough to pass in registers.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* not yet run through *_canonicalize */
    float_class_zero,
    float_class_normal,         /* includes normalized denormal inputs */
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471
/* Single-bit mask for a FloatClass, enabling set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    /* frequently-tested combinations */
    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
484
485
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 {
489 return unlikely(c >= float_class_qnan);
490 }
491
492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 {
494 return c == float_class_snan;
495 }
496
497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 {
499 return c == float_class_qnan;
500 }
501
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;   /* fraction, left-aligned at DECOMPOSED_BINARY_POINT */
    int32_t exp;     /* unbiased exponent */
    FloatClass cls;  /* zero/normal/inf/NaN classification */
    bool sign;       /* true when negative */
} FloatParts64;

/* Position and value of the implicit integer bit in the 64-bit fraction. */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
522
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
548
/* Expand fields based on the size of exponent and fraction.
 * Intended for use inside a FloatFmt designated initializer; any field
 * (e.g. arm_althp) may still be overridden after it. */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
560
/* Per-format parameter tables (exponent bits, fraction bits). */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)         /* IEEE binary16 */
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true           /* ARM Alternative Half Precision */
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)          /* bfloat16: float32 exponent range */
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)         /* IEEE binary32 */
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)        /* IEEE binary64 */
};
581
582 /* Unpack a float to parts, but do not canonicalize. */
583 static inline FloatParts64 unpack_raw(FloatFmt fmt, uint64_t raw)
584 {
585 const int sign_pos = fmt.frac_size + fmt.exp_size;
586
587 return (FloatParts64) {
588 .cls = float_class_unclassified,
589 .sign = extract64(raw, sign_pos, 1),
590 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
591 .frac = extract64(raw, 0, fmt.frac_size),
592 };
593 }
594
/* Per-format wrappers around unpack_raw(). */
static inline FloatParts64 float16_unpack_raw(float16 f)
{
    return unpack_raw(float16_params, f);
}

static inline FloatParts64 bfloat16_unpack_raw(bfloat16 f)
{
    return unpack_raw(bfloat16_params, f);
}

static inline FloatParts64 float32_unpack_raw(float32 f)
{
    return unpack_raw(float32_params, f);
}

static inline FloatParts64 float64_unpack_raw(float64 f)
{
    return unpack_raw(float64_params, f);
}
614
615 /* Pack a float from parts, but do not canonicalize. */
616 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts64 p)
617 {
618 const int sign_pos = fmt.frac_size + fmt.exp_size;
619 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
620 return deposit64(ret, sign_pos, 1, p.sign);
621 }
622
/* Per-format wrappers around pack_raw(). */
static inline float16 float16_pack_raw(FloatParts64 p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline bfloat16 bfloat16_pack_raw(FloatParts64 p)
{
    return pack_raw(bfloat16_params, p);
}

static inline float32 float32_pack_raw(FloatParts64 p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts64 p)
{
    return make_float64(pack_raw(float64_params, p));
}
642
643 /*----------------------------------------------------------------------------
644 | Functions and definitions to determine: (1) whether tininess for underflow
645 | is detected before or after rounding by default, (2) what (if anything)
646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
648 | are propagated from function inputs to output. These details are target-
649 | specific.
650 *----------------------------------------------------------------------------*/
651 #include "softfloat-specialize.c.inc"
652
/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum exponent encodes Inf (zero frac) or NaN; ARM AltHP
         * reuses this exponent for normal numbers, hence the exclusion. */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            /* Left-align so the sNaN test sees the payload in its
             * decomposed position. */
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            /* Denormal input squashed to zero, with the input-denormal
             * flag raised. */
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Denormal: normalize by shifting out leading zeros and
             * compensating in the (unbiased) exponent. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias the exponent and make the implicit
         * integer bit explicit at the decomposed binary point. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}
685
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /* Choose the rounding increment, and whether an overflow should
         * saturate to the largest finite value instead of infinity. */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Normal result: round, then drop the extra fraction bits. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Rounding carried out of the top bit; renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                } 
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite value. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Result is denormal (or rounds up to the smallest normal). */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                /* Tininess after rounding: tiny unless rounding the full
                 * 64-bit fraction would carry out of the top. */
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal (exp 1). */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN payload back to its encoded position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
839
/* Per-format unpack-and-canonicalize / round-and-pack helpers.
 * The float16a_* variants take an explicit FloatFmt so callers can
 * select ARM Alternative Half Precision (float16_params_ahp). */

/* Explicit FloatFmt version */
static FloatParts64 float16a_unpack_canonical(float16 f, float_status *s,
                                              const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

static FloatParts64 float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

static FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s)
{
    return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
}

static float16 float16a_round_pack_canonical(FloatParts64 p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

static float16 float16_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
}

static FloatParts64 float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

static float32 float32_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

static FloatParts64 float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

static float64 float64_round_pack_canonical(FloatParts64 p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}
892
893 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
894 {
895 g_assert(is_nan(a.cls));
896 if (is_snan(a.cls)) {
897 float_raise(float_flag_invalid, s);
898 if (!s->default_nan_mode) {
899 return parts_silence_nan(a, s);
900 }
901 } else if (!s->default_nan_mode) {
902 return a;
903 }
904 return parts_default_nan(s);
905 }
906
/*
 * Select which of two NaN operands to propagate from a 2-input op.
 * Raises invalid if either operand is a signalling NaN; the chosen NaN
 * is quietened before being returned.
 */
static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        float_raise(float_flag_invalid, s);
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        /* Target-specific choice; the boolean argument says whether `a'
         * has the larger significand (sign breaks exact ties). */
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign), s)) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}
927
/*
 * Select which of three NaN operands to propagate from a fused
 * multiply-add.  inf_zero marks the (Inf * 0) case — presumably set by
 * the muladd caller; confirm there.  pickNaNMulAdd returns 0/1/2 to
 * choose a/b/c, or 3 for the default NaN.
 */
static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                    bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        float_raise(float_flag_invalid, s);
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    /* Quieten the chosen NaN before returning it. */
    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}
966
967 /*
968 * Returns the result of adding or subtracting the values of the
969 * floating-point values `a' and `b'. The operation is performed
970 * according to the IEC/IEEE Standard for Binary Floating-Point
971 * Arithmetic.
972 */
973
static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    /* Fold the subtract flag into b's sign: a - b == a + (-b). */
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Align the smaller-exponent operand, jamming shifted-out
             * bits into the sticky lsb, and subtract the smaller
             * magnitude from the larger so the fraction stays
             * non-negative.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: +0, or -0 when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the msb is the implicit bit. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* The (effectively negated) b operand dominates. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align exponents before summing the fractions. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the fraction: shift back and bump exp. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
1056
1057 /*
1058 * Returns the result of adding or subtracting the floating-point
1059 * values `a' and `b'. The operation is performed according to the
1060 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1061 */
1062
1063 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1064 {
1065 FloatParts64 pa = float16_unpack_canonical(a, status);
1066 FloatParts64 pb = float16_unpack_canonical(b, status);
1067 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1068
1069 return float16_round_pack_canonical(pr, status);
1070 }
1071
1072 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1073 {
1074 FloatParts64 pa = float16_unpack_canonical(a, status);
1075 FloatParts64 pb = float16_unpack_canonical(b, status);
1076 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1077
1078 return float16_round_pack_canonical(pr, status);
1079 }
1080
1081 static float32 QEMU_SOFTFLOAT_ATTR
1082 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1083 {
1084 FloatParts64 pa = float32_unpack_canonical(a, status);
1085 FloatParts64 pb = float32_unpack_canonical(b, status);
1086 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1087
1088 return float32_round_pack_canonical(pr, status);
1089 }
1090
1091 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1092 {
1093 return soft_f32_addsub(a, b, false, status);
1094 }
1095
1096 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1097 {
1098 return soft_f32_addsub(a, b, true, status);
1099 }
1100
1101 static float64 QEMU_SOFTFLOAT_ATTR
1102 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1103 {
1104 FloatParts64 pa = float64_unpack_canonical(a, status);
1105 FloatParts64 pb = float64_unpack_canonical(b, status);
1106 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1107
1108 return float64_round_pack_canonical(pr, status);
1109 }
1110
1111 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1112 {
1113 return soft_f64_addsub(a, b, false, status);
1114 }
1115
1116 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1117 {
1118 return soft_f64_addsub(a, b, true, status);
1119 }
1120
/* Host-FPU single-precision addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}
1125
/* Host-FPU single-precision subtraction for the hardfloat fast path. */
static float hard_f32_sub(float a, float b)
{
    return a - b;
}
1130
/* Host-FPU double-precision addition for the hardfloat fast path. */
static double hard_f64_add(double a, double b)
{
    return a + b;
}
1135
/* Host-FPU double-precision subtraction for the hardfloat fast path. */
static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1140
1141 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1142 {
1143 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1144 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1145 }
1146 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1147 }
1148
1149 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1150 {
1151 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1152 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1153 } else {
1154 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1155 }
1156 }
1157
1158 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1159 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1160 {
1161 return float32_gen2(a, b, s, hard, soft,
1162 f32_is_zon2, f32_addsubmul_post);
1163 }
1164
1165 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1166 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1167 {
1168 return float64_gen2(a, b, s, hard, soft,
1169 f64_is_zon2, f64_addsubmul_post);
1170 }
1171
1172 float32 QEMU_FLATTEN
1173 float32_add(float32 a, float32 b, float_status *s)
1174 {
1175 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1176 }
1177
1178 float32 QEMU_FLATTEN
1179 float32_sub(float32 a, float32 b, float_status *s)
1180 {
1181 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1182 }
1183
1184 float64 QEMU_FLATTEN
1185 float64_add(float64 a, float64 b, float_status *s)
1186 {
1187 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1188 }
1189
1190 float64 QEMU_FLATTEN
1191 float64_sub(float64 a, float64 b, float_status *s)
1192 {
1193 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1194 }
1195
1196 /*
1197 * Returns the result of adding or subtracting the bfloat16
1198 * values `a' and `b'.
1199 */
1200 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1201 {
1202 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1203 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1204 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1205
1206 return bfloat16_round_pack_canonical(pr, status);
1207 }
1208
1209 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1210 {
1211 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1212 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1213 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1214
1215 return bfloat16_round_pack_canonical(pr, status);
1216 }
1217
1218 /*
1219 * Returns the result of multiplying the floating-point values `a' and
1220 * `b'. The operation is performed according to the IEC/IEEE Standard
1221 * for Binary Floating-Point Arithmetic.
1222 */
1223
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    /* The product's sign is fixed regardless of operand classes. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /*
         * Full 64x64->128 multiply; the product of two normalized
         * fractions needs at most a one-bit renormalization, and the
         * discarded low word is folded into the sticky lsb.
         */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        return parts_default_nan(s);
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1267
1268 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1269 {
1270 FloatParts64 pa = float16_unpack_canonical(a, status);
1271 FloatParts64 pb = float16_unpack_canonical(b, status);
1272 FloatParts64 pr = mul_floats(pa, pb, status);
1273
1274 return float16_round_pack_canonical(pr, status);
1275 }
1276
1277 static float32 QEMU_SOFTFLOAT_ATTR
1278 soft_f32_mul(float32 a, float32 b, float_status *status)
1279 {
1280 FloatParts64 pa = float32_unpack_canonical(a, status);
1281 FloatParts64 pb = float32_unpack_canonical(b, status);
1282 FloatParts64 pr = mul_floats(pa, pb, status);
1283
1284 return float32_round_pack_canonical(pr, status);
1285 }
1286
1287 static float64 QEMU_SOFTFLOAT_ATTR
1288 soft_f64_mul(float64 a, float64 b, float_status *status)
1289 {
1290 FloatParts64 pa = float64_unpack_canonical(a, status);
1291 FloatParts64 pb = float64_unpack_canonical(b, status);
1292 FloatParts64 pr = mul_floats(pa, pb, status);
1293
1294 return float64_round_pack_canonical(pr, status);
1295 }
1296
/* Host-FPU single-precision multiplication for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}
1301
/* Host-FPU double-precision multiplication for the hardfloat fast path. */
static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1306
1307 float32 QEMU_FLATTEN
1308 float32_mul(float32 a, float32 b, float_status *s)
1309 {
1310 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1311 f32_is_zon2, f32_addsubmul_post);
1312 }
1313
1314 float64 QEMU_FLATTEN
1315 float64_mul(float64 a, float64 b, float_status *s)
1316 {
1317 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1318 f64_is_zon2, f64_addsubmul_post);
1319 }
1320
1321 /*
1322 * Returns the result of multiplying the bfloat16
1323 * values `a' and `b'.
1324 */
1325
1326 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1327 {
1328 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1329 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1330 FloatParts64 pr = mul_floats(pa, pb, status);
1331
1332 return bfloat16_round_pack_canonical(pr, status);
1333 }
1334
1335 /*
1336 * Returns the result of multiplying the floating-point values `a' and
1337 * `b' then adding 'c', with no intermediate rounding step after the
1338 * multiplication. The operation is performed according to the
1339 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1340 * The flags argument allows the caller to select negation of the
1341 * addend, the intermediate product, or the final result. (The
1342 * difference between this and having the caller do a separate
1343 * negation is that negating externally will flip the sign bit on
1344 * NaNs.)
1345 */
1346
/*
 * Fused multiply-add on decomposed values: (a * b) + c with a single
 * rounding at the end.  'flags' may negate the addend, the product,
 * or the final result, and may request halving of the result.
 */
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with no NaN input is unconditionally invalid. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf is invalid. */
            float_raise(float_flag_invalid, s);
            return parts_default_nan(s);
        } else {
            /* Infinite addend dominates the finite product. */
            c.sign ^= sign_flip;
            return c;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 with opposite signs rounds per the current mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Keep the full 128-bit product to avoid a double rounding. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend is the larger; align the product to it. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: +0, or -0 when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the discarded low word into the sticky lsb. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1519
1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521 int flags, float_status *status)
1522 {
1523 FloatParts64 pa = float16_unpack_canonical(a, status);
1524 FloatParts64 pb = float16_unpack_canonical(b, status);
1525 FloatParts64 pc = float16_unpack_canonical(c, status);
1526 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1527
1528 return float16_round_pack_canonical(pr, status);
1529 }
1530
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533 float_status *status)
1534 {
1535 FloatParts64 pa = float32_unpack_canonical(a, status);
1536 FloatParts64 pb = float32_unpack_canonical(b, status);
1537 FloatParts64 pc = float32_unpack_canonical(c, status);
1538 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1539
1540 return float32_round_pack_canonical(pr, status);
1541 }
1542
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545 float_status *status)
1546 {
1547 FloatParts64 pa = float64_unpack_canonical(a, status);
1548 FloatParts64 pb = float64_unpack_canonical(b, status);
1549 FloatParts64 pc = float64_unpack_canonical(c, status);
1550 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1551
1552 return float64_round_pack_canonical(pr, status);
1553 }
1554
1555 static bool force_soft_fma;
1556
/*
 * float32 fused multiply-add: try the host FPU's fmaf() first, and
 * fall back to softfloat whenever the fast path cannot reproduce the
 * required flag/rounding behavior.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    /* Hardfloat requires all inputs to be zero or normal. */
    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* The product is a (possibly negated) signed zero. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Possible underflow: redo with the original inputs in
             * softfloat so the flags come out right. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1627
/*
 * float64 fused multiply-add: try the host FPU's fma() first, and
 * fall back to softfloat whenever the fast path cannot reproduce the
 * required flag/rounding behavior.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    /* Hardfloat requires all inputs to be zero or normal. */
    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* The product is a (possibly negated) signed zero. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* Possible underflow: redo in softfloat.  Note that
             * FLT_MIN > DBL_MIN, so this threshold is (deliberately?)
             * conservative for doubles — anything that could underflow
             * is covered; NOTE(review): confirm FLT_MIN vs DBL_MIN was
             * intended. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1698
1699 /*
1700 * Returns the result of multiplying the bfloat16 values `a'
1701 * and `b' then adding 'c', with no intermediate rounding step after the
1702 * multiplication.
1703 */
1704
1705 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1706 int flags, float_status *status)
1707 {
1708 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1709 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1710 FloatParts64 pc = bfloat16_unpack_canonical(c, status);
1711 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1712
1713 return bfloat16_round_pack_canonical(pr, status);
1714 }
1715
1716 /*
1717 * Returns the result of dividing the floating-point value `a' by the
1718 * corresponding value `b'. The operation is performed according to
1719 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1720 */
1721
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    /* The quotient's sign is fixed regardless of operand classes. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}
1786
1787 float16 float16_div(float16 a, float16 b, float_status *status)
1788 {
1789 FloatParts64 pa = float16_unpack_canonical(a, status);
1790 FloatParts64 pb = float16_unpack_canonical(b, status);
1791 FloatParts64 pr = div_floats(pa, pb, status);
1792
1793 return float16_round_pack_canonical(pr, status);
1794 }
1795
1796 static float32 QEMU_SOFTFLOAT_ATTR
1797 soft_f32_div(float32 a, float32 b, float_status *status)
1798 {
1799 FloatParts64 pa = float32_unpack_canonical(a, status);
1800 FloatParts64 pb = float32_unpack_canonical(b, status);
1801 FloatParts64 pr = div_floats(pa, pb, status);
1802
1803 return float32_round_pack_canonical(pr, status);
1804 }
1805
1806 static float64 QEMU_SOFTFLOAT_ATTR
1807 soft_f64_div(float64 a, float64 b, float_status *status)
1808 {
1809 FloatParts64 pa = float64_unpack_canonical(a, status);
1810 FloatParts64 pb = float64_unpack_canonical(b, status);
1811 FloatParts64 pr = div_floats(pa, pb, status);
1812
1813 return float64_round_pack_canonical(pr, status);
1814 }
1815
/* Host-FPU single-precision division for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}
1820
/* Host-FPU double-precision division for the hardfloat fast path. */
static double hard_f64_div(double a, double b)
{
    return a / b;
}
1825
1826 static bool f32_div_pre(union_float32 a, union_float32 b)
1827 {
1828 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1829 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1830 fpclassify(b.h) == FP_NORMAL;
1831 }
1832 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1833 }
1834
1835 static bool f64_div_pre(union_float64 a, union_float64 b)
1836 {
1837 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1838 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1839 fpclassify(b.h) == FP_NORMAL;
1840 }
1841 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1842 }
1843
1844 static bool f32_div_post(union_float32 a, union_float32 b)
1845 {
1846 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1847 return fpclassify(a.h) != FP_ZERO;
1848 }
1849 return !float32_is_zero(a.s);
1850 }
1851
1852 static bool f64_div_post(union_float64 a, union_float64 b)
1853 {
1854 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1855 return fpclassify(a.h) != FP_ZERO;
1856 }
1857 return !float64_is_zero(a.s);
1858 }
1859
1860 float32 QEMU_FLATTEN
1861 float32_div(float32 a, float32 b, float_status *s)
1862 {
1863 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1864 f32_div_pre, f32_div_post);
1865 }
1866
1867 float64 QEMU_FLATTEN
1868 float64_div(float64 a, float64 b, float_status *s)
1869 {
1870 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1871 f64_div_pre, f64_div_post);
1872 }
1873
1874 /*
1875 * Returns the result of dividing the bfloat16
1876 * value `a' by the corresponding value `b'.
1877 */
1878
1879 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1880 {
1881 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1882 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1883 FloatParts64 pr = div_floats(pa, pb, status);
1884
1885 return bfloat16_round_pack_canonical(pr, status);
1886 }
1887
1888 /*
1889 * Float to Float conversions
1890 *
1891 * Returns the result of converting one float format to another. The
1892 * conversion is performed according to the IEC/IEEE Standard for
1893 * Binary Floating-Point Arithmetic.
1894 *
1895 * The float_to_float helper only needs to take care of raising
1896 * invalid exceptions and handling the conversion on NaNs.
1897 */
1898
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    /* ARM alternative half-precision has neither NaNs nor infinities. */
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        /* Ordinary formats: propagate/quieten the NaN as usual. */
        return return_nan(a, s);
    }
    return a;
}
1933
1934 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1935 {
1936 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1937 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1938 FloatParts64 pr = float_to_float(p, &float32_params, s);
1939 return float32_round_pack_canonical(pr, s);
1940 }
1941
1942 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1943 {
1944 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1945 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1946 FloatParts64 pr = float_to_float(p, &float64_params, s);
1947 return float64_round_pack_canonical(pr, s);
1948 }
1949
1950 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1951 {
1952 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1953 FloatParts64 p = float32_unpack_canonical(a, s);
1954 FloatParts64 pr = float_to_float(p, fmt16, s);
1955 return float16a_round_pack_canonical(pr, s, fmt16);
1956 }
1957
1958 static float64 QEMU_SOFTFLOAT_ATTR
1959 soft_float32_to_float64(float32 a, float_status *s)
1960 {
1961 FloatParts64 p = float32_unpack_canonical(a, s);
1962 FloatParts64 pr = float_to_float(p, &float64_params, s);
1963 return float64_round_pack_canonical(pr, s);
1964 }
1965
1966 float64 float32_to_float64(float32 a, float_status *s)
1967 {
1968 if (likely(float32_is_normal(a))) {
1969 /* Widening conversion can never produce inexact results. */
1970 union_float32 uf;
1971 union_float64 ud;
1972 uf.s = a;
1973 ud.h = uf.h;
1974 return ud.s;
1975 } else if (float32_is_zero(a)) {
1976 return float64_set_sign(float64_zero, float32_is_neg(a));
1977 } else {
1978 return soft_float32_to_float64(a, s);
1979 }
1980 }
1981
1982 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1983 {
1984 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1985 FloatParts64 p = float64_unpack_canonical(a, s);
1986 FloatParts64 pr = float_to_float(p, fmt16, s);
1987 return float16a_round_pack_canonical(pr, s, fmt16);
1988 }
1989
1990 float32 float64_to_float32(float64 a, float_status *s)
1991 {
1992 FloatParts64 p = float64_unpack_canonical(a, s);
1993 FloatParts64 pr = float_to_float(p, &float32_params, s);
1994 return float32_round_pack_canonical(pr, s);
1995 }
1996
1997 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
1998 {
1999 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2000 FloatParts64 pr = float_to_float(p, &float32_params, s);
2001 return float32_round_pack_canonical(pr, s);
2002 }
2003
2004 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2005 {
2006 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2007 FloatParts64 pr = float_to_float(p, &float64_params, s);
2008 return float64_round_pack_canonical(pr, s);
2009 }
2010
2011 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2012 {
2013 FloatParts64 p = float32_unpack_canonical(a, s);
2014 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2015 return bfloat16_round_pack_canonical(pr, s);
2016 }
2017
2018 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2019 {
2020 FloatParts64 p = float64_unpack_canonical(a, s);
2021 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2022 return bfloat16_round_pack_canonical(pr, s);
2023 }
2024
2025 /*
2026 * Rounds the floating-point value `a' to an integer, and returns the
2027 * result as a floating-point value. The operation is performed
2028 * according to the IEC/IEEE Standard for Binary Floating-Point
2029 * Arithmetic.
2030 */
2031
/*
 * Round the decomposed value 'a', scaled by 2**scale, to an integral
 * value under rounding mode 'rmode'.  Inexact is raised whenever any
 * fractional bits are discarded.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so the exponent arithmetic cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            /* The result is either 0 or 1, depending on the mode. */
            switch (rmode) {
            case float_round_nearest_even:
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Masks selecting the fractional bits below the integer lsb. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            /* Pick the increment that implements the rounding mode. */
            switch (rmode) {
            case float_round_nearest_even:
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Rounding carried out of the fraction. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2132
2133 float16 float16_round_to_int(float16 a, float_status *s)
2134 {
2135 FloatParts64 pa = float16_unpack_canonical(a, s);
2136 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2137 return float16_round_pack_canonical(pr, s);
2138 }
2139
2140 float32 float32_round_to_int(float32 a, float_status *s)
2141 {
2142 FloatParts64 pa = float32_unpack_canonical(a, s);
2143 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2144 return float32_round_pack_canonical(pr, s);
2145 }
2146
2147 float64 float64_round_to_int(float64 a, float_status *s)
2148 {
2149 FloatParts64 pa = float64_unpack_canonical(a, s);
2150 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2151 return float64_round_pack_canonical(pr, s);
2152 }
2153
2154 /*
2155 * Rounds the bfloat16 value `a' to an integer, and returns the
2156 * result as a bfloat16 value.
2157 */
2158
2159 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2160 {
2161 FloatParts64 pa = bfloat16_unpack_canonical(a, s);
2162 FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2163 return bfloat16_round_pack_canonical(pr, s);
2164 }
2165
2166 /*
2167 * Returns the result of converting the floating-point value `a' to
2168 * the two's complement integer format. The conversion is performed
2169 * according to the IEC/IEEE Standard for Binary Floating-Point
2170 * Arithmetic---which means in particular that the conversion is
2171 * rounded according to the current rounding mode. If `a' is a NaN,
2172 * the largest positive integer is returned. Otherwise, if the
2173 * conversion overflows, the largest integer with the same sign as `a'
2174 * is returned.
2175 */
2176
/*
 * Round @in to an integer per @rmode/@scale and clamp into [@min, @max].
 * When the result is out of range, the exception flags raised by the
 * rounding step are discarded and only 'invalid' is reported, per the
 * softfloat convention for conversion overflow.
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Saved so that out-of-range paths can drop e.g. inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest positive integer. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude too large for 64 bits: force the overflow path. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min avoids signed overflow when min == INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2220
2221 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2222 float_status *s)
2223 {
2224 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2225 rmode, scale, INT8_MIN, INT8_MAX, s);
2226 }
2227
2228 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2229 float_status *s)
2230 {
2231 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2232 rmode, scale, INT16_MIN, INT16_MAX, s);
2233 }
2234
2235 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2236 float_status *s)
2237 {
2238 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2239 rmode, scale, INT32_MIN, INT32_MAX, s);
2240 }
2241
2242 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2243 float_status *s)
2244 {
2245 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2246 rmode, scale, INT64_MIN, INT64_MAX, s);
2247 }
2248
2249 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2250 float_status *s)
2251 {
2252 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2253 rmode, scale, INT16_MIN, INT16_MAX, s);
2254 }
2255
2256 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2257 float_status *s)
2258 {
2259 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2260 rmode, scale, INT32_MIN, INT32_MAX, s);
2261 }
2262
2263 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2264 float_status *s)
2265 {
2266 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2267 rmode, scale, INT64_MIN, INT64_MAX, s);
2268 }
2269
2270 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2271 float_status *s)
2272 {
2273 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2274 rmode, scale, INT16_MIN, INT16_MAX, s);
2275 }
2276
2277 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2278 float_status *s)
2279 {
2280 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2281 rmode, scale, INT32_MIN, INT32_MAX, s);
2282 }
2283
2284 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2285 float_status *s)
2286 {
2287 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2288 rmode, scale, INT64_MIN, INT64_MAX, s);
2289 }
2290
/*
 * Signed integer conversions honouring the rounding mode currently
 * selected in @s; thin wrappers around the *_scalbn variants.
 */

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/*
 * Signed integer conversions that always truncate (round toward
 * zero), regardless of the rounding mode in @s.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2385
2386 /*
2387 * Returns the result of converting the floating-point value `a' to
2388 * the two's complement integer format.
2389 */
2390
2391 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2392 float_status *s)
2393 {
2394 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2395 rmode, scale, INT16_MIN, INT16_MAX, s);
2396 }
2397
2398 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2399 float_status *s)
2400 {
2401 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2402 rmode, scale, INT32_MIN, INT32_MAX, s);
2403 }
2404
2405 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2406 float_status *s)
2407 {
2408 return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2409 rmode, scale, INT64_MIN, INT64_MAX, s);
2410 }
2411
2412 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2413 {
2414 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2415 }
2416
2417 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2418 {
2419 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2420 }
2421
2422 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2423 {
2424 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2425 }
2426
2427 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2428 {
2429 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2430 }
2431
2432 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2433 {
2434 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2435 }
2436
2437 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2438 {
2439 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2440 }
2441
2442 /*
2443 * Returns the result of converting the floating-point value `a' to
2444 * the unsigned integer format. The conversion is performed according
2445 * to the IEC/IEEE Standard for Binary Floating-Point
2446 * Arithmetic---which means in particular that the conversion is
2447 * rounded according to the current rounding mode. If `a' is a NaN,
2448 * the largest unsigned integer is returned. Otherwise, if the
2449 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
2451 * values that do not round to zero will raise the inexact exception
2452 * flag.
2453 */
2454
/*
 * Round @in to an integer per @rmode/@scale and clamp into [0, @max].
 * Negative values that do not round to zero, NaNs and overflows all
 * discard the rounding step's flags and report only 'invalid'.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Saved so that out-of-range paths can drop e.g. inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest unsigned integer. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        /* Includes negative values that rounded to zero. */
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* A negative value that survived rounding is invalid. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2499
2500 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2501 float_status *s)
2502 {
2503 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2504 rmode, scale, UINT8_MAX, s);
2505 }
2506
2507 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2508 float_status *s)
2509 {
2510 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2511 rmode, scale, UINT16_MAX, s);
2512 }
2513
2514 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2515 float_status *s)
2516 {
2517 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2518 rmode, scale, UINT32_MAX, s);
2519 }
2520
2521 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2522 float_status *s)
2523 {
2524 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2525 rmode, scale, UINT64_MAX, s);
2526 }
2527
2528 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2529 float_status *s)
2530 {
2531 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2532 rmode, scale, UINT16_MAX, s);
2533 }
2534
2535 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2536 float_status *s)
2537 {
2538 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2539 rmode, scale, UINT32_MAX, s);
2540 }
2541
2542 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2543 float_status *s)
2544 {
2545 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2546 rmode, scale, UINT64_MAX, s);
2547 }
2548
2549 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2550 float_status *s)
2551 {
2552 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2553 rmode, scale, UINT16_MAX, s);
2554 }
2555
2556 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2557 float_status *s)
2558 {
2559 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2560 rmode, scale, UINT32_MAX, s);
2561 }
2562
2563 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2564 float_status *s)
2565 {
2566 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2567 rmode, scale, UINT64_MAX, s);
2568 }
2569
/*
 * Unsigned integer conversions honouring the rounding mode currently
 * selected in @s; thin wrappers around the *_scalbn variants.
 */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/*
 * Unsigned integer conversions that always truncate (round toward
 * zero), regardless of the rounding mode in @s.
 */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2664
2665 /*
2666 * Returns the result of converting the bfloat16 value `a' to
2667 * the unsigned integer format.
2668 */
2669
2670 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2671 int scale, float_status *s)
2672 {
2673 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2674 rmode, scale, UINT16_MAX, s);
2675 }
2676
2677 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2678 int scale, float_status *s)
2679 {
2680 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2681 rmode, scale, UINT32_MAX, s);
2682 }
2683
2684 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2685 int scale, float_status *s)
2686 {
2687 return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2688 rmode, scale, UINT64_MAX, s);
2689 }
2690
2691 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2692 {
2693 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2694 }
2695
2696 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2697 {
2698 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2699 }
2700
2701 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2702 {
2703 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2704 }
2705
2706 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2707 {
2708 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2709 }
2710
2711 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2712 {
2713 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2714 }
2715
2716 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2717 {
2718 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2719 }
2720
2721 /*
2722 * Integer to float conversions
2723 *
2724 * Returns the result of converting the two's complement integer `a'
2725 * to the floating-point format. The conversion is performed according
2726 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2727 */
2728
2729 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2730 {
2731 FloatParts64 r = { .sign = false };
2732
2733 if (a == 0) {
2734 r.cls = float_class_zero;
2735 } else {
2736 uint64_t f = a;
2737 int shift;
2738
2739 r.cls = float_class_normal;
2740 if (a < 0) {
2741 f = -f;
2742 r.sign = true;
2743 }
2744 shift = clz64(f);
2745 scale = MIN(MAX(scale, -0x10000), 0x10000);
2746
2747 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2748 r.frac = f << shift;
2749 }
2750
2751 return r;
2752 }
2753
2754 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2755 {
2756 FloatParts64 pa = int_to_float(a, scale, status);
2757 return float16_round_pack_canonical(pa, status);
2758 }
2759
2760 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2761 {
2762 return int64_to_float16_scalbn(a, scale, status);
2763 }
2764
2765 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2766 {
2767 return int64_to_float16_scalbn(a, scale, status);
2768 }
2769
2770 float16 int64_to_float16(int64_t a, float_status *status)
2771 {
2772 return int64_to_float16_scalbn(a, 0, status);
2773 }
2774
2775 float16 int32_to_float16(int32_t a, float_status *status)
2776 {
2777 return int64_to_float16_scalbn(a, 0, status);
2778 }
2779
2780 float16 int16_to_float16(int16_t a, float_status *status)
2781 {
2782 return int64_to_float16_scalbn(a, 0, status);
2783 }
2784
2785 float16 int8_to_float16(int8_t a, float_status *status)
2786 {
2787 return int64_to_float16_scalbn(a, 0, status);
2788 }
2789
2790 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2791 {
2792 FloatParts64 pa = int_to_float(a, scale, status);
2793 return float32_round_pack_canonical(pa, status);
2794 }
2795
2796 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2797 {
2798 return int64_to_float32_scalbn(a, scale, status);
2799 }
2800
2801 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2802 {
2803 return int64_to_float32_scalbn(a, scale, status);
2804 }
2805
2806 float32 int64_to_float32(int64_t a, float_status *status)
2807 {
2808 return int64_to_float32_scalbn(a, 0, status);
2809 }
2810
2811 float32 int32_to_float32(int32_t a, float_status *status)
2812 {
2813 return int64_to_float32_scalbn(a, 0, status);
2814 }
2815
2816 float32 int16_to_float32(int16_t a, float_status *status)
2817 {
2818 return int64_to_float32_scalbn(a, 0, status);
2819 }
2820
2821 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2822 {
2823 FloatParts64 pa = int_to_float(a, scale, status);
2824 return float64_round_pack_canonical(pa, status);
2825 }
2826
2827 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2828 {
2829 return int64_to_float64_scalbn(a, scale, status);
2830 }
2831
2832 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2833 {
2834 return int64_to_float64_scalbn(a, scale, status);
2835 }
2836
2837 float64 int64_to_float64(int64_t a, float_status *status)
2838 {
2839 return int64_to_float64_scalbn(a, 0, status);
2840 }
2841
2842 float64 int32_to_float64(int32_t a, float_status *status)
2843 {
2844 return int64_to_float64_scalbn(a, 0, status);
2845 }
2846
2847 float64 int16_to_float64(int16_t a, float_status *status)
2848 {
2849 return int64_to_float64_scalbn(a, 0, status);
2850 }
2851
2852 /*
2853 * Returns the result of converting the two's complement integer `a'
2854 * to the bfloat16 format.
2855 */
2856
2857 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2858 {
2859 FloatParts64 pa = int_to_float(a, scale, status);
2860 return bfloat16_round_pack_canonical(pa, status);
2861 }
2862
2863 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2864 {
2865 return int64_to_bfloat16_scalbn(a, scale, status);
2866 }
2867
2868 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2869 {
2870 return int64_to_bfloat16_scalbn(a, scale, status);
2871 }
2872
2873 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2874 {
2875 return int64_to_bfloat16_scalbn(a, 0, status);
2876 }
2877
2878 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2879 {
2880 return int64_to_bfloat16_scalbn(a, 0, status);
2881 }
2882
2883 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2884 {
2885 return int64_to_bfloat16_scalbn(a, 0, status);
2886 }
2887
2888 /*
2889 * Unsigned Integer to float conversions
2890 *
2891 * Returns the result of converting the unsigned integer `a' to the
2892 * floating-point format. The conversion is performed according to the
2893 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894 */
2895
2896 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
2897 {
2898 FloatParts64 r = { .sign = false };
2899 int shift;
2900
2901 if (a == 0) {
2902 r.cls = float_class_zero;
2903 } else {
2904 scale = MIN(MAX(scale, -0x10000), 0x10000);
2905 shift = clz64(a);
2906 r.cls = float_class_normal;
2907 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2908 r.frac = a << shift;
2909 }
2910
2911 return r;
2912 }
2913
2914 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2915 {
2916 FloatParts64 pa = uint_to_float(a, scale, status);
2917 return float16_round_pack_canonical(pa, status);
2918 }
2919
2920 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2921 {
2922 return uint64_to_float16_scalbn(a, scale, status);
2923 }
2924
2925 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2926 {
2927 return uint64_to_float16_scalbn(a, scale, status);
2928 }
2929
2930 float16 uint64_to_float16(uint64_t a, float_status *status)
2931 {
2932 return uint64_to_float16_scalbn(a, 0, status);
2933 }
2934
2935 float16 uint32_to_float16(uint32_t a, float_status *status)
2936 {
2937 return uint64_to_float16_scalbn(a, 0, status);
2938 }
2939
2940 float16 uint16_to_float16(uint16_t a, float_status *status)
2941 {
2942 return uint64_to_float16_scalbn(a, 0, status);
2943 }
2944
2945 float16 uint8_to_float16(uint8_t a, float_status *status)
2946 {
2947 return uint64_to_float16_scalbn(a, 0, status);
2948 }
2949
2950 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2951 {
2952 FloatParts64 pa = uint_to_float(a, scale, status);
2953 return float32_round_pack_canonical(pa, status);
2954 }
2955
2956 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2957 {
2958 return uint64_to_float32_scalbn(a, scale, status);
2959 }
2960
2961 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2962 {
2963 return uint64_to_float32_scalbn(a, scale, status);
2964 }
2965
2966 float32 uint64_to_float32(uint64_t a, float_status *status)
2967 {
2968 return uint64_to_float32_scalbn(a, 0, status);
2969 }
2970
2971 float32 uint32_to_float32(uint32_t a, float_status *status)
2972 {
2973 return uint64_to_float32_scalbn(a, 0, status);
2974 }
2975
2976 float32 uint16_to_float32(uint16_t a, float_status *status)
2977 {
2978 return uint64_to_float32_scalbn(a, 0, status);
2979 }
2980
2981 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2982 {
2983 FloatParts64 pa = uint_to_float(a, scale, status);
2984 return float64_round_pack_canonical(pa, status);
2985 }
2986
2987 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2988 {
2989 return uint64_to_float64_scalbn(a, scale, status);
2990 }
2991
2992 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2993 {
2994 return uint64_to_float64_scalbn(a, scale, status);
2995 }
2996
2997 float64 uint64_to_float64(uint64_t a, float_status *status)
2998 {
2999 return uint64_to_float64_scalbn(a, 0, status);
3000 }
3001
3002 float64 uint32_to_float64(uint32_t a, float_status *status)
3003 {
3004 return uint64_to_float64_scalbn(a, 0, status);
3005 }
3006
3007 float64 uint16_to_float64(uint16_t a, float_status *status)
3008 {
3009 return uint64_to_float64_scalbn(a, 0, status);
3010 }
3011
3012 /*
3013 * Returns the result of converting the unsigned integer `a' to the
3014 * bfloat16 format.
3015 */
3016
3017 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3018 {
3019 FloatParts64 pa = uint_to_float(a, scale, status);
3020 return bfloat16_round_pack_canonical(pa, status);
3021 }
3022
3023 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3024 {
3025 return uint64_to_bfloat16_scalbn(a, scale, status);
3026 }
3027
3028 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3029 {
3030 return uint64_to_bfloat16_scalbn(a, scale, status);
3031 }
3032
3033 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3034 {
3035 return uint64_to_bfloat16_scalbn(a, 0, status);
3036 }
3037
3038 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3039 {
3040 return uint64_to_bfloat16_scalbn(a, 0, status);
3041 }
3042
3043 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3044 {
3045 return uint64_to_bfloat16_scalbn(a, 0, status);
3046 }
3047
3048 /* Float Min/Max */
3049 /* min() and max() functions. These can't be implemented as
3050 * 'compare and pick one input' because that would mishandle
3051 * NaNs and +0 vs -0.
3052 *
3053 * minnum() and maxnum() functions. These are similar to the min()
3054 * and max() functions but if one of the arguments is a QNaN and
3055 * the other is numerical then the numerical argument is returned.
3056 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3063 */
/*
 * Core of the min/max family of operations.  @ismin selects min vs
 * max; @ieee selects IEEE 754-2008 minNum/maxNum quiet-NaN handling
 * (a single quiet NaN yields the numerical operand); @ismag compares
 * by magnitude (minNumMag/maxNumMag), falling back to the signed
 * comparison when the magnitudes are equal.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both operands are NaN, or non-IEEE semantics requested. */
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /*
         * Map each class onto an exponent that orders correctly
         * against normals: infinities above all, zeroes below all.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison ignores the signs entirely. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: a negative sign inverts the ordering. */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Different signs: the negative operand is the smaller.
             * This also makes min(-0, +0) == -0 and max(-0, +0) == +0.
             */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3134
/*
 * Expand one typed min/max wrapper: unpack both operands, defer to
 * minmax_floats(), then round and repack the result.  is_ieee selects
 * the IEEE 754-2008 minNum/maxNum treatment of quiet NaNs.
 */
#define MINMAX(sz, name, ismin, is_ieee, ismag)                         \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, is_ieee, ismag, s);  \
                                                                        \
    return float ## sz ## _round_pack_canonical(pr, s);                 \
}
3145
/* Instantiate the full set of min/max operations for each binary width. */
MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
3168
/* As MINMAX above, but expanding the wrappers for the bfloat16 format. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);                  \
    FloatParts64 pb = bfloat16_unpack_canonical(b, s);                  \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);    \
                                                                        \
    return bfloat16_round_pack_canonical(pr, s);                        \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
3187
3188 /* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    /*
     * Compare two canonically-unpacked values, returning less / equal /
     * greater / unordered.  A quiet compare raises invalid only for
     * signaling NaNs; a signaling compare raises it for any NaN.
     */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            /* +0 and -0 compare equal regardless of sign. */
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both finite and non-zero; differing signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: order by exponent then fraction, inverted for negatives. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3245
/* Expand a softfloat comparison helper for each binary width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3260
3261 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3262 {
3263 return soft_f16_compare(a, b, false, s);
3264 }
3265
3266 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3267 {
3268 return soft_f16_compare(a, b, true, s);
3269 }
3270
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    /*
     * Hardfloat fast path: use the host FPU via the C99 quiet comparison
     * macros, which do not raise exceptions and are all false when either
     * operand is a NaN; that case falls through to softfloat so the
     * status flags are set correctly.
     */
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3299
3300 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3301 {
3302 return f32_compare(a, b, false, s);
3303 }
3304
3305 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3306 {
3307 return f32_compare(a, b, true, s);
3308 }
3309
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    /*
     * Hardfloat fast path: use the host FPU via the C99 quiet comparison
     * macros, which do not raise exceptions and are all false when either
     * operand is a NaN; that case falls through to softfloat so the
     * status flags are set correctly.
     */
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3338
3339 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3340 {
3341 return f64_compare(a, b, false, s);
3342 }
3343
3344 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3345 {
3346 return f64_compare(a, b, true, s);
3347 }
3348
3349 static FloatRelation QEMU_FLATTEN
3350 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3351 {
3352 FloatParts64 pa = bfloat16_unpack_canonical(a, s);
3353 FloatParts64 pb = bfloat16_unpack_canonical(b, s);
3354 return compare_floats(pa, pb, is_quiet, s);
3355 }
3356
3357 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3358 {
3359 return soft_bf16_compare(a, b, false, s);
3360 }
3361
3362 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3363 {
3364 return soft_bf16_compare(a, b, true, s);
3365 }
3366
3367 /* Multiply A by 2 raised to the power N. */
3368 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3369 {
3370 if (unlikely(is_nan(a.cls))) {
3371 return return_nan(a, s);
3372 }
3373 if (a.cls == float_class_normal) {
3374 /* The largest float type (even though not supported by FloatParts64)
3375 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3376 * still allows rounding to infinity, without allowing overflow
3377 * within the int32_t that backs FloatParts64.exp.
3378 */
3379 n = MIN(MAX(n, -0x10000), 0x10000);
3380 a.exp += n;
3381 }
3382 return a;
3383 }
3384
3385 float16 float16_scalbn(float16 a, int n, float_status *status)
3386 {
3387 FloatParts64 pa = float16_unpack_canonical(a, status);
3388 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3389 return float16_round_pack_canonical(pr, status);
3390 }
3391
3392 float32 float32_scalbn(float32 a, int n, float_status *status)
3393 {
3394 FloatParts64 pa = float32_unpack_canonical(a, status);
3395 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3396 return float32_round_pack_canonical(pr, status);
3397 }
3398
3399 float64 float64_scalbn(float64 a, int n, float_status *status)
3400 {
3401 FloatParts64 pa = float64_unpack_canonical(a, status);
3402 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3403 return float64_round_pack_canonical(pr, status);
3404 }
3405
3406 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3407 {
3408 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3409 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3410 return bfloat16_round_pack_canonical(pr, status);
3411 }
3412
3413 /*
3414 * Square Root
3415 *
3416 * The old softfloat code did an approximation step before zeroing in
3417 * on the final result. However for simpleness we just compute the
3418 * square root by iterating down from the implicit bit to enough extra
3419 * bits to ensure we get a correctly rounded result.
3420 *
3421 * This does mean however the calculation is slower than before,
3422 * especially for 64 bit floats.
3423 */
3424
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* Square root of a negative number: invalid, default NaN. */
        float_raise(float_flag_invalid, s);
        return parts_default_nan(s);
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        /* Accept this result bit if the trial value still fits. */
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3482
3483 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3484 {
3485 FloatParts64 pa = float16_unpack_canonical(a, status);
3486 FloatParts64 pr = sqrt_float(pa, status, &float16_params);
3487 return float16_round_pack_canonical(pr, status);
3488 }
3489
3490 static float32 QEMU_SOFTFLOAT_ATTR
3491 soft_f32_sqrt(float32 a, float_status *status)
3492 {
3493 FloatParts64 pa = float32_unpack_canonical(a, status);
3494 FloatParts64 pr = sqrt_float(pa, status, &float32_params);
3495 return float32_round_pack_canonical(pr, status);
3496 }
3497
3498 static float64 QEMU_SOFTFLOAT_ATTR
3499 soft_f64_sqrt(float64 a, float_status *status)
3500 {
3501 FloatParts64 pa = float64_unpack_canonical(a, status);
3502 FloatParts64 pr = sqrt_float(pa, status, &float64_params);
3503 return float64_round_pack_canonical(pr, status);
3504 }
3505
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    /*
     * Hardfloat fast path: only positive zero or normal inputs use the
     * host sqrtf(); negative values (including -0), denormals, infinities
     * and NaNs fall back to softfloat so flags are set correctly.
     */
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3532
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    /*
     * Hardfloat fast path: only positive zero or normal inputs use the
     * host sqrt(); negative values (including -0), denormals, infinities
     * and NaNs fall back to softfloat so flags are set correctly.
     */
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3559
3560 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3561 {
3562 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3563 FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
3564 return bfloat16_round_pack_canonical(pr, status);
3565 }
3566
3567 /*----------------------------------------------------------------------------
3568 | The pattern for a default generated NaN.
3569 *----------------------------------------------------------------------------*/
3570
3571 float16 float16_default_nan(float_status *status)
3572 {
3573 FloatParts64 p = parts_default_nan(status);
3574 p.frac >>= float16_params.frac_shift;
3575 return float16_pack_raw(p);
3576 }
3577
3578 float32 float32_default_nan(float_status *status)
3579 {
3580 FloatParts64 p = parts_default_nan(status);
3581 p.frac >>= float32_params.frac_shift;
3582 return float32_pack_raw(p);
3583 }
3584
3585 float64 float64_default_nan(float_status *status)
3586 {
3587 FloatParts64 p = parts_default_nan(status);
3588 p.frac >>= float64_params.frac_shift;
3589 return float64_pack_raw(p);
3590 }
3591
float128 float128_default_nan(float_status *status)
{
    FloatParts64 p = parts_default_nan(status);
    float128 r;

    /* Extrapolate from the choices made by parts_default_nan to fill
     * in the quad-floating format. If the low bit is set, assume we
     * want to set all non-snan bits.
     */
    r.low = -(p.frac & 1);                  /* all-ones or all-zeros word */
    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
    r.high |= UINT64_C(0x7FFF000000000000); /* maximum biased exponent */
    r.high |= (uint64_t)p.sign << 63;

    return r;
}
3608
3609 bfloat16 bfloat16_default_nan(float_status *status)
3610 {
3611 FloatParts64 p = parts_default_nan(status);
3612 p.frac >>= bfloat16_params.frac_shift;
3613 return bfloat16_pack_raw(p);
3614 }
3615
3616 /*----------------------------------------------------------------------------
3617 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3618 *----------------------------------------------------------------------------*/
3619
3620 float16 float16_silence_nan(float16 a, float_status *status)
3621 {
3622 FloatParts64 p = float16_unpack_raw(a);
3623 p.frac <<= float16_params.frac_shift;
3624 p = parts_silence_nan(p, status);
3625 p.frac >>= float16_params.frac_shift;
3626 return float16_pack_raw(p);
3627 }
3628
3629 float32 float32_silence_nan(float32 a, float_status *status)
3630 {
3631 FloatParts64 p = float32_unpack_raw(a);
3632 p.frac <<= float32_params.frac_shift;
3633 p = parts_silence_nan(p, status);
3634 p.frac >>= float32_params.frac_shift;
3635 return float32_pack_raw(p);
3636 }
3637
3638 float64 float64_silence_nan(float64 a, float_status *status)
3639 {
3640 FloatParts64 p = float64_unpack_raw(a);
3641 p.frac <<= float64_params.frac_shift;
3642 p = parts_silence_nan(p, status);
3643 p.frac >>= float64_params.frac_shift;
3644 return float64_pack_raw(p);
3645 }
3646
3647 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3648 {
3649 FloatParts64 p = bfloat16_unpack_raw(a);
3650 p.frac <<= bfloat16_params.frac_shift;
3651 p = parts_silence_nan(p, status);
3652 p.frac >>= bfloat16_params.frac_shift;
3653 return bfloat16_pack_raw(p);
3654 }
3655
3656 /*----------------------------------------------------------------------------
3657 | If `a' is denormal and we are in flush-to-zero mode then set the
3658 | input-denormal exception and return zero. Otherwise just return the value.
3659 *----------------------------------------------------------------------------*/
3660
3661 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3662 {
3663 if (p.exp == 0 && p.frac != 0) {
3664 float_raise(float_flag_input_denormal, status);
3665 return true;
3666 }
3667
3668 return false;
3669 }
3670
3671 float16 float16_squash_input_denormal(float16 a, float_status *status)
3672 {
3673 if (status->flush_inputs_to_zero) {
3674 FloatParts64 p = float16_unpack_raw(a);
3675 if (parts_squash_denormal(p, status)) {
3676 return float16_set_sign(float16_zero, p.sign);
3677 }
3678 }
3679 return a;
3680 }
3681
3682 float32 float32_squash_input_denormal(float32 a, float_status *status)
3683 {
3684 if (status->flush_inputs_to_zero) {
3685 FloatParts64 p = float32_unpack_raw(a);
3686 if (parts_squash_denormal(p, status)) {
3687 return float32_set_sign(float32_zero, p.sign);
3688 }
3689 }
3690 return a;
3691 }
3692
3693 float64 float64_squash_input_denormal(float64 a, float_status *status)
3694 {
3695 if (status->flush_inputs_to_zero) {
3696 FloatParts64 p = float64_unpack_raw(a);
3697 if (parts_squash_denormal(p, status)) {
3698 return float64_set_sign(float64_zero, p.sign);
3699 }
3700 }
3701 return a;
3702 }
3703
3704 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3705 {
3706 if (status->flush_inputs_to_zero) {
3707 FloatParts64 p = bfloat16_unpack_raw(a);
3708 if (parts_squash_denormal(p, status)) {
3709 return bfloat16_set_sign(bfloat16_zero, p.sign);
3710 }
3711 }
3712 return a;
3713 }
3714
3715 /*----------------------------------------------------------------------------
3716 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3717 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3718 | input. If `zSign' is 1, the input is negated before being converted to an
3719 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3720 | is simply rounded to an integer, with the inexact exception raised if the
3721 | input cannot be represented exactly as an integer. However, if the fixed-
3722 | point input is too large, the invalid exception is raised and the largest
3723 | positive or negative integer is returned.
3724 *----------------------------------------------------------------------------*/
3725
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select the value to add to the 7 bits below the binary point. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if the result lsb would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Ties-to-even: clear the lsb when the discarded bits were exactly half. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range (or sign corrupted by overflow): saturate. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3773
3774 /*----------------------------------------------------------------------------
3775 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3776 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3777 | and returns the properly rounded 64-bit integer corresponding to the input.
3778 | If `zSign' is 1, the input is negated before being converted to an integer.
3779 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3780 | the inexact exception raised if the input cannot be represented exactly as
3781 | an integer. However, if the fixed-point input is too large, the invalid
3782 | exception is raised and the largest positive or negative integer is
3783 | returned.
3784 *----------------------------------------------------------------------------*/
3785
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the fraction below the binary point; decide whether
     * rounding bumps the integer part absZ0 by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);   /* fraction >= 1/2 */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;     /* carried out of 64 bits */
        /* Ties-to-even: clear the lsb when the fraction was exactly 1/2. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign mismatch after negation means the value is out of range. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3835
3836 /*----------------------------------------------------------------------------
3837 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3838 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3839 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3840 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3841 | with the inexact exception raised if the input cannot be represented exactly
3842 | as an integer. However, if the fixed-point input is too large, the invalid
3843 | exception is raised and the largest unsigned integer is returned.
3844 *----------------------------------------------------------------------------*/
3845
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the fraction below the binary point; decide whether
     * rounding bumps the integer part absZ0 by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);    /* fraction >= 1/2 */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Carry out of 64 bits: overflow, saturate to UINT64_MAX. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Ties-to-even: clear the lsb when the fraction was exactly 1/2. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative input cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
3895
3896 /*----------------------------------------------------------------------------
3897 | Normalizes the subnormal single-precision floating-point value represented
3898 | by the denormalized significand `aSig'. The normalized exponent and
3899 | significand are stored at the locations pointed to by `zExpPtr' and
3900 | `zSigPtr', respectively.
3901 *----------------------------------------------------------------------------*/
3902
3903 static void
3904 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3905 {
3906 int8_t shiftCount;
3907
3908 shiftCount = clz32(aSig) - 8;
3909 *zSigPtr = aSig<<shiftCount;
3910 *zExpPtr = 1 - shiftCount;
3911
3912 }
3913
3914 /*----------------------------------------------------------------------------
3915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3916 | and significand `zSig', and returns the proper single-precision floating-
3917 | point value corresponding to the abstract input. Ordinarily, the abstract
3918 | value is simply rounded and packed into the single-precision format, with
3919 | the inexact exception raised if the abstract input cannot be represented
3920 | exactly. However, if the abstract value is too large, the overflow and
3921 | inexact exceptions are raised and an infinity or maximal finite value is
3922 | returned. If the abstract value is too small, the input value is rounded to
3923 | a subnormal number, and the underflow and inexact exceptions are raised if
3924 | the abstract input cannot be represented exactly as a subnormal single-
3925 | precision floating-point number.
3926 | The input significand `zSig' has its binary point between bits 30
3927 | and 29, which is 7 bits to the left of the usual location. This shifted
3928 | significand must be normalized or smaller. If `zSig' is not normalized,
3929 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3930 | and it must not require rounding. In the usual case that `zSig' is
3931 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3932 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3933 | Binary Floating-Point Arithmetic.
3934 *----------------------------------------------------------------------------*/
3935
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 7 guard bits sit below the result lsb; choose what to add to them. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if the result lsb would be even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Exponent outside the normal range: overflow or underflow. */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd and truncating modes produce the maximal
             * finite value rather than infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Ties-to-even: clear the lsb when the guard bits were exactly half. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4012
4013 /*----------------------------------------------------------------------------
4014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4015 | and significand `zSig', and returns the proper single-precision floating-
4016 | point value corresponding to the abstract input. This routine is just like
4017 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4018 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4019 | floating-point exponent.
4020 *----------------------------------------------------------------------------*/
4021
4022 static float32
4023 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4024 float_status *status)
4025 {
4026 int8_t shiftCount;
4027
4028 shiftCount = clz32(zSig) - 1;
4029 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4030 status);
4031
4032 }
4033
4034 /*----------------------------------------------------------------------------
4035 | Normalizes the subnormal double-precision floating-point value represented
4036 | by the denormalized significand `aSig'. The normalized exponent and
4037 | significand are stored at the locations pointed to by `zExpPtr' and
4038 | `zSigPtr', respectively.
4039 *----------------------------------------------------------------------------*/
4040
4041 static void
4042 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4043 {
4044 int8_t shiftCount;
4045
4046 shiftCount = clz64(aSig) - 11;
4047 *zSigPtr = aSig<<shiftCount;
4048 *zExpPtr = 1 - shiftCount;
4049
4050 }
4051
4052 /*----------------------------------------------------------------------------
4053 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4054 | double-precision floating-point value, returning the result. After being
4055 | shifted into the proper positions, the three fields are simply added
4056 | together to form the result. This means that any integer portion of `zSig'
4057 | will be added into the exponent. Since a properly normalized significand
4058 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4059 | than the desired result exponent whenever `zSig' is a complete, normalized
4060 | significand.
4061 *----------------------------------------------------------------------------*/
4062
4063 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4064 {
4065
4066 return make_float64(
4067 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4068
4069 }
4070
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The low 10 bits of zSig are the round/sticky bits that will be
     * shifted away.  Choose the value to add at bit 0 so that the carry
     * into bit 10 implements the requested rounding direction. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;  /* half of one ulp (0x400) */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only if the result's low bit would otherwise be even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Single unsigned compare catches both zExp >= 0x7FD (near overflow)
     * and zExp < 0 (subnormal range). */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: to-odd and the truncating modes return the
             * maximal finite value; otherwise return infinity.
             * -(!overflow_to_inf) is all-ones (max significand/exponent
             * bits below 0x7FF<<52) when clamping, 0 when producing inf. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected either before rounding, or after when the
             * rounded value still fails to reach the minimum normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie in nearest-even: clear the low bit to make the result even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;  /* rounded all the way down to zero */
    return packFloat64( zSign, zExp, zSig );

}
4168
4169 /*----------------------------------------------------------------------------
4170 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4171 | and significand `zSig', and returns the proper double-precision floating-
4172 | point value corresponding to the abstract input. This routine is just like
4173 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4174 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4175 | floating-point exponent.
4176 *----------------------------------------------------------------------------*/
4177
4178 static float64
4179 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4180 float_status *status)
4181 {
4182 int8_t shiftCount;
4183
4184 shiftCount = clz64(zSig) - 1;
4185 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4186 status);
4187
4188 }
4189
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* floatx80 keeps its integer bit explicit, so the leading one goes
     * all the way up to bit 63. */
    int8_t lead = clz64(aSig);

    *zExpPtr = 1 - lead;
    *zSigPtr = aSig << lead;
}
4206
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision path: round zSig0 at the double- or single-
     * precision bit position; roundMask covers the bits to discard. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold zSig1 into zSig0's sticky bit; zSig1 plays no further role here. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;  /* keep the half-ulp increment chosen above */
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Single unsigned compare catches zExp <= 0 and zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny before rounding, or still tiny after adding the
             * increment (no carry out of bit 63). */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit, promoting
             * the subnormal to the smallest normal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Nearest-even tie: also clear the result's low kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit-significand path: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);  /* top bit of zSig1 = half ulp */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes clamp to the largest finite value at this
             * precision; the others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the rounding decision: zSig1 changed above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even: force the low bit even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Carry into the integer bit promotes to smallest normal. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize to 1.0 * 2^(zExp+1). */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4418
4419 /*----------------------------------------------------------------------------
4420 | Takes an abstract floating-point value having sign `zSign', exponent
4421 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4422 | and returns the proper extended double-precision floating-point value
4423 | corresponding to the abstract input. This routine is just like
4424 | `roundAndPackFloatx80' except that the input significand does not have to be
4425 | normalized.
4426 *----------------------------------------------------------------------------*/
4427
4428 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4429 bool zSign, int32_t zExp,
4430 uint64_t zSig0, uint64_t zSig1,
4431 float_status *status)
4432 {
4433 int8_t shiftCount;
4434
4435 if ( zSig0 == 0 ) {
4436 zSig0 = zSig1;
4437 zSig1 = 0;
4438 zExp -= 64;
4439 }
4440 shiftCount = clz64(zSig0);
4441 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4442 zExp -= shiftCount;
4443 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4444 zSig0, zSig1, status);
4445
4446 }
4447
4448 /*----------------------------------------------------------------------------
4449 | Returns the least-significant 64 fraction bits of the quadruple-precision
4450 | floating-point value `a'.
4451 *----------------------------------------------------------------------------*/
4452
4453 static inline uint64_t extractFloat128Frac1( float128 a )
4454 {
4455
4456 return a.low;
4457
4458 }
4459
4460 /*----------------------------------------------------------------------------
4461 | Returns the most-significant 48 fraction bits of the quadruple-precision
4462 | floating-point value `a'.
4463 *----------------------------------------------------------------------------*/
4464
4465 static inline uint64_t extractFloat128Frac0( float128 a )
4466 {
4467
4468 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4469
4470 }
4471
4472 /*----------------------------------------------------------------------------
4473 | Returns the exponent bits of the quadruple-precision floating-point value
4474 | `a'.
4475 *----------------------------------------------------------------------------*/
4476
4477 static inline int32_t extractFloat128Exp( float128 a )
4478 {
4479
4480 return ( a.high>>48 ) & 0x7FFF;
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4486 *----------------------------------------------------------------------------*/
4487
4488 static inline bool extractFloat128Sign(float128 a)
4489 {
4490 return a.high >> 63;
4491 }
4492
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Leading one is in the low word.  "- 15" targets the integer-bit
         * position of the 49-bit high part of the significand. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Leading one is already above the target: split aSig1, the
             * top -shiftCount bits go to zSig0, the rest stays in zSig1
             * ("& 63" keeps the left-shift amount in range). */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        /* Exponent reflects the extra 64-bit offset of the low word. */
        *zExpPtr = - shiftCount - 63;
    }
    else {
        /* Leading one is in the high word: one 128-bit left shift does it. */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
4533
4534 /*----------------------------------------------------------------------------
4535 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4536 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4537 | floating-point value, returning the result. After being shifted into the
4538 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4539 | added together to form the most significant 32 bits of the result. This
4540 | means that any integer portion of `zSig0' will be added into the exponent.
4541 | Since a properly normalized significand will have an integer portion equal
4542 | to 1, the `zExp' input should be 1 less than the desired result exponent
4543 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4544 | significand.
4545 *----------------------------------------------------------------------------*/
4546
4547 static inline float128
4548 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4549 {
4550 float128 z;
4551
4552 z.low = zSig1;
4553 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4554 return z;
4555 }
4556
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits; decide whether to add one ulp
     * to the zSig0:zSig1 pair. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);  /* top bit of zSig2 = half ulp */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only if the result's low bit would otherwise be even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Single unsigned compare catches both zExp >= 0x7FFD (near overflow)
     * and zExp < 0 (subnormal range). */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes and to-odd clamp to the largest finite
             * value; the others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny before rounding, or still below the minimum normal
             * after the potential increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the rounding decision: zSig1/zSig2 changed above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie in nearest-even: force the low bit even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;  /* rounded down to zero */
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4689
4690 /*----------------------------------------------------------------------------
4691 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4692 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4693 | returns the proper quadruple-precision floating-point value corresponding
4694 | to the abstract input. This routine is just like `roundAndPackFloat128'
4695 | except that the input significand has fewer bits and does not have to be
4696 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4697 | point exponent.
4698 *----------------------------------------------------------------------------*/
4699
4700 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4701 uint64_t zSig0, uint64_t zSig1,
4702 float_status *status)
4703 {
4704 int8_t shiftCount;
4705 uint64_t zSig2;
4706
4707 if ( zSig0 == 0 ) {
4708 zSig0 = zSig1;
4709 zSig1 = 0;
4710 zExp -= 64;
4711 }
4712 shiftCount = clz64(zSig0) - 15;
4713 if ( 0 <= shiftCount ) {
4714 zSig2 = 0;
4715 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4716 }
4717 else {
4718 shift128ExtraRightJamming(
4719 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4720 }
4721 zExp -= shiftCount;
4722 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4723
4724 }
4725
4726
4727 /*----------------------------------------------------------------------------
4728 | Returns the result of converting the 32-bit two's complement integer `a'
4729 | to the extended double-precision floating-point format. The conversion
4730 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4731 | Arithmetic.
4732 *----------------------------------------------------------------------------*/
4733
4734 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4735 {
4736 bool zSign;
4737 uint32_t absA;
4738 int8_t shiftCount;
4739 uint64_t zSig;
4740
4741 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4742 zSign = ( a < 0 );
4743 absA = zSign ? - a : a;
4744 shiftCount = clz32(absA) + 32;
4745 zSig = absA;
4746 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4747
4748 }
4749
4750 /*----------------------------------------------------------------------------
4751 | Returns the result of converting the 32-bit two's complement integer `a' to
4752 | the quadruple-precision floating-point format. The conversion is performed
4753 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755
4756 float128 int32_to_float128(int32_t a, float_status *status)
4757 {
4758 bool zSign;
4759 uint32_t absA;
4760 int8_t shiftCount;
4761 uint64_t zSig0;
4762
4763 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4764 zSign = ( a < 0 );
4765 absA = zSign ? - a : a;
4766 shiftCount = clz32(absA) + 17;
4767 zSig0 = absA;
4768 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4769
4770 }
4771
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of converting the 64-bit two's complement integer `a'
4774 | to the extended double-precision floating-point format. The conversion
4775 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4776 | Arithmetic.
4777 *----------------------------------------------------------------------------*/
4778
4779 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4780 {
4781 bool zSign;
4782 uint64_t absA;
4783 int8_t shiftCount;
4784
4785 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4786 zSign = ( a < 0 );
4787 absA = zSign ? - a : a;
4788 shiftCount = clz64(absA);
4789 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4790
4791 }
4792
4793 /*----------------------------------------------------------------------------
4794 | Returns the result of converting the 64-bit two's complement integer `a' to
4795 | the quadruple-precision floating-point format. The conversion is performed
4796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4798
4799 float128 int64_to_float128(int64_t a, float_status *status)
4800 {
4801 bool zSign;
4802 uint64_t absA;
4803 int8_t shiftCount;
4804 int32_t zExp;
4805 uint64_t zSig0, zSig1;
4806
4807 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4808 zSign = ( a < 0 );
4809 absA = zSign ? - a : a;
4810 shiftCount = clz64(absA) + 49;
4811 zExp = 0x406E - shiftCount;
4812 if ( 64 <= shiftCount ) {
4813 zSig1 = 0;
4814 zSig0 = absA;
4815 shiftCount -= 64;
4816 }
4817 else {
4818 zSig1 = absA;
4819 zSig0 = 0;
4820 }
4821 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4822 return packFloat128( zSign, zExp, zSig0, zSig1 );
4823
4824 }
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit unsigned integer `a'
4828 | to the quadruple-precision floating-point format. The conversion is performed
4829 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4831
4832 float128 uint64_to_float128(uint64_t a, float_status *status)
4833 {
4834 if (a == 0) {
4835 return float128_zero;
4836 }
4837 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4838 }
4839
4840 /*----------------------------------------------------------------------------
4841 | Returns the result of converting the single-precision floating-point value
4842 | `a' to the extended double-precision floating-point format. The conversion
4843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4844 | Arithmetic.
4845 *----------------------------------------------------------------------------*/
4846
4847 floatx80 float32_to_floatx80(float32 a, float_status *status)
4848 {
4849 bool aSign;
4850 int aExp;
4851 uint32_t aSig;
4852
4853 a = float32_squash_input_denormal(a, status);
4854 aSig = extractFloat32Frac( a );
4855 aExp = extractFloat32Exp( a );
4856 aSign = extractFloat32Sign( a );
4857 if ( aExp == 0xFF ) {
4858 if (aSig) {
4859 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4860 status);
4861 return floatx80_silence_nan(res, status);
4862 }
4863 return packFloatx80(aSign,
4864 floatx80_infinity_high,
4865 floatx80_infinity_low);
4866 }
4867 if ( aExp == 0 ) {
4868 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4869 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4870 }
4871 aSig |= 0x00800000;
4872 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4873
4874 }
4875
4876 /*----------------------------------------------------------------------------
4877 | Returns the result of converting the single-precision floating-point value
4878 | `a' to the double-precision floating-point format. The conversion is
4879 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4880 | Arithmetic.
4881 *----------------------------------------------------------------------------*/
4882
4883 float128 float32_to_float128(float32 a, float_status *status)
4884 {
4885 bool aSign;
4886 int aExp;
4887 uint32_t aSig;
4888
4889 a = float32_squash_input_denormal(a, status);
4890 aSig = extractFloat32Frac( a );
4891 aExp = extractFloat32Exp( a );
4892 aSign = extractFloat32Sign( a );
4893 if ( aExp == 0xFF ) {
4894 if (aSig) {
4895 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4896 }
4897 return packFloat128( aSign, 0x7FFF, 0, 0 );
4898 }
4899 if ( aExp == 0 ) {
4900 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4901 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4902 --aExp;
4903 }
4904 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4905
4906 }
4907
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* rem(NaN, x) / rem(inf, x): NaN operands propagate; an infinite
     * dividend is an invalid operation. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* rem(x, inf) = x; rem(x, NaN) propagates the NaN. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    /* rem(x, 0) is invalid; a subnormal divisor is normalized first. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;  /* rem(0, x) = 0, sign preserved */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the hidden bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: the whole quotient fits in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| far smaller than |b|: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively, 62 quotient bits
         * per pass, using the 128/64 division estimate (which may be up
         * to 2 too large, hence the "- 2" correction). */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial step for the remaining expDiff bits. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* The quotient estimate was conservative: subtract bSig until aSig
     * goes negative, keeping the last non-negative partial remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* IEEE remainder rounds the quotient to nearest-even: pick whichever
     * of the two candidate remainders is closer (ties go to even q). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5009
5010
5011
5012 /*----------------------------------------------------------------------------
5013 | Returns the binary exponential of the single-precision floating-point value
5014 | `a'. The operation is performed according to the IEC/IEEE Standard for
5015 | Binary Floating-Point Arithmetic.
5016 |
5017 | Uses the following identities:
5018 |
5019 | 1. -------------------------------------------------------------------------
5020 | x x*ln(2)
5021 | 2 = e
5022 |
5023 | 2. -------------------------------------------------------------------------
5024 | 2 3 4 5 n
5025 | x x x x x x x
5026 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5027 | 1! 2! 3! 4! 5! n!
5028 *----------------------------------------------------------------------------*/
5029
/* Taylor-series coefficients 1/n! for n = 1..15, as float64 bit patterns;
 * consumed by float32_exp2() below to evaluate e**x. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5048
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2**-Inf = 0, 2**+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2**+-0 = 1 exactly. */
        if (aSig == 0) return float32_one;
    }

    /* The series result is never exact for nonzero finite input. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2**a = e**(a*ln2); evaluate e**x by its Taylor series
     * (identities 1 and 2 in the header comment above). */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        /* r += x**(i+1) / (i+1)!, with xn tracking x**(i+1). */
        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5093
5094 /*----------------------------------------------------------------------------
5095 | Returns the binary log of the single-precision floating-point value `a'.
5096 | The operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+-0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    /* Integer part of the result is the unbiased exponent. */
    zSig = aExp << 23;

    /* Extract fraction bits of log2(significand) by repeated squaring:
     * squaring doubles the log, pushing one more bit past the binary
     * point each iteration; an overflow past bit 24 means that bit is 1. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    /* zSig is a fixed-point value scaled by 2**23; 0x85 rebiases it. */
    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
5143
5144 /*----------------------------------------------------------------------------
5145 | Returns the result of converting the double-precision floating-point value
5146 | `a' to the extended double-precision floating-point format. The conversion
5147 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5148 | Arithmetic.
5149 *----------------------------------------------------------------------------*/
5150
5151 floatx80 float64_to_floatx80(float64 a, float_status *status)
5152 {
5153 bool aSign;
5154 int aExp;
5155 uint64_t aSig;
5156
5157 a = float64_squash_input_denormal(a, status);
5158 aSig = extractFloat64Frac( a );
5159 aExp = extractFloat64Exp( a );
5160 aSign = extractFloat64Sign( a );
5161 if ( aExp == 0x7FF ) {
5162 if (aSig) {
5163 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5164 status);
5165 return floatx80_silence_nan(res, status);
5166 }
5167 return packFloatx80(aSign,
5168 floatx80_infinity_high,
5169 floatx80_infinity_low);
5170 }
5171 if ( aExp == 0 ) {
5172 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5173 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5174 }
5175 return
5176 packFloatx80(
5177 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5178
5179 }
5180
5181 /*----------------------------------------------------------------------------
5182 | Returns the result of converting the double-precision floating-point value
5183 | `a' to the quadruple-precision floating-point format. The conversion is
5184 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5185 | Arithmetic.
5186 *----------------------------------------------------------------------------*/
5187
5188 float128 float64_to_float128(float64 a, float_status *status)
5189 {
5190 bool aSign;
5191 int aExp;
5192 uint64_t aSig, zSig0, zSig1;
5193
5194 a = float64_squash_input_denormal(a, status);
5195 aSig = extractFloat64Frac( a );
5196 aExp = extractFloat64Exp( a );
5197 aSign = extractFloat64Sign( a );
5198 if ( aExp == 0x7FF ) {
5199 if (aSig) {
5200 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5201 }
5202 return packFloat128( aSign, 0x7FFF, 0, 0 );
5203 }
5204 if ( aExp == 0 ) {
5205 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5206 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5207 --aExp;
5208 }
5209 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5210 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5211
5212 }
5213
5214
5215 /*----------------------------------------------------------------------------
5216 | Returns the remainder of the double-precision floating-point value `a'
5217 | with respect to the corresponding value `b'. The operation is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(Inf, y) is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(x, Inf) = x for finite x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit, left-aligned in 64 bits. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce iteratively, retiring up to 62 quotient bits per
     * estimateDiv128To64() pass. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Bias the estimate low so the partial remainder stays >= 0. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the true quotient, then pick whichever neighbouring
     * remainder is nearer (ties to even quotient), per the IEEE
     * round-to-nearest remainder definition. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5302
5303 /*----------------------------------------------------------------------------
5304 | Returns the binary log of the double-precision floating-point value `a'.
5305 | The operation is performed according to the IEC/IEEE Standard for Binary
5306 | Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+-0) = -Inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    /* Integer part of the result is the unbiased exponent. */
    zSig = (uint64_t)aExp << 52;
    /* Extract fraction bits of log2(significand) by repeated squaring:
     * squaring doubles the log, pushing one more bit past the binary
     * point each iteration; an overflow past bit 53 means that bit is 1. */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    /* zSig is a fixed-point value scaled by 2**52; 0x408 rebiases it. */
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5351
5352 /*----------------------------------------------------------------------------
5353 | Returns the result of converting the extended double-precision floating-
5354 | point value `a' to the 32-bit two's complement integer format. The
5355 | conversion is performed according to the IEC/IEEE Standard for Binary
5356 | Floating-Point Arithmetic---which means in particular that the conversion
5357 | is rounded according to the current rounding mode. If `a' is a NaN, the
5358 | largest positive integer is returned. Otherwise, if the conversion
5359 | overflows, the largest integer with the same sign as `a' is returned.
5360 *----------------------------------------------------------------------------*/
5361
5362 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5363 {
5364 bool aSign;
5365 int32_t aExp, shiftCount;
5366 uint64_t aSig;
5367
5368 if (floatx80_invalid_encoding(a)) {
5369 float_raise(float_flag_invalid, status);
5370 return 1 << 31;
5371 }
5372 aSig = extractFloatx80Frac( a );
5373 aExp = extractFloatx80Exp( a );
5374 aSign = extractFloatx80Sign( a );
5375 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5376 shiftCount = 0x4037 - aExp;
5377 if ( shiftCount <= 0 ) shiftCount = 1;
5378 shift64RightJamming( aSig, shiftCount, &aSig );
5379 return roundAndPackInt32(aSign, aSig, status);
5380
5381 }
5382
5383 /*----------------------------------------------------------------------------
5384 | Returns the result of converting the extended double-precision floating-
5385 | point value `a' to the 32-bit two's complement integer format. The
5386 | conversion is performed according to the IEC/IEEE Standard for Binary
5387 | Floating-Point Arithmetic, except that the conversion is always rounded
5388 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5389 | Otherwise, if the conversion overflows, the largest integer with the same
5390 | sign as `a' is returned.
5391 *----------------------------------------------------------------------------*/
5392
5393 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5394 {
5395 bool aSign;
5396 int32_t aExp, shiftCount;
5397 uint64_t aSig, savedASig;
5398 int32_t z;
5399
5400 if (floatx80_invalid_encoding(a)) {
5401 float_raise(float_flag_invalid, status);
5402 return 1 << 31;
5403 }
5404 aSig = extractFloatx80Frac( a );
5405 aExp = extractFloatx80Exp( a );
5406 aSign = extractFloatx80Sign( a );
5407 if ( 0x401E < aExp ) {
5408 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5409 goto invalid;
5410 }
5411 else if ( aExp < 0x3FFF ) {
5412 if (aExp || aSig) {
5413 float_raise(float_flag_inexact, status);
5414 }
5415 return 0;
5416 }
5417 shiftCount = 0x403E - aExp;
5418 savedASig = aSig;
5419 aSig >>= shiftCount;
5420 z = aSig;
5421 if ( aSign ) z = - z;
5422 if ( ( z < 0 ) ^ aSign ) {
5423 invalid:
5424 float_raise(float_flag_invalid, status);
5425 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5426 }
5427 if ( ( aSig<<shiftCount ) != savedASig ) {
5428 float_raise(float_flag_inexact, status);
5429 }
5430 return z;
5431
5432 }
5433
5434 /*----------------------------------------------------------------------------
5435 | Returns the result of converting the extended double-precision floating-
5436 | point value `a' to the 64-bit two's complement integer format. The
5437 | conversion is performed according to the IEC/IEEE Standard for Binary
5438 | Floating-Point Arithmetic---which means in particular that the conversion
5439 | is rounded according to the current rounding mode. If `a' is a NaN,
5440 | the largest positive integer is returned. Otherwise, if the conversion
5441 | overflows, the largest integer with the same sign as `a' is returned.
5442 *----------------------------------------------------------------------------*/
5443
5444 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5445 {
5446 bool aSign;
5447 int32_t aExp, shiftCount;
5448 uint64_t aSig, aSigExtra;
5449
5450 if (floatx80_invalid_encoding(a)) {
5451 float_raise(float_flag_invalid, status);
5452 return 1ULL << 63;
5453 }
5454 aSig = extractFloatx80Frac( a );
5455 aExp = extractFloatx80Exp( a );
5456 aSign = extractFloatx80Sign( a );
5457 shiftCount = 0x403E - aExp;
5458 if ( shiftCount <= 0 ) {
5459 if ( shiftCount ) {
5460 float_raise(float_flag_invalid, status);
5461 if (!aSign || floatx80_is_any_nan(a)) {
5462 return INT64_MAX;
5463 }
5464 return INT64_MIN;
5465 }
5466 aSigExtra = 0;
5467 }
5468 else {
5469 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5470 }
5471 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5472
5473 }
5474
5475 /*----------------------------------------------------------------------------
5476 | Returns the result of converting the extended double-precision floating-
5477 | point value `a' to the 64-bit two's complement integer format. The
5478 | conversion is performed according to the IEC/IEEE Standard for Binary
5479 | Floating-Point Arithmetic, except that the conversion is always rounded
5480 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5481 | Otherwise, if the conversion overflows, the largest integer with the same
5482 | sign as `a' is returned.
5483 *----------------------------------------------------------------------------*/
5484
5485 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5486 {
5487 bool aSign;
5488 int32_t aExp, shiftCount;
5489 uint64_t aSig;
5490 int64_t z;
5491
5492 if (floatx80_invalid_encoding(a)) {
5493 float_raise(float_flag_invalid, status);
5494 return 1ULL << 63;
5495 }
5496 aSig = extractFloatx80Frac( a );
5497 aExp = extractFloatx80Exp( a );
5498 aSign = extractFloatx80Sign( a );
5499 shiftCount = aExp - 0x403E;
5500 if ( 0 <= shiftCount ) {
5501 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5502 if ( ( a.high != 0xC03E ) || aSig ) {
5503 float_raise(float_flag_invalid, status);
5504 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5505 return INT64_MAX;
5506 }
5507 }
5508 return INT64_MIN;
5509 }
5510 else if ( aExp < 0x3FFF ) {
5511 if (aExp | aSig) {
5512 float_raise(float_flag_inexact, status);
5513 }
5514 return 0;
5515 }
5516 z = aSig>>( - shiftCount );
5517 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5518 float_raise(float_flag_inexact, status);
5519 }
5520 if ( aSign ) z = - z;
5521 return z;
5522
5523 }
5524
5525 /*----------------------------------------------------------------------------
5526 | Returns the result of converting the extended double-precision floating-
5527 | point value `a' to the single-precision floating-point format. The
5528 | conversion is performed according to the IEC/IEEE Standard for Binary
5529 | Floating-Point Arithmetic.
5530 *----------------------------------------------------------------------------*/
5531
5532 float32 floatx80_to_float32(floatx80 a, float_status *status)
5533 {
5534 bool aSign;
5535 int32_t aExp;
5536 uint64_t aSig;
5537
5538 if (floatx80_invalid_encoding(a)) {
5539 float_raise(float_flag_invalid, status);
5540 return float32_default_nan(status);
5541 }
5542 aSig = extractFloatx80Frac( a );
5543 aExp = extractFloatx80Exp( a );
5544 aSign = extractFloatx80Sign( a );
5545 if ( aExp == 0x7FFF ) {
5546 if ( (uint64_t) ( aSig<<1 ) ) {
5547 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5548 status);
5549 return float32_silence_nan(res, status);
5550 }
5551 return packFloat32( aSign, 0xFF, 0 );
5552 }
5553 shift64RightJamming( aSig, 33, &aSig );
5554 if ( aExp || aSig ) aExp -= 0x3F81;
5555 return roundAndPackFloat32(aSign, aExp, aSig, status);
5556
5557 }
5558
5559 /*----------------------------------------------------------------------------
5560 | Returns the result of converting the extended double-precision floating-
5561 | point value `a' to the double-precision floating-point format. The
5562 | conversion is performed according to the IEC/IEEE Standard for Binary
5563 | Floating-Point Arithmetic.
5564 *----------------------------------------------------------------------------*/
5565
5566 float64 floatx80_to_float64(floatx80 a, float_status *status)
5567 {
5568 bool aSign;
5569 int32_t aExp;
5570 uint64_t aSig, zSig;
5571
5572 if (floatx80_invalid_encoding(a)) {
5573 float_raise(float_flag_invalid, status);
5574 return float64_default_nan(status);
5575 }
5576 aSig = extractFloatx80Frac( a );
5577 aExp = extractFloatx80Exp( a );
5578 aSign = extractFloatx80Sign( a );
5579 if ( aExp == 0x7FFF ) {
5580 if ( (uint64_t) ( aSig<<1 ) ) {
5581 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5582 status);
5583 return float64_silence_nan(res, status);
5584 }
5585 return packFloat64( aSign, 0x7FF, 0 );
5586 }
5587 shift64RightJamming( aSig, 1, &zSig );
5588 if ( aExp || aSig ) aExp -= 0x3C01;
5589 return roundAndPackFloat64(aSign, aExp, zSig, status);
5590
5591 }
5592
5593 /*----------------------------------------------------------------------------
5594 | Returns the result of converting the extended double-precision floating-
5595 | point value `a' to the quadruple-precision floating-point format. The
5596 | conversion is performed according to the IEC/IEEE Standard for Binary
5597 | Floating-Point Arithmetic.
5598 *----------------------------------------------------------------------------*/
5599
5600 float128 floatx80_to_float128(floatx80 a, float_status *status)
5601 {
5602 bool aSign;
5603 int aExp;
5604 uint64_t aSig, zSig0, zSig1;
5605
5606 if (floatx80_invalid_encoding(a)) {
5607 float_raise(float_flag_invalid, status);
5608 return float128_default_nan(status);
5609 }
5610 aSig = extractFloatx80Frac( a );
5611 aExp = extractFloatx80Exp( a );
5612 aSign = extractFloatx80Sign( a );
5613 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5614 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5615 status);
5616 return float128_silence_nan(res, status);
5617 }
5618 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5619 return packFloat128( aSign, aExp, zSig0, zSig1 );
5620
5621 }
5622
5623 /*----------------------------------------------------------------------------
5624 | Rounds the extended double-precision floating-point value `a'
5625 | to the precision provided by floatx80_rounding_precision and returns the
5626 | result as an extended double-precision floating-point value.
5627 | The operation is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic.
5629 *----------------------------------------------------------------------------*/
5630
5631 floatx80 floatx80_round(floatx80 a, float_status *status)
5632 {
5633 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5634 extractFloatx80Sign(a),
5635 extractFloatx80Exp(a),
5636 extractFloatx80Frac(a), 0, status);
5637 }
5638
5639 /*----------------------------------------------------------------------------
5640 | Rounds the extended double-precision floating-point value `a' to an integer,
5641 | and returns the result as an extended quadruple-precision floating-point
5642 | value. The operation is performed according to the IEC/IEEE Standard for
5643 | Binary Floating-Point Arithmetic.
5644 *----------------------------------------------------------------------------*/
5645
floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* |a| >= 2**63: already an integer (or NaN/Inf). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: the result is 0 or +-1, chosen by rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Only |a| > 0.5 rounds away from zero; exactly 0.5 ties
             * to the even value 0. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* |a| >= 0.5 rounds away from zero. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: mask off the fraction bits below the integer's
     * least significant bit. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* An exact tie: clear the LSB to round to even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* Carry out of the fraction: bump the exponent and restore the
         * explicit integer bit. */
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
5741
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of adding the absolute values of the extended double-
5744 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5745 | negated before being returned. `zSign' is ignored if the result is a NaN.
5746 | The addition is performed according to the IEC/IEEE Standard for Binary
5747 | Floating-Point Arithmetic.
5748 *----------------------------------------------------------------------------*/
5749
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to it. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit set means the sum is already normalized (no carry out,
     * since both operands had their explicit integer bit set). */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the addition: shift right one and bump the exponent,
     * restoring the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
5821
5822 /*----------------------------------------------------------------------------
5823 | Returns the result of subtracting the absolute values of the extended
5824 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5825 | difference is negated before being returned. `zSign' is ignored if the
5826 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5827 | Standard for Binary Floating-Point Arithmetic.
5828 *----------------------------------------------------------------------------*/
5829
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - Inf: infinity with the result sign flipped. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, sign unchanged. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5890
5891 /*----------------------------------------------------------------------------
5892 | Returns the result of adding the extended double-precision floating-point
5893 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5894 | Standard for Binary Floating-Point Arithmetic.
5895 *----------------------------------------------------------------------------*/
5896
5897 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5898 {
5899 bool aSign, bSign;
5900
5901 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5902 float_raise(float_flag_invalid, status);
5903 return floatx80_default_nan(status);
5904 }
5905 aSign = extractFloatx80Sign( a );
5906 bSign = extractFloatx80Sign( b );
5907 if ( aSign == bSign ) {
5908 return addFloatx80Sigs(a, b, aSign, status);
5909 }
5910 else {
5911 return subFloatx80Sigs(a, b, aSign, status);
5912 }
5913
5914 }
5915
5916 /*----------------------------------------------------------------------------
5917 | Returns the result of subtracting the extended double-precision floating-
5918 | point values `a' and `b'. The operation is performed according to the
5919 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5920 *----------------------------------------------------------------------------*/
5921
5922 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5923 {
5924 bool aSign, bSign;
5925
5926 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5927 float_raise(float_flag_invalid, status);
5928 return floatx80_default_nan(status);
5929 }
5930 aSign = extractFloatx80Sign( a );
5931 bSign = extractFloatx80Sign( b );
5932 if ( aSign == bSign ) {
5933 return subFloatx80Sigs(a, b, aSign, status);
5934 }
5935 else {
5936 return addFloatx80Sigs(a, b, aSign, status);
5937 }
5938
5939 }
5940
5941 /*----------------------------------------------------------------------------
5942 | Returns the result of multiplying the extended double-precision floating-
5943 | point values `a' and `b'. The operation is performed according to the
5944 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5945 *----------------------------------------------------------------------------*/
5946
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Reject invalid (unnormal/pseudo-denormal) 80-bit encodings up front. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* Result sign is the XOR of the operand signs (also for NaN-free cases). */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* 'a' is NaN or infinity; NaN operands propagate. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* infinity * zero is an invalid operation. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* zero * infinity is likewise invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Bring subnormal operands to normalized form (zero short-circuits). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* 0x3FFE (not 0x3FFF): the 128-bit product of two 1.x significands is
     * in [1, 4), so one extra normalization step below may still be needed. */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product in [1, 2): shift left one to set the integer bit of zSig0. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6002
6003 /*----------------------------------------------------------------------------
6004 | Returns the result of dividing the extended double-precision floating-point
6005 | value `a' by the corresponding value `b'. The operation is performed
6006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6007 *----------------------------------------------------------------------------*/
6008
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Reject invalid (unnormal/pseudo-denormal) 80-bit encodings up front. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        /* infinity / finite = infinity. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity = signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; nonzero / 0 signals divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-scale the dividend so the quotient significand is below 2. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then correct the (possibly slightly
     * high) estimate until the partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; the exact correction is only needed when the
     * result is so close to a rounding boundary that it could matter. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any residual remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6089
6090 /*----------------------------------------------------------------------------
6091 | Returns the remainder of the extended double-precision floating-point value
6092 | `a' with respect to the corresponding value `b'. The operation is performed
6093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6094 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6095 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6096 | the absolute value of the integer quotient.
6097 *----------------------------------------------------------------------------*/
6098
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    /* The quotient output must be defined on every exit path. */
    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* aExpOrig is kept so pseudo-denormals (exp 0, integer bit set) can be
     * recognized after aExp has been rewritten by normalization. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* remainder of infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite rem infinity = the finite operand, unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* remainder by zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: 'a' is already the remainder (quotient 0). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        /* expDiff == -1: align significands and proceed with one step. */
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit; reduce the dividend accordingly. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    /* Restoring long division, 62 quotient bits per iteration. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* Bias the (possibly high) estimate low to keep the remainder
         * non-negative; the final correction step below makes it exact. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: fewer than 64 quotient bits remain. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Correct the under-estimate until the remainder is below b. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /*
         * IEEE remainder rounds the quotient to nearest (ties to even):
         * if b - r is closer than r (or equidistant with an odd quotient),
         * use the negated alternative remainder instead.
         */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    /* Precision 80: pack the exact remainder without double rounding. */
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6219
6220 /*----------------------------------------------------------------------------
6221 | Returns the remainder of the extended double-precision floating-point value
6222 | `a' with respect to the corresponding value `b'. The operation is performed
6223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6225
6226 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6227 {
6228 uint64_t quotient;
6229 return floatx80_modrem(a, b, false, &quotient, status);
6230 }
6231
6232 /*----------------------------------------------------------------------------
6233 | Returns the remainder of the extended double-precision floating-point value
6234 | `a' with respect to the corresponding value `b', with the quotient truncated
6235 | toward zero.
6236 *----------------------------------------------------------------------------*/
6237
6238 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6239 {
6240 uint64_t quotient;
6241 return floatx80_modrem(a, b, true, &quotient, status);
6242 }
6243
6244 /*----------------------------------------------------------------------------
6245 | Returns the square root of the extended double-precision floating-point
6246 | value `a'. The operation is performed according to the IEC/IEEE Standard
6247 | for Binary Floating-Point Arithmetic.
6248 *----------------------------------------------------------------------------*/
6249
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Reject invalid (unnormal/pseudo-denormal) 80-bit encodings. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+inf) = +inf; sqrt(-inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative input is invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is half the unbiased input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* 32-bit seed estimate of the root, refined to 64 bits below. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Align the radicand: odd exponents need one extra right shift. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    /* Newton-style refinement: (x/est + est) / 2 via the <<30 scaling. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Exact correction: decrement until zSig0^2 <= radicand. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 bits of the root; exact correction only when near a
     * rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any residual remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Recombine the two halves of the 128-bit root significand. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6314
6315 /*----------------------------------------------------------------------------
6316 | Returns the result of converting the quadruple-precision floating-point
6317 | value `a' to the 32-bit two's complement integer format. The conversion
6318 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6319 | Arithmetic---which means in particular that the conversion is rounded
6320 | according to the current rounding mode. If `a' is a NaN, the largest
6321 | positive integer is returned. Otherwise, if the conversion overflows, the
6322 | largest integer with the same sign as `a' is returned.
6323 *----------------------------------------------------------------------------*/
6324
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN: force the sign positive so the largest positive integer
     * is returned, per the contract above. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Restore the implicit integer bit for non-zero exponents. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into the sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    /* roundAndPackInt32 applies the current rounding mode and handles
     * overflow. */
    return roundAndPackInt32(aSign, aSig0, status);

}
6343
6344 /*----------------------------------------------------------------------------
6345 | Returns the result of converting the quadruple-precision floating-point
6346 | value `a' to the 32-bit two's complement integer format. The conversion
6347 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6348 | Arithmetic, except that the conversion is always rounded toward zero. If
6349 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6350 | conversion overflows, the largest integer with the same sign as `a' is
6351 | returned.
6352 *----------------------------------------------------------------------------*/
6353
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into the sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31 (or NaN): overflow.  NaN forces positive
         * so INT32_MAX is returned. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* Restore the implicit integer bit and truncate. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign flip after negation means the value did not fit in int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the conversion was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6393
6394 /*----------------------------------------------------------------------------
6395 | Returns the result of converting the quadruple-precision floating-point
6396 | value `a' to the 64-bit two's complement integer format. The conversion
6397 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6398 | Arithmetic---which means in particular that the conversion is rounded
6399 | according to the current rounding mode. If `a' is a NaN, the largest
6400 | positive integer is returned. Otherwise, if the conversion overflows, the
6401 | largest integer with the same sign as `a' is returned.
6402 *----------------------------------------------------------------------------*/
6403
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit for non-zero exponents. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude >= 2^63: overflow.  Only -2^63 exactly (or a
             * negative non-NaN) maps to INT64_MIN; NaN and positive
             * overflow yield INT64_MAX. */
            float_raise(float_flag_invalid, status);
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Keep shifted-out bits jammed for correct rounding. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6436
6437 /*----------------------------------------------------------------------------
6438 | Returns the result of converting the quadruple-precision floating-point
6439 | value `a' to the 64-bit two's complement integer format. The conversion
6440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6441 | Arithmetic, except that the conversion is always rounded toward zero.
6442 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6443 | the conversion overflows, the largest integer with the same sign as `a' is
6444 | returned.
6445 *----------------------------------------------------------------------------*/
6446
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit for non-zero exponents. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Values in [-2^63, -2^63 + 2) truncate to INT64_MIN;
             * inexact only when fraction bits are discarded. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Genuine overflow (or NaN, which forces INT64_MAX). */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Integer part straddles both significand words. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Discarded bits (in either word) make the result inexact. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6499
6500 /*----------------------------------------------------------------------------
6501 | Returns the result of converting the quadruple-precision floating-point value
6502 | `a' to the 64-bit unsigned integer format. The conversion is
6503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6504 | Arithmetic---which means in particular that the conversion is rounded
6505 | according to the current rounding mode. If `a' is a NaN, the largest
6506 | positive integer is returned. If the conversion overflows, the
6507 | largest unsigned integer is returned. If 'a' is negative, the value is
6508 | rounded and zero is returned; negative values that do not round to zero
6509 | will raise the inexact exception.
6510 *----------------------------------------------------------------------------*/
6511
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values <= -1 (and NaN, which has the sign bit free) are
     * invalid: NaN yields UINT64_MAX, other negatives yield 0.  Negative
     * values in (-1, 0) fall through to the rounding path below. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Restore the implicit integer bit for non-zero exponents. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude >= 2^64: overflow to UINT64_MAX. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Keep shifted-out bits jammed for correct rounding. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6546
6547 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6548 {
6549 uint64_t v;
6550 signed char current_rounding_mode = status->float_rounding_mode;
6551
6552 set_float_rounding_mode(float_round_to_zero, status);
6553 v = float128_to_uint64(a, status);
6554 set_float_rounding_mode(current_rounding_mode, status);
6555
6556 return v;
6557 }
6558
6559 /*----------------------------------------------------------------------------
6560 | Returns the result of converting the quadruple-precision floating-point
6561 | value `a' to the 32-bit unsigned integer format. The conversion
6562 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6563 | Arithmetic except that the conversion is always rounded toward zero.
6564 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6565 | if the conversion overflows, the largest unsigned integer is returned.
6566 | If 'a' is negative, the value is rounded and zero is returned; negative
6567 | values that do not round to zero will raise the inexact exception.
6568 *----------------------------------------------------------------------------*/
6569
6570 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6571 {
6572 uint64_t v;
6573 uint32_t res;
6574 int old_exc_flags = get_float_exception_flags(status);
6575
6576 v = float128_to_uint64_round_to_zero(a, status);
6577 if (v > 0xffffffff) {
6578 res = 0xffffffff;
6579 } else {
6580 return v;
6581 }
6582 set_float_exception_flags(old_exc_flags, status);
6583 float_raise(float_flag_invalid, status);
6584 return res;
6585 }
6586
6587 /*----------------------------------------------------------------------------
6588 | Returns the result of converting the quadruple-precision floating-point value
6589 | `a' to the 32-bit unsigned integer format. The conversion is
6590 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6591 | Arithmetic---which means in particular that the conversion is rounded
6592 | according to the current rounding mode. If `a' is a NaN, the largest
6593 | positive integer is returned. If the conversion overflows, the
6594 | largest unsigned integer is returned. If 'a' is negative, the value is
6595 | rounded and zero is returned; negative values that do not round to zero
6596 | will raise the inexact exception.
6597 *----------------------------------------------------------------------------*/
6598
6599 uint32_t float128_to_uint32(float128 a, float_status *status)
6600 {
6601 uint64_t v;
6602 uint32_t res;
6603 int old_exc_flags = get_float_exception_flags(status);
6604
6605 v = float128_to_uint64(a, status);
6606 if (v > 0xffffffff) {
6607 res = 0xffffffff;
6608 } else {
6609 return v;
6610 }
6611 set_float_exception_flags(old_exc_flags, status);
6612 float_raise(float_flag_invalid, status);
6613 return res;
6614 }
6615
6616 /*----------------------------------------------------------------------------
6617 | Returns the result of converting the quadruple-precision floating-point
6618 | value `a' to the single-precision floating-point format. The conversion
6619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6620 | Arithmetic.
6621 *----------------------------------------------------------------------------*/
6622
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN converts through the common-NaN form; infinity repacks. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low word into the sticky bit, then narrow the significand
     * to the float32 position (jamming shifted-out bits). */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    /* Non-zero values: set the explicit integer bit expected by
     * roundAndPackFloat32 and rebias the exponent. */
    if ( aExp || zSig ) {
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6650
6651 /*----------------------------------------------------------------------------
6652 | Returns the result of converting the quadruple-precision floating-point
6653 | value `a' to the double-precision floating-point format. The conversion
6654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6655 | Arithmetic.
6656 *----------------------------------------------------------------------------*/
6657
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN converts through the common-NaN form; infinity repacks. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Move the significand to the float64 position; the low word's
     * leftover bits become the sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    /* Non-zero values: set the explicit integer bit expected by
     * roundAndPackFloat64 and rebias the exponent. */
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6683
6684 /*----------------------------------------------------------------------------
6685 | Returns the result of converting the quadruple-precision floating-point
6686 | value `a' to the extended double-precision floating-point format. The
6687 | conversion is performed according to the IEC/IEEE Standard for Binary
6688 | Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6690
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert through common-NaN form, then quiet it
             * (format conversion must not signal the NaN payload). */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Restore the implicit integer bit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Move the significand to the floatx80 position (explicit integer
     * bit at bit 63) and round at full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6721
6722 /*----------------------------------------------------------------------------
6723 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6724 | returns the result as a quadruple-precision floating-point value. The
6725 | operation is performed according to the IEC/IEEE Standard for Binary
6726 | Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6728
float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Fraction bits (if any) live in the low significand word. */
        if ( 0x406F <= aExp ) {
            /* No fractional bits remain: already an integer, infinity,
             * or NaN (which propagates). */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest integer bit of z.low; it becomes
         * 0 when that bit is bit 0 of z.high (the shift wraps out). */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp, then clear the last bit on ties. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Last integer bit is bit 0 of z.high; the whole low word
                 * holds round bits. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            /* Truncation: just mask off the round bits below. */
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1: result is 0 or +/-1 depending on mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Exactly 0.5 rounds to 0 (even); above 0.5 rounds to 1. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Fraction bits live entirely in the high significand word;
         * a.low contributes only stickiness. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Ties (round bits and low word all zero) go to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the representation means the result was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6880
6881 /*----------------------------------------------------------------------------
6882 | Returns the result of adding the absolute values of the quadruple-precision
6883 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6884 | before being returned. `zSign' is ignored if the result is a NaN.
6885 | The addition is performed according to the IEC/IEEE Standard for Binary
6886 | Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6888
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    /* Unpack the exponent and the two 64-bit fraction halves of each input. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            /* a is Inf or NaN; a non-zero fraction marks the NaN case. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b is subnormal: no implicit integer bit, so shift one less. */
            --expDiff;
        }
        else {
            /* Restore b's implicit integer bit (bit 48 of the high word). */
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        /* Right-shift b, collecting the shifted-out bits into zSig2 for
         * sticky rounding. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: the mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            /* Both are Inf/NaN; any fraction bit anywhere means NaN. */
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal (or zero): the raw sum is already the result,
             * unless flush-to-zero mode squashes a non-zero denormal. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Adding two values with implicit bits set always carries into
         * bit 49, so normalize with a one-bit right shift below. */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* If the sum did not overflow past bit 49, it is already normalized. */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6971
6972 /*----------------------------------------------------------------------------
6973 | Returns the result of subtracting the absolute values of the quadruple-
6974 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6975 | difference is negated before being returned. `zSign' is ignored if the
6976 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6977 | Standard for Binary Floating-Point Arithmetic.
6978 *----------------------------------------------------------------------------*/
6979
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    /* Unpack exponents and 128-bit significand halves. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 so the subtraction keeps
     * extra precision; this is compensated by `zExp - 14' at pack time. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat both subnormals as exponent 1 (no implicit bit). */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to decide the sign of the difference. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exactly equal: the zero result is negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite - Inf: result is Inf with the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit bit sits at bit 62 after the 14-bit pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* |b| > |a| flips the result's sign. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Undo the 14-bit pre-shift via the exponent before packing. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7059
7060 /*----------------------------------------------------------------------------
7061 | Returns the result of adding the quadruple-precision floating-point values
7062 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7063 | for Binary Floating-Point Arithmetic.
7064 *----------------------------------------------------------------------------*/
7065
7066 float128 float128_add(float128 a, float128 b, float_status *status)
7067 {
7068 bool aSign, bSign;
7069
7070 aSign = extractFloat128Sign( a );
7071 bSign = extractFloat128Sign( b );
7072 if ( aSign == bSign ) {
7073 return addFloat128Sigs(a, b, aSign, status);
7074 }
7075 else {
7076 return subFloat128Sigs(a, b, aSign, status);
7077 }
7078
7079 }
7080
7081 /*----------------------------------------------------------------------------
7082 | Returns the result of subtracting the quadruple-precision floating-point
7083 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7084 | Standard for Binary Floating-Point Arithmetic.
7085 *----------------------------------------------------------------------------*/
7086
7087 float128 float128_sub(float128 a, float128 b, float_status *status)
7088 {
7089 bool aSign, bSign;
7090
7091 aSign = extractFloat128Sign( a );
7092 bSign = extractFloat128Sign( b );
7093 if ( aSign == bSign ) {
7094 return subFloat128Sigs(a, b, aSign, status);
7095 }
7096 else {
7097 return addFloat128Sigs(a, b, aSign, status);
7098 }
7099
7100 }
7101
7102 /*----------------------------------------------------------------------------
7103 | Returns the result of multiplying the quadruple-precision floating-point
7104 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7105 | Standard for Binary Floating-Point Arithmetic.
7106 *----------------------------------------------------------------------------*/
7107
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    /* Unpack both operands completely, including signs. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The product's sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf/NaN: NaN wins; Inf * 0 is invalid. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* 0 * Inf is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* a is zero or subnormal; normalize subnormals before multiplying. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Shift b left 16 so the 256-bit product lands with the binary point
     * where the rounding code expects it. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into the sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product overflowed past bit 49: renormalize with a 1-bit shift. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7164
7165 /*----------------------------------------------------------------------------
7166 | Returns the result of dividing the quadruple-precision floating-point value
7167 | `a' by the corresponding value `b'. The operation is performed according to
7168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7169 *----------------------------------------------------------------------------*/
7170
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack both operands. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf yields a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid ... */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* ... while x / 0 (x != 0) signals divide-by-zero. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Position both significands (with implicit bits) for the estimate. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Keep the dividend strictly below the divisor so the quotient
         * estimate stays in range; compensate in the exponent. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then correct downward while the
     * partial remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64 quotient bits; refine only when close enough to a rounding
     * boundary for the estimate's error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Any non-zero remainder becomes the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7251
7252 /*----------------------------------------------------------------------------
7253 | Returns the remainder of the quadruple-precision floating-point value `a'
7254 | with respect to the corresponding value `b'. The operation is performed
7255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7256 *----------------------------------------------------------------------------*/
7257
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack both operands (b's sign is irrelevant to the remainder). */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* rem(Inf, x) and any NaN operand are handled up front. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(finite, Inf) is just a. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(x, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| far smaller than |b|: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
        );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract b once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce the exponent gap in chunks, producing 61 quotient bits per
     * iteration from a deliberately low (by up to 4) estimate. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: align the last quotient bits to the target
         * exponent before the subtraction. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Overshoot by repeated subtraction until the remainder goes negative,
     * remembering the last non-negative value. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The mean of the two candidates decides which is nearer; ties go to
     * even q (round-to-nearest-even remainder semantics). */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    /* Negate a negative remainder; the sign flip is folded into zSign. */
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7358
7359 /*----------------------------------------------------------------------------
7360 | Returns the square root of the quadruple-precision floating-point value `a'.
7361 | The operation is performed according to the IEC/IEEE Standard for Binary
7362 | Floating-Point Arithmetic.
7363 *----------------------------------------------------------------------------*/
7364
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack the operand. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number is invalid, except -0 which returns -0. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is roughly half the input's (biased arithmetic). */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial 32-bit root estimate, refined to 64 bits via division. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the significand; the shift depends on exponent parity. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high 64 root bits: subtract zSig0^2 and step down while
     * the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits, refined only near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Non-zero remainder folds into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7426
/*
 * Compare two extended-precision values and return a FloatRelation.
 * When `is_quiet' is set, only signaling NaNs raise the invalid flag;
 * otherwise any NaN operand does.
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Invalid encodings are unordered and always raise invalid. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN detection: max exponent with a non-zero fraction (the <<1 drops
     * the explicit integer bit from the test). */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Different signs: equal only when both are zero (either sign). */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Positive a => greater (+1); negative a => less (-1). */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison. */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: bit-pattern order gives magnitude order, inverted
             * for negative values via the XOR with aSign. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7474
7475 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7476 {
7477 return floatx80_compare_internal(a, b, 0, status);
7478 }
7479
7480 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7481 float_status *status)
7482 {
7483 return floatx80_compare_internal(a, b, 1, status);
7484 }
7485
/*
 * Compare two quadruple-precision values and return a FloatRelation.
 * When `is_quiet' is set, only signaling NaNs raise the invalid flag;
 * otherwise any NaN operand does.
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN detection: max exponent with any non-zero fraction bits. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Different signs: equal only when both are zero (either sign);
         * the <<1 discards the sign bit from the test. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Positive a => greater (+1); negative a => less (-1). */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: bit-pattern order gives magnitude order, inverted
             * for negative values via the XOR with aSign. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7520
7521 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7522 {
7523 return float128_compare_internal(a, b, 0, status);
7524 }
7525
7526 FloatRelation float128_compare_quiet(float128 a, float128 b,
7527 float_status *status)
7528 {
7529 return float128_compare_internal(a, b, 1, status);
7530 }
7531
/*
 * Multiply `a' by 2^n with correct rounding (extended precision).
 * NaNs propagate; infinities and zeros pass through unchanged.
 */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* Inf/NaN: scaling a NaN propagates it, Inf is unchanged. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            /* Zero scales to itself. */
            return a;
        }
        /* Subnormal: bump to the effective exponent of denormals. */
        aExp++;
    }

    /* Clamp n so aExp + n cannot overflow int32_t; values this large
     * saturate to Inf/zero during rounding anyway. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}
7570
/*
 * Multiply `a' by 2^n with correct rounding (quadruple precision).
 * NaNs propagate; infinities and zeros pass through unchanged.
 */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Inf/NaN: scaling a NaN propagates it, Inf is unchanged. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        /* Normal number: restore the implicit integer bit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        /* Zero scales to itself. */
        return a;
    } else {
        /* Subnormal: bump to the effective exponent of denormals. */
        aExp++;
    }

    /* Clamp n so aExp + n cannot overflow int32_t; values this large
     * saturate to Inf/zero during rounding anyway. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* -1 compensates for the significand carrying its integer bit here. */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                          , status);

}
7606
7607 static void __attribute__((constructor)) softfloat_init(void)
7608 {
7609 union_float64 ua, ub, uc, ur;
7610
7611 if (QEMU_NO_HARDFLOAT) {
7612 return;
7613 }
7614 /*
7615 * Test that the host's FMA is not obviously broken. For example,
7616 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7617 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7618 */
7619 ua.s = 0x0020000000000001ULL;
7620 ub.s = 0x3ca0000000000000ULL;
7621 uc.s = 0x0020000000000000ULL;
7622 ur.h = fma(ua.h, ub.h, uc.h);
7623 if (ur.s != 0x0020000000000001ULL) {
7624 force_soft_fma = true;
7625 }
7626 }