/* fpu/softfloat.c — QEMU SoftFloat IEC/IEEE floating-point emulation */
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
/*
 * Flush a denormal input operand to a same-signed zero and raise
 * float_flag_input_denormal.  Does NOT consult s->flush_inputs_to_zero;
 * the GEN_INPUT_FLUSH1/2/3 wrappers below perform that check first.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK
142
/* Flush one input to zero, but only when s->flush_inputs_to_zero is set. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1
155
/* As GEN_INPUT_FLUSH1, but for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                               \
    static inline void name(soft_t *a, soft_t *b, float_status *s)   \
    {                                                                \
        if (likely(!s->flush_inputs_to_zero)) {                      \
            return;                                                  \
        }                                                            \
        soft_t ## _input_flush__nocheck(a, s);                       \
        soft_t ## _input_flush__nocheck(b, s);                       \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2
169
/* As GEN_INPUT_FLUSH1, but for three input operands (e.g. fused multiply-add). */
#define GEN_INPUT_FLUSH3(name, soft_t)                                       \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                        \
        if (likely(!s->flush_inputs_to_zero)) {                              \
            return;                                                          \
        }                                                                    \
        soft_t ## _input_flush__nocheck(a, s);                               \
        soft_t ## _input_flush__nocheck(b, s);                               \
        soft_t ## _input_flush__nocheck(c, s);                               \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
184
185 /*
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
189 */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205
206 /*
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211 */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF 0
216 #endif
217
218 /*
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
221 * already set.
222 */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226 IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/* View the same value as softfloat bits (.s) and as a host FP number (.h). */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates that decide whether the hardfloat path may be taken. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Soft (emulated) and hard (host FPU) implementations of a 2-input op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
/*
 * Generic hardfloat dispatcher for a 2-input float32 operation.
 * Computes hard(xa, xb) on the host FPU when the fast path is safe,
 * otherwise falls back to the softfloat implementation `soft'.
 * `pre' must approve the (flushed) inputs; `post' decides whether a
 * result with magnitude <= FLT_MIN needs softfloat recomputation
 * (e.g. to detect underflow precisely).
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs first so `pre' sees the flushed values. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Inputs passed `pre', so an infinite result means overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result at or below the minimum normal: defer to softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
372
/* Double-precision counterpart of float32_gen2; see comments there. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs first so `pre' sees the flushed values. */
    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* Inputs passed `pre', so an infinite result means overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result at or below the minimum normal: defer to softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
403
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410 return float32_val(a) & 0x007FFFFF;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416
417 static inline int extractFloat32Exp(float32 a)
418 {
419 return (float32_val(a) >> 23) & 0xFF;
420 }
421
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425
426 static inline bool extractFloat32Sign(float32 a)
427 {
428 return float32_val(a) >> 31;
429 }
430
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443
444 static inline int extractFloat64Exp(float64 a)
445 {
446 return (float64_val(a) >> 52) & 0x7FF;
447 }
448
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452
453 static inline bool extractFloat64Sign(float64 a)
454 {
455 return float64_val(a) >> 63;
456 }
457
458 /*
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
461 */
462
463 typedef enum __attribute__ ((__packed__)) {
464 float_class_unclassified,
465 float_class_zero,
466 float_class_normal,
467 float_class_inf,
468 float_class_qnan, /* all NaNs from here */
469 float_class_snan,
470 } FloatClass;
471
472 #define float_cmask(bit) (1u << (bit))
473
474 enum {
475 float_cmask_zero = float_cmask(float_class_zero),
476 float_cmask_normal = float_cmask(float_class_normal),
477 float_cmask_inf = float_cmask(float_class_inf),
478 float_cmask_qnan = float_cmask(float_class_qnan),
479 float_cmask_snan = float_cmask(float_class_snan),
480
481 float_cmask_infzero = float_cmask_zero | float_cmask_inf,
482 float_cmask_anynan = float_cmask_qnan | float_cmask_snan,
483 };
484
485
/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan and snan being the two largest FloatClass values. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
501
502 /*
503 * Structure holding all of the decomposed parts of a float. The
504 * exponent is unbiased and the fraction is normalized. All
505 * calculations are done with a 64 bit fraction and then rounded as
506 * appropriate for the final format.
507 *
508 * Thanks to the packed FloatClass a decent compiler should be able to
509 * fit the whole structure into registers and avoid using the stack
510 * for parameter passing.
511 */
512
513 typedef struct {
514 uint64_t frac;
515 int32_t exp;
516 FloatClass cls;
517 bool sign;
518 } FloatParts64;
519
520 #define DECOMPOSED_BINARY_POINT 63
521 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
522
/* Structure holding all of the relevant parameters for a format.
 * exp_size: the size of the exponent field
 * exp_bias: the offset applied to the exponent field
 * exp_max: the maximum normalised exponent
 * frac_size: the size of the fraction field
 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 * frac_lsb: least significant bit of fraction
 * frac_lsbm1: the bit below the least significant bit (for rounding)
 * round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 * arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
560
/* Per-format parameter tables: FLOAT_PARAMS(exponent bits, fraction bits). */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* IEEE half-precision layout, but with ARM Alternative Half Precision. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
581
582 /* Unpack a float to parts, but do not canonicalize. */
583 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
584 {
585 const int f_size = fmt->frac_size;
586 const int e_size = fmt->exp_size;
587
588 *r = (FloatParts64) {
589 .cls = float_class_unclassified,
590 .sign = extract64(raw, f_size + e_size, 1),
591 .exp = extract64(raw, f_size, e_size),
592 .frac = extract64(raw, 0, f_size)
593 };
594 }
595
/* Per-format wrappers around unpack_raw64. */
static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}
615
616 /* Pack a float from parts, but do not canonicalize. */
617 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
618 {
619 const int f_size = fmt->frac_size;
620 const int e_size = fmt->exp_size;
621 uint64_t ret;
622
623 ret = (uint64_t)p->sign << (f_size + e_size);
624 ret = deposit64(ret, f_size, e_size, p->exp);
625 ret = deposit64(ret, 0, f_size, p->frac);
626 return ret;
627 }
628
/* Per-format wrappers around pack_raw64. */
static inline float16 float16_pack_raw(FloatParts64 p)
{
    return make_float16(pack_raw64(&p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(FloatParts64 p)
{
    return pack_raw64(&p, &bfloat16_params);
}

static inline float32 float32_pack_raw(FloatParts64 p)
{
    return make_float32(pack_raw64(&p, &float32_params));
}

static inline float64 float64_pack_raw(FloatParts64 p)
{
    return make_float64(pack_raw64(&p, &float64_params));
}
648
649 /*----------------------------------------------------------------------------
650 | Functions and definitions to determine: (1) whether tininess for underflow
651 | is detected before or after rounding by default, (2) what (if anything)
652 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
653 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
654 | are propagated from function inputs to output. These details are target-
655 | specific.
656 *----------------------------------------------------------------------------*/
657 #include "softfloat-specialize.c.inc"
658
659 #define parts_default_nan parts64_default_nan
660
/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum biased exponent encodes Inf (frac == 0) or NaN. */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        /* Zero biased exponent: true zero or a denormal. */
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Normalize the denormal: shift frac up, adjust exp to match. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias the exponent, restore the implicit bit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}
693
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Select the rounding increment and whether overflow saturates
         * to the maximum normal (overflow_norm) or becomes infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                /* Rounding carried out of bit 63: renormalize. */
                if (uadd64_overflow(frac, inc, &frac)) {
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Result is subnormal in the target format. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                /* Tininess after rounding: tiny iff rounding does not carry. */
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
847
848 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
849 {
850 g_assert(is_nan(a.cls));
851 if (is_snan(a.cls)) {
852 float_raise(float_flag_invalid, s);
853 if (!s->default_nan_mode) {
854 return parts_silence_nan(a, s);
855 }
856 } else if (!s->default_nan_mode) {
857 return a;
858 }
859 parts_default_nan(&a, s);
860 return a;
861 }
862
/* Select and propagate the NaN result for a two-operand operation. */
static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        float_raise(float_flag_invalid, s);
    }

    if (s->default_nan_mode) {
        parts_default_nan(&a, s);
    } else {
        /*
         * Third argument is target-specific pickNaN's
         * "a has the larger significand" tie-breaker.
         */
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign), s)) {
            a = b;
        }
        if (is_snan(a.cls)) {
            /* The chosen NaN is signaling: quiet it before returning. */
            return parts_silence_nan(a, s);
        }
    }
    return a;
}
883
/*
 * Select and propagate the NaN result for a fused multiply-add
 * (a * b + c).  inf_zero is set when the product is Inf * 0, which is
 * invalid regardless of c.
 */
static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                    bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        float_raise(float_flag_invalid, s);
    }

    /* 0/1/2 select a/b/c; 3 selects the default NaN. */
    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        parts_default_nan(&a, s);
        break;
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        /* The chosen NaN is signaling: quiet it before returning. */
        return parts_silence_nan(a, s);
    }
    return a;
}
923
924 /*
925 * Pack/unpack routines with a specific FloatFmt.
926 */
927
928 static FloatParts64 float16a_unpack_canonical(float16 f, float_status *s,
929 const FloatFmt *params)
930 {
931 FloatParts64 p;
932
933 float16_unpack_raw(&p, f);
934 return sf_canonicalize(p, params, s);
935 }
936
937 static FloatParts64 float16_unpack_canonical(float16 f, float_status *s)
938 {
939 return float16a_unpack_canonical(f, s, &float16_params);
940 }
941
942 static FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s)
943 {
944 FloatParts64 p;
945
946 bfloat16_unpack_raw(&p, f);
947 return sf_canonicalize(p, &bfloat16_params, s);
948 }
949
950 static float16 float16a_round_pack_canonical(FloatParts64 p, float_status *s,
951 const FloatFmt *params)
952 {
953 return float16_pack_raw(round_canonical(p, s, params));
954 }
955
956 static float16 float16_round_pack_canonical(FloatParts64 p, float_status *s)
957 {
958 return float16a_round_pack_canonical(p, s, &float16_params);
959 }
960
961 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 p, float_status *s)
962 {
963 return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
964 }
965
966 static FloatParts64 float32_unpack_canonical(float32 f, float_status *s)
967 {
968 FloatParts64 p;
969
970 float32_unpack_raw(&p, f);
971 return sf_canonicalize(p, &float32_params, s);
972 }
973
974 static float32 float32_round_pack_canonical(FloatParts64 p, float_status *s)
975 {
976 return float32_pack_raw(round_canonical(p, s, &float32_params));
977 }
978
979 static FloatParts64 float64_unpack_canonical(float64 f, float_status *s)
980 {
981 FloatParts64 p;
982
983 float64_unpack_raw(&p, f);
984 return sf_canonicalize(p, &float64_params, s);
985 }
986
987 static float64 float64_round_pack_canonical(FloatParts64 p, float_status *s)
988 {
989 return float64_pack_raw(round_canonical(p, s, &float64_params));
990 }
991
992 /*
993 * Returns the result of adding or subtracting the values of the
994 * floating-point values `a' and `b'. The operation is performed
995 * according to the IEC/IEEE Standard for Binary Floating-Point
996 * Arithmetic.
997 */
998
static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    /* Folding 'subtract' into b's sign turns subtraction into addition
     * of a negated operand, so one sign comparison drives both cases. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Order the operands so the fraction subtraction cannot go
             * negative; flip the result sign if we had to swap. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* An exact zero sum is -0 only when rounding down. */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize the msb back up to the binary point. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf: invalid operation, default NaN. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* The effectively-negated b operand dominates the result. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the smaller exponent, jamming shifted-out bits. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out: shift down and restore the implicit bit. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
1081
1082 /*
1083 * Returns the result of adding or subtracting the floating-point
1084 * values `a' and `b'. The operation is performed according to the
1085 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1086 */
1087
1088 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1089 {
1090 FloatParts64 pa = float16_unpack_canonical(a, status);
1091 FloatParts64 pb = float16_unpack_canonical(b, status);
1092 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1093
1094 return float16_round_pack_canonical(pr, status);
1095 }
1096
1097 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1098 {
1099 FloatParts64 pa = float16_unpack_canonical(a, status);
1100 FloatParts64 pb = float16_unpack_canonical(b, status);
1101 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1102
1103 return float16_round_pack_canonical(pr, status);
1104 }
1105
1106 static float32 QEMU_SOFTFLOAT_ATTR
1107 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1108 {
1109 FloatParts64 pa = float32_unpack_canonical(a, status);
1110 FloatParts64 pb = float32_unpack_canonical(b, status);
1111 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1112
1113 return float32_round_pack_canonical(pr, status);
1114 }
1115
1116 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1117 {
1118 return soft_f32_addsub(a, b, false, status);
1119 }
1120
1121 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1122 {
1123 return soft_f32_addsub(a, b, true, status);
1124 }
1125
1126 static float64 QEMU_SOFTFLOAT_ATTR
1127 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1128 {
1129 FloatParts64 pa = float64_unpack_canonical(a, status);
1130 FloatParts64 pb = float64_unpack_canonical(b, status);
1131 FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1132
1133 return float64_round_pack_canonical(pr, status);
1134 }
1135
1136 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1137 {
1138 return soft_f64_addsub(a, b, false, status);
1139 }
1140
1141 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1142 {
1143 return soft_f64_addsub(a, b, true, status);
1144 }
1145
/* Host-FPU single-precision addition for the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1150
/* Host-FPU single-precision subtraction for the hardfloat fast path. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1155
/* Host-FPU double-precision addition for the hardfloat fast path. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1160
/* Host-FPU double-precision subtraction for the hardfloat fast path. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1165
1166 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1167 {
1168 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1169 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1170 }
1171 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1172 }
1173
1174 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1175 {
1176 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1177 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1178 } else {
1179 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1180 }
1181 }
1182
1183 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1184 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1185 {
1186 return float32_gen2(a, b, s, hard, soft,
1187 f32_is_zon2, f32_addsubmul_post);
1188 }
1189
1190 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1191 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1192 {
1193 return float64_gen2(a, b, s, hard, soft,
1194 f64_is_zon2, f64_addsubmul_post);
1195 }
1196
1197 float32 QEMU_FLATTEN
1198 float32_add(float32 a, float32 b, float_status *s)
1199 {
1200 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1201 }
1202
1203 float32 QEMU_FLATTEN
1204 float32_sub(float32 a, float32 b, float_status *s)
1205 {
1206 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1207 }
1208
1209 float64 QEMU_FLATTEN
1210 float64_add(float64 a, float64 b, float_status *s)
1211 {
1212 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1213 }
1214
1215 float64 QEMU_FLATTEN
1216 float64_sub(float64 a, float64 b, float_status *s)
1217 {
1218 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1219 }
1220
1221 /*
1222 * Returns the result of adding or subtracting the bfloat16
1223 * values `a' and `b'.
1224 */
1225 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1226 {
1227 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1228 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1229 FloatParts64 pr = addsub_floats(pa, pb, false, status);
1230
1231 return bfloat16_round_pack_canonical(pr, status);
1232 }
1233
1234 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1235 {
1236 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1237 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1238 FloatParts64 pr = addsub_floats(pa, pb, true, status);
1239
1240 return bfloat16_round_pack_canonical(pr, status);
1241 }
1242
1243 /*
1244 * Returns the result of multiplying the floating-point values `a' and
1245 * `b'. The operation is performed according to the IEC/IEEE Standard
1246 * for Binary Floating-Point Arithmetic.
1247 */
1248
static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 64x64 -> 128-bit product of the two fractions. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        /* At most one bit of renormalization is needed: either the
         * implicit bit already landed at the top of 'hi', or the
         * product must be shifted up by one. */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low word into the sticky (lsb) position. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}
1293
1294 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1295 {
1296 FloatParts64 pa = float16_unpack_canonical(a, status);
1297 FloatParts64 pb = float16_unpack_canonical(b, status);
1298 FloatParts64 pr = mul_floats(pa, pb, status);
1299
1300 return float16_round_pack_canonical(pr, status);
1301 }
1302
1303 static float32 QEMU_SOFTFLOAT_ATTR
1304 soft_f32_mul(float32 a, float32 b, float_status *status)
1305 {
1306 FloatParts64 pa = float32_unpack_canonical(a, status);
1307 FloatParts64 pb = float32_unpack_canonical(b, status);
1308 FloatParts64 pr = mul_floats(pa, pb, status);
1309
1310 return float32_round_pack_canonical(pr, status);
1311 }
1312
1313 static float64 QEMU_SOFTFLOAT_ATTR
1314 soft_f64_mul(float64 a, float64 b, float_status *status)
1315 {
1316 FloatParts64 pa = float64_unpack_canonical(a, status);
1317 FloatParts64 pb = float64_unpack_canonical(b, status);
1318 FloatParts64 pr = mul_floats(pa, pb, status);
1319
1320 return float64_round_pack_canonical(pr, status);
1321 }
1322
/* Host-FPU single-precision multiplication for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;

    return prod;
}
1327
/* Host-FPU double-precision multiplication for the hardfloat fast path. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;

    return prod;
}
1332
1333 float32 QEMU_FLATTEN
1334 float32_mul(float32 a, float32 b, float_status *s)
1335 {
1336 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1337 f32_is_zon2, f32_addsubmul_post);
1338 }
1339
1340 float64 QEMU_FLATTEN
1341 float64_mul(float64 a, float64 b, float_status *s)
1342 {
1343 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1344 f64_is_zon2, f64_addsubmul_post);
1345 }
1346
1347 /*
1348 * Returns the result of multiplying the bfloat16
1349 * values `a' and `b'.
1350 */
1351
1352 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1353 {
1354 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1355 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1356 FloatParts64 pr = mul_floats(pa, pb, status);
1357
1358 return bfloat16_round_pack_canonical(pr, status);
1359 }
1360
1361 /*
1362 * Returns the result of multiplying the floating-point values `a' and
1363 * `b' then adding 'c', with no intermediate rounding step after the
1364 * multiplication. The operation is performed according to the
1365 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1366 * The flags argument allows the caller to select negation of the
1367 * addend, the intermediate product, or the final result. (The
1368 * difference between this and having the caller do a separate
1369 * negation is that negating externally will flip the sign bit on
1370 * NaNs.)
1371 */
1372
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        /* Inf * 0 (with no NaN operand): invalid, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b without computing it. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf: invalid, default NaN. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* (+0) + (-0): sign depends on the rounding direction. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Keep the full 128-bit product: no rounding before the add. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend dominates: align the product to c's exponent. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product dominates: align c and add in 128+ bits. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger, and
                 * flip the sign when the addend wins. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, signed per rounding mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the remaining low bits into the sticky position. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1546
1547 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1548 int flags, float_status *status)
1549 {
1550 FloatParts64 pa = float16_unpack_canonical(a, status);
1551 FloatParts64 pb = float16_unpack_canonical(b, status);
1552 FloatParts64 pc = float16_unpack_canonical(c, status);
1553 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1554
1555 return float16_round_pack_canonical(pr, status);
1556 }
1557
1558 static float32 QEMU_SOFTFLOAT_ATTR
1559 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1560 float_status *status)
1561 {
1562 FloatParts64 pa = float32_unpack_canonical(a, status);
1563 FloatParts64 pb = float32_unpack_canonical(b, status);
1564 FloatParts64 pc = float32_unpack_canonical(c, status);
1565 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1566
1567 return float32_round_pack_canonical(pr, status);
1568 }
1569
1570 static float64 QEMU_SOFTFLOAT_ATTR
1571 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1572 float_status *status)
1573 {
1574 FloatParts64 pa = float64_unpack_canonical(a, status);
1575 FloatParts64 pb = float64_unpack_canonical(b, status);
1576 FloatParts64 pc = float64_unpack_canonical(c, status);
1577 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1578
1579 return float64_round_pack_canonical(pr, status);
1580 }
1581
1582 static bool force_soft_fma;
1583
/*
 * float32 fused multiply-add, using the host FPU's fmaf() when the
 * inputs and status allow it, otherwise falling back to softfloat.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        /* The halving step is not expressible with a host fma. */
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        /* Some input failed the zero-or-normal check; go soft. */
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product explicitly so the host add
         * applies the IEEE 0 + c sign rules. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Possible underflow: redo in softfloat with the original
             * (unnegated) inputs so the flags come out right. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1654
/*
 * float64 fused multiply-add, using the host FPU's fma() when the
 * inputs and status allow it, otherwise falling back to softfloat.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        /* The halving step is not expressible with a host fma. */
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        /* Some input failed the zero-or-normal check; go soft. */
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Build the signed-zero product explicitly so the host add
         * applies the IEEE 0 + c sign rules. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the threshold is FLT_MIN rather than
             * DBL_MIN, i.e. a conservative bound that sends extra
             * small-magnitude cases to softfloat — confirm intended. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}
1725
1726 /*
1727 * Returns the result of multiplying the bfloat16 values `a'
1728 * and `b' then adding 'c', with no intermediate rounding step after the
1729 * multiplication.
1730 */
1731
1732 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1733 int flags, float_status *status)
1734 {
1735 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1736 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1737 FloatParts64 pc = bfloat16_unpack_canonical(c, status);
1738 FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1739
1740 return bfloat16_round_pack_canonical(pr, status);
1741 }
1742
1743 /*
1744 * Returns the result of dividing the floating-point value `a' by the
1745 * corresponding value `b'. The operation is performed according to
1746 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1747 */
1748
static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        /* 128 / 64 -> 64-bit divide; r receives the remainder. */
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}
1814
1815 float16 float16_div(float16 a, float16 b, float_status *status)
1816 {
1817 FloatParts64 pa = float16_unpack_canonical(a, status);
1818 FloatParts64 pb = float16_unpack_canonical(b, status);
1819 FloatParts64 pr = div_floats(pa, pb, status);
1820
1821 return float16_round_pack_canonical(pr, status);
1822 }
1823
1824 static float32 QEMU_SOFTFLOAT_ATTR
1825 soft_f32_div(float32 a, float32 b, float_status *status)
1826 {
1827 FloatParts64 pa = float32_unpack_canonical(a, status);
1828 FloatParts64 pb = float32_unpack_canonical(b, status);
1829 FloatParts64 pr = div_floats(pa, pb, status);
1830
1831 return float32_round_pack_canonical(pr, status);
1832 }
1833
1834 static float64 QEMU_SOFTFLOAT_ATTR
1835 soft_f64_div(float64 a, float64 b, float_status *status)
1836 {
1837 FloatParts64 pa = float64_unpack_canonical(a, status);
1838 FloatParts64 pb = float64_unpack_canonical(b, status);
1839 FloatParts64 pr = div_floats(pa, pb, status);
1840
1841 return float64_round_pack_canonical(pr, status);
1842 }
1843
/* Host-FPU single-precision division for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    float quot = a / b;

    return quot;
}
1848
/* Host-FPU double-precision division for the hardfloat fast path. */
static double hard_f64_div(double a, double b)
{
    double quot = a / b;

    return quot;
}
1853
1854 static bool f32_div_pre(union_float32 a, union_float32 b)
1855 {
1856 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1857 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1858 fpclassify(b.h) == FP_NORMAL;
1859 }
1860 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1861 }
1862
1863 static bool f64_div_pre(union_float64 a, union_float64 b)
1864 {
1865 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1866 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1867 fpclassify(b.h) == FP_NORMAL;
1868 }
1869 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1870 }
1871
1872 static bool f32_div_post(union_float32 a, union_float32 b)
1873 {
1874 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1875 return fpclassify(a.h) != FP_ZERO;
1876 }
1877 return !float32_is_zero(a.s);
1878 }
1879
1880 static bool f64_div_post(union_float64 a, union_float64 b)
1881 {
1882 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1883 return fpclassify(a.h) != FP_ZERO;
1884 }
1885 return !float64_is_zero(a.s);
1886 }
1887
1888 float32 QEMU_FLATTEN
1889 float32_div(float32 a, float32 b, float_status *s)
1890 {
1891 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1892 f32_div_pre, f32_div_post);
1893 }
1894
1895 float64 QEMU_FLATTEN
1896 float64_div(float64 a, float64 b, float_status *s)
1897 {
1898 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1899 f64_div_pre, f64_div_post);
1900 }
1901
1902 /*
1903 * Returns the result of dividing the bfloat16
1904 * value `a' by the corresponding value `b'.
1905 */
1906
1907 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1908 {
1909 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1910 FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1911 FloatParts64 pr = div_floats(pa, pb, status);
1912
1913 return bfloat16_round_pack_canonical(pr, status);
1914 }
1915
1916 /*
1917 * Float to Float conversions
1918 *
1919 * Returns the result of converting one float format to another. The
1920 * conversion is performed according to the IEC/IEEE Standard for
1921 * Binary Floating-Point Arithmetic.
1922 *
1923 * The float_to_float helper only needs to take care of raising
1924 * invalid exceptions and handling the conversion on NaNs.
1925 */
1926
static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        /* Alternative half-precision destinations cannot encode NaN
         * or Inf, so those classes need substitution here. */
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        /* Standard destination: NaN handling delegated to return_nan. */
        return return_nan(a, s);
    }
    return a;
}
1961
1962 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1963 {
1964 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1965 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1966 FloatParts64 pr = float_to_float(p, &float32_params, s);
1967 return float32_round_pack_canonical(pr, s);
1968 }
1969
1970 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1971 {
1972 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1973 FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1974 FloatParts64 pr = float_to_float(p, &float64_params, s);
1975 return float64_round_pack_canonical(pr, s);
1976 }
1977
1978 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1979 {
1980 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1981 FloatParts64 p = float32_unpack_canonical(a, s);
1982 FloatParts64 pr = float_to_float(p, fmt16, s);
1983 return float16a_round_pack_canonical(pr, s, fmt16);
1984 }
1985
1986 static float64 QEMU_SOFTFLOAT_ATTR
1987 soft_float32_to_float64(float32 a, float_status *s)
1988 {
1989 FloatParts64 p = float32_unpack_canonical(a, s);
1990 FloatParts64 pr = float_to_float(p, &float64_params, s);
1991 return float64_round_pack_canonical(pr, s);
1992 }
1993
1994 float64 float32_to_float64(float32 a, float_status *s)
1995 {
1996 if (likely(float32_is_normal(a))) {
1997 /* Widening conversion can never produce inexact results. */
1998 union_float32 uf;
1999 union_float64 ud;
2000 uf.s = a;
2001 ud.h = uf.h;
2002 return ud.s;
2003 } else if (float32_is_zero(a)) {
2004 return float64_set_sign(float64_zero, float32_is_neg(a));
2005 } else {
2006 return soft_float32_to_float64(a, s);
2007 }
2008 }
2009
2010 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2011 {
2012 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2013 FloatParts64 p = float64_unpack_canonical(a, s);
2014 FloatParts64 pr = float_to_float(p, fmt16, s);
2015 return float16a_round_pack_canonical(pr, s, fmt16);
2016 }
2017
2018 float32 float64_to_float32(float64 a, float_status *s)
2019 {
2020 FloatParts64 p = float64_unpack_canonical(a, s);
2021 FloatParts64 pr = float_to_float(p, &float32_params, s);
2022 return float32_round_pack_canonical(pr, s);
2023 }
2024
2025 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2026 {
2027 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2028 FloatParts64 pr = float_to_float(p, &float32_params, s);
2029 return float32_round_pack_canonical(pr, s);
2030 }
2031
2032 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2033 {
2034 FloatParts64 p = bfloat16_unpack_canonical(a, s);
2035 FloatParts64 pr = float_to_float(p, &float64_params, s);
2036 return float64_round_pack_canonical(pr, s);
2037 }
2038
2039 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2040 {
2041 FloatParts64 p = float32_unpack_canonical(a, s);
2042 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2043 return bfloat16_round_pack_canonical(pr, s);
2044 }
2045
2046 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2047 {
2048 FloatParts64 p = float64_unpack_canonical(a, s);
2049 FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2050 return bfloat16_round_pack_canonical(pr, s);
2051 }
2052
2053 /*
2054 * Rounds the floating-point value `a' to an integer, and returns the
2055 * result as a floating-point value. The operation is performed
2056 * according to the IEC/IEEE Standard for Binary Floating-Point
2057 * Arithmetic.
2058 */
2059
/*
 * Round the decomposed value @a to an integral value, honouring @rmode.
 * @scale is added to the exponent first; the clamp to +/-0x10000 only
 * guards against int overflow, since such exponents are already out of
 * range for every supported format.  Raises float_flag_inexact on @s
 * whenever fraction bits are discarded.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* Values strictly above 0.5 round to 1; 0.5 ties to 0. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* 0.5 and above round away from zero, to 1. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Mixed integer/fraction: round then mask off fraction bits. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* An exact tie (only frac_lsbm1 set below the integer
                 * lsb, integer lsb clear) must not be incremented.  */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Round up only when the integer lsb is currently 0. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the top bit: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2160
/* Per-format round-to-int wrappers, using the current rounding mode. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa = float16_unpack_canonical(a, s);
    FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa = float32_unpack_canonical(a, s);
    FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa = float64_unpack_canonical(a, s);
    FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);
    FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(pr, s);
}
2193
2194 /*
2195 * Returns the result of converting the floating-point value `a' to
2196 * the two's complement integer format. The conversion is performed
2197 * according to the IEC/IEEE Standard for Binary Floating-Point
2198 * Arithmetic---which means in particular that the conversion is
2199 * rounded according to the current rounding mode. If `a' is a NaN,
2200 * the largest positive integer is returned. Otherwise, if the
2201 * conversion overflows, the largest integer with the same sign as `a'
2202 * is returned.
2203 */
2204
/*
 * Round @in to an integer per @rmode (after scaling by 2**@scale) and
 * pack it into a signed integer, saturating to [@min, @max].  NaN and
 * out-of-range inputs raise float_flag_invalid; in those cases the
 * exception flags are rebuilt from @orig_flags, so any inexact raised
 * by the rounding step is discarded.
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* The magnitude cannot fit in 64 bits: force saturation. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* Unsigned compare against -min is safe even for INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}
2248
/*
 * Per-width scalbn conversions to signed integers: scale by 2**@scale,
 * round per @rmode, saturate to the destination type's range.
 */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float32_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float64_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}
2318
/* Convenience wrappers: current rounding mode, no scaling. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: truncating (round-toward-zero) conversions. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2413
2414 /*
2415 * Returns the result of converting the floating-point value `a' to
2416 * the two's complement integer format.
2417 */
2418
/* bfloat16 to signed integer: same scalbn/saturate scheme as above. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2469
2470 /*
2471 * Returns the result of converting the floating-point value `a' to
2472 * the unsigned integer format. The conversion is performed according
2473 * to the IEC/IEEE Standard for Binary Floating-Point
2474 * Arithmetic---which means in particular that the conversion is
2475 * rounded according to the current rounding mode. If `a' is a NaN,
2476 * the largest unsigned integer is returned. Otherwise, if the
2477 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
2479 * values that do not round to zero will raise the inexact exception
2480 * flag.
2481 */
2482
/*
 * Round @in to an integer per @rmode (after scaling by 2**@scale) and
 * pack it into an unsigned integer, saturating to [0, @max].  NaN,
 * negative and out-of-range inputs raise float_flag_invalid; in those
 * cases the exception flags are rebuilt from @orig_flags, discarding
 * any inexact raised by the rounding step.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* Negative values that rounded to a nonzero integer are invalid. */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2527
/*
 * Per-width scalbn conversions to unsigned integers: scale by
 * 2**@scale, round per @rmode, saturate to [0, UINTn_MAX].
 */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}
2597
/* Convenience wrappers: current rounding mode, no scaling. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: truncating (round-toward-zero) conversions. */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2692
2693 /*
2694 * Returns the result of converting the bfloat16 value `a' to
2695 * the unsigned integer format.
2696 */
2697
/* bfloat16 to unsigned integer: same scalbn/saturate scheme as above. */
uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2748
2749 /*
2750 * Integer to float conversions
2751 *
2752 * Returns the result of converting the two's complement integer `a'
2753 * to the floating-point format. The conversion is performed according
2754 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2755 */
2756
2757 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2758 {
2759 FloatParts64 r = { .sign = false };
2760
2761 if (a == 0) {
2762 r.cls = float_class_zero;
2763 } else {
2764 uint64_t f = a;
2765 int shift;
2766
2767 r.cls = float_class_normal;
2768 if (a < 0) {
2769 f = -f;
2770 r.sign = true;
2771 }
2772 shift = clz64(f);
2773 scale = MIN(MAX(scale, -0x10000), 0x10000);
2774
2775 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2776 r.frac = f << shift;
2777 }
2778
2779 return r;
2780 }
2781
/* Signed integer to float16: decompose, then round-pack to the format. */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

/* Narrower inputs widen losslessly to int64_t first. */
float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed integer to float32. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed integer to float64. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
2915
2916 /*
2917 * Unsigned Integer to float conversions
2918 *
2919 * Returns the result of converting the unsigned integer `a' to the
2920 * floating-point format. The conversion is performed according to the
2921 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2922 */
2923
2924 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
2925 {
2926 FloatParts64 r = { .sign = false };
2927 int shift;
2928
2929 if (a == 0) {
2930 r.cls = float_class_zero;
2931 } else {
2932 scale = MIN(MAX(scale, -0x10000), 0x10000);
2933 shift = clz64(a);
2934 r.cls = float_class_normal;
2935 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2936 r.frac = a << shift;
2937 }
2938
2939 return r;
2940 }
2941
/* Unsigned integer to float16: decompose, then round-pack. */
float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

/* Narrower inputs widen losslessly to uint64_t first. */
float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

/* Unsigned integer to float32. */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

/* Unsigned integer to float64. */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3075
3076 /* Float Min/Max */
3077 /* min() and max() functions. These can't be implemented as
3078 * 'compare and pick one input' because that would mishandle
3079 * NaNs and +0 vs -0.
3080 *
3081 * minnum() and maxnum() functions. These are similar to the min()
3082 * and max() functions but if one of the arguments is a QNaN and
3083 * the other is numerical then the numerical argument is returned.
3084 * SNaNs will get quietened before being returned.
3085 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3086 * and maxNum() operations. min() and max() are the typical min/max
3087 * semantics provided by many CPUs which predate that specification.
3088 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3091 */
/*
 * Core of the min/max family: pick between @a and @b according to
 * @ismin (min vs max), @ieee (IEEE-754 2008 minNum/maxNum NaN
 * handling: a quiet NaN paired with a number yields the number) and
 * @ismag (compare magnitudes; sign-aware compare only as tiebreak).
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both quiet NaNs, or non-ieee semantics: normal NaN selection. */
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Map inf/zero onto exponent extremes so a single
         * (exp, frac) comparison orders all magnitudes. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare; equal magnitudes fall through to the
         * sign-aware compare below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the negative operand is the smaller,
             * which also handles +0 vs -0. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
3162
3163 #define MINMAX(sz, name, ismin, isiee, ismag) \
3164 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3165 float_status *s) \
3166 { \
3167 FloatParts64 pa = float ## sz ## _unpack_canonical(a, s); \
3168 FloatParts64 pb = float ## sz ## _unpack_canonical(b, s); \
3169 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3170 \
3171 return float ## sz ## _round_pack_canonical(pr, s); \
3172 }
3173
3174 MINMAX(16, min, true, false, false)
3175 MINMAX(16, minnum, true, true, false)
3176 MINMAX(16, minnummag, true, true, true)
3177 MINMAX(16, max, false, false, false)
3178 MINMAX(16, maxnum, false, true, false)
3179 MINMAX(16, maxnummag, false, true, true)
3180
3181 MINMAX(32, min, true, false, false)
3182 MINMAX(32, minnum, true, true, false)
3183 MINMAX(32, minnummag, true, true, true)
3184 MINMAX(32, max, false, false, false)
3185 MINMAX(32, maxnum, false, true, false)
3186 MINMAX(32, maxnummag, false, true, true)
3187
3188 MINMAX(64, min, true, false, false)
3189 MINMAX(64, minnum, true, true, false)
3190 MINMAX(64, minnummag, true, true, true)
3191 MINMAX(64, max, false, false, false)
3192 MINMAX(64, maxnum, false, true, false)
3193 MINMAX(64, maxnummag, false, true, true)
3194
3195 #undef MINMAX
3196
3197 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3198 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3199 { \
3200 FloatParts64 pa = bfloat16_unpack_canonical(a, s); \
3201 FloatParts64 pb = bfloat16_unpack_canonical(b, s); \
3202 FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3203 \
3204 return bfloat16_round_pack_canonical(pr, s); \
3205 }
3206
3207 BF16_MINMAX(min, true, false, false)
3208 BF16_MINMAX(minnum, true, true, false)
3209 BF16_MINMAX(minnummag, true, true, true)
3210 BF16_MINMAX(max, false, false, false)
3211 BF16_MINMAX(maxnum, false, true, false)
3212 BF16_MINMAX(maxnummag, false, true, true)
3213
3214 #undef BF16_MINMAX
3215
3216 /* Floating point compare */
/*
 * Compare two decomposed values.  Returns greater/less/equal/unordered.
 * Raises invalid for any NaN operand unless `is_quiet', in which case
 * only signaling NaNs raise it.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same-sign normals: compare exponents, then fractions; the order
     * inverts when both operands are negative.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
3273
/*
 * Expand a width-specific softfloat comparison helper: unpack both
 * operands and defer to compare_floats().  `attr' selects the inlining
 * attribute applied to the generated function.
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3288
3289 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3290 {
3291 return soft_f16_compare(a, b, false, s);
3292 }
3293
3294 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3295 {
3296 return soft_f16_compare(a, b, true, s);
3297 }
3298
/*
 * float32 comparison with a hardfloat fast path: use the host FPU's
 * ordered comparison macros where possible, and fall back to the
 * softfloat implementation for the unordered case so that the NaN
 * exception flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3327
3328 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3329 {
3330 return f32_compare(a, b, false, s);
3331 }
3332
3333 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3334 {
3335 return f32_compare(a, b, true, s);
3336 }
3337
/*
 * float64 comparison with a hardfloat fast path: use the host FPU's
 * ordered comparison macros where possible, and fall back to the
 * softfloat implementation for the unordered case so that the NaN
 * exception flags are raised correctly.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3366
3367 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3368 {
3369 return f64_compare(a, b, false, s);
3370 }
3371
3372 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3373 {
3374 return f64_compare(a, b, true, s);
3375 }
3376
3377 static FloatRelation QEMU_FLATTEN
3378 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3379 {
3380 FloatParts64 pa = bfloat16_unpack_canonical(a, s);
3381 FloatParts64 pb = bfloat16_unpack_canonical(b, s);
3382 return compare_floats(pa, pb, is_quiet, s);
3383 }
3384
3385 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3386 {
3387 return soft_bf16_compare(a, b, false, s);
3388 }
3389
3390 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3391 {
3392 return soft_bf16_compare(a, b, true, s);
3393 }
3394
3395 /* Multiply A by 2 raised to the power N. */
3396 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3397 {
3398 if (unlikely(is_nan(a.cls))) {
3399 return return_nan(a, s);
3400 }
3401 if (a.cls == float_class_normal) {
3402 /* The largest float type (even though not supported by FloatParts64)
3403 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3404 * still allows rounding to infinity, without allowing overflow
3405 * within the int32_t that backs FloatParts64.exp.
3406 */
3407 n = MIN(MAX(n, -0x10000), 0x10000);
3408 a.exp += n;
3409 }
3410 return a;
3411 }
3412
3413 float16 float16_scalbn(float16 a, int n, float_status *status)
3414 {
3415 FloatParts64 pa = float16_unpack_canonical(a, status);
3416 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3417 return float16_round_pack_canonical(pr, status);
3418 }
3419
3420 float32 float32_scalbn(float32 a, int n, float_status *status)
3421 {
3422 FloatParts64 pa = float32_unpack_canonical(a, status);
3423 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3424 return float32_round_pack_canonical(pr, status);
3425 }
3426
3427 float64 float64_scalbn(float64 a, int n, float_status *status)
3428 {
3429 FloatParts64 pa = float64_unpack_canonical(a, status);
3430 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3431 return float64_round_pack_canonical(pr, status);
3432 }
3433
3434 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3435 {
3436 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3437 FloatParts64 pr = scalbn_decomposed(pa, n, status);
3438 return bfloat16_round_pack_canonical(pr, status);
3439 }
3440
3441 /*
3442 * Square Root
3443 *
3444 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
3446 * square root by iterating down from the implicit bit to enough extra
3447 * bits to ensure we get a correctly rounded result.
3448 *
3449 * This does mean however the calculation is slower than before,
3450 * especially for 64 bit floats.
3451 */
3452
/*
 * Square root on the decomposed form.  Special cases (NaN, zero,
 * negative, infinity) are resolved up front; normals use a classic
 * bit-by-bit restoring algorithm down to the format's rounding bits.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative number is invalid; produce the default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            /* This bit belongs in the result; update the partial
             * root and subtract its contribution from the remainder.
             */
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3511
3512 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3513 {
3514 FloatParts64 pa = float16_unpack_canonical(a, status);
3515 FloatParts64 pr = sqrt_float(pa, status, &float16_params);
3516 return float16_round_pack_canonical(pr, status);
3517 }
3518
3519 static float32 QEMU_SOFTFLOAT_ATTR
3520 soft_f32_sqrt(float32 a, float_status *status)
3521 {
3522 FloatParts64 pa = float32_unpack_canonical(a, status);
3523 FloatParts64 pr = sqrt_float(pa, status, &float32_params);
3524 return float32_round_pack_canonical(pr, status);
3525 }
3526
3527 static float64 QEMU_SOFTFLOAT_ATTR
3528 soft_f64_sqrt(float64 a, float_status *status)
3529 {
3530 FloatParts64 pa = float64_unpack_canonical(a, status);
3531 FloatParts64 pr = sqrt_float(pa, status, &float64_params);
3532 return float64_round_pack_canonical(pr, status);
3533 }
3534
/*
 * float32 sqrt with a hardfloat fast path: use the host sqrtf() when
 * the input is a non-negative zero or normal number, otherwise fall
 * back to the softfloat implementation so flags are set correctly.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Classify via the host FPU. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
3561
/*
 * float64 sqrt with a hardfloat fast path: use the host sqrt() when
 * the input is a non-negative zero or normal number, otherwise fall
 * back to the softfloat implementation so flags are set correctly.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Classify via the host FPU. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3588
3589 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3590 {
3591 FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3592 FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
3593 return bfloat16_round_pack_canonical(pr, status);
3594 }
3595
3596 /*----------------------------------------------------------------------------
3597 | The pattern for a default generated NaN.
3598 *----------------------------------------------------------------------------*/
3599
3600 float16 float16_default_nan(float_status *status)
3601 {
3602 FloatParts64 p;
3603
3604 parts_default_nan(&p, status);
3605 p.frac >>= float16_params.frac_shift;
3606 return float16_pack_raw(p);
3607 }
3608
3609 float32 float32_default_nan(float_status *status)
3610 {
3611 FloatParts64 p;
3612
3613 parts_default_nan(&p, status);
3614 p.frac >>= float32_params.frac_shift;
3615 return float32_pack_raw(p);
3616 }
3617
3618 float64 float64_default_nan(float_status *status)
3619 {
3620 FloatParts64 p;
3621
3622 parts_default_nan(&p, status);
3623 p.frac >>= float64_params.frac_shift;
3624 return float64_pack_raw(p);
3625 }
3626
float128 float128_default_nan(float_status *status)
{
    FloatParts64 p;
    float128 r;

    parts_default_nan(&p, status);
    /* Extrapolate from the choices made by parts_default_nan to fill
     * in the quad-floating format. If the low bit is set, assume we
     * want to set all non-snan bits.
     */
    r.low = -(p.frac & 1);              /* all-ones or all-zeros */
    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
    r.high |= UINT64_C(0x7FFF000000000000);   /* all-ones exponent */
    r.high |= (uint64_t)p.sign << 63;

    return r;
}
3644
3645 bfloat16 bfloat16_default_nan(float_status *status)
3646 {
3647 FloatParts64 p;
3648
3649 parts_default_nan(&p, status);
3650 p.frac >>= bfloat16_params.frac_shift;
3651 return bfloat16_pack_raw(p);
3652 }
3653
3654 /*----------------------------------------------------------------------------
3655 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3656 *----------------------------------------------------------------------------*/
3657
3658 float16 float16_silence_nan(float16 a, float_status *status)
3659 {
3660 FloatParts64 p;
3661
3662 float16_unpack_raw(&p, a);
3663 p.frac <<= float16_params.frac_shift;
3664 p = parts_silence_nan(p, status);
3665 p.frac >>= float16_params.frac_shift;
3666 return float16_pack_raw(p);
3667 }
3668
3669 float32 float32_silence_nan(float32 a, float_status *status)
3670 {
3671 FloatParts64 p;
3672
3673 float32_unpack_raw(&p, a);
3674 p.frac <<= float32_params.frac_shift;
3675 p = parts_silence_nan(p, status);
3676 p.frac >>= float32_params.frac_shift;
3677 return float32_pack_raw(p);
3678 }
3679
3680 float64 float64_silence_nan(float64 a, float_status *status)
3681 {
3682 FloatParts64 p;
3683
3684 float64_unpack_raw(&p, a);
3685 p.frac <<= float64_params.frac_shift;
3686 p = parts_silence_nan(p, status);
3687 p.frac >>= float64_params.frac_shift;
3688 return float64_pack_raw(p);
3689 }
3690
3691 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3692 {
3693 FloatParts64 p;
3694
3695 bfloat16_unpack_raw(&p, a);
3696 p.frac <<= bfloat16_params.frac_shift;
3697 p = parts_silence_nan(p, status);
3698 p.frac >>= bfloat16_params.frac_shift;
3699 return bfloat16_pack_raw(p);
3700 }
3701
3702 /*----------------------------------------------------------------------------
3703 | If `a' is denormal and we are in flush-to-zero mode then set the
3704 | input-denormal exception and return zero. Otherwise just return the value.
3705 *----------------------------------------------------------------------------*/
3706
3707 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3708 {
3709 if (p.exp == 0 && p.frac != 0) {
3710 float_raise(float_flag_input_denormal, status);
3711 return true;
3712 }
3713
3714 return false;
3715 }
3716
3717 float16 float16_squash_input_denormal(float16 a, float_status *status)
3718 {
3719 if (status->flush_inputs_to_zero) {
3720 FloatParts64 p;
3721
3722 float16_unpack_raw(&p, a);
3723 if (parts_squash_denormal(p, status)) {
3724 return float16_set_sign(float16_zero, p.sign);
3725 }
3726 }
3727 return a;
3728 }
3729
3730 float32 float32_squash_input_denormal(float32 a, float_status *status)
3731 {
3732 if (status->flush_inputs_to_zero) {
3733 FloatParts64 p;
3734
3735 float32_unpack_raw(&p, a);
3736 if (parts_squash_denormal(p, status)) {
3737 return float32_set_sign(float32_zero, p.sign);
3738 }
3739 }
3740 return a;
3741 }
3742
3743 float64 float64_squash_input_denormal(float64 a, float_status *status)
3744 {
3745 if (status->flush_inputs_to_zero) {
3746 FloatParts64 p;
3747
3748 float64_unpack_raw(&p, a);
3749 if (parts_squash_denormal(p, status)) {
3750 return float64_set_sign(float64_zero, p.sign);
3751 }
3752 }
3753 return a;
3754 }
3755
3756 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3757 {
3758 if (status->flush_inputs_to_zero) {
3759 FloatParts64 p;
3760
3761 bfloat16_unpack_raw(&p, a);
3762 if (parts_squash_denormal(p, status)) {
3763 return bfloat16_set_sign(bfloat16_zero, p.sign);
3764 }
3765 }
3766 return a;
3767 }
3768
3769 /*----------------------------------------------------------------------------
3770 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3771 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3772 | input. If `zSign' is 1, the input is negated before being converted to an
3773 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3774 | is simply rounded to an integer, with the inexact exception raised if the
3775 | input cannot be represented exactly as an integer. However, if the fixed-
3776 | point input is too large, the invalid exception is raised and the largest
3777 | positive or negative integer is returned.
3778 *----------------------------------------------------------------------------*/
3779
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    /* Choose the increment added below the 7 rounding bits. */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the resulting integer would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* A tie (round bits exactly one half) rounds to even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range (high bits set, or sign of z disagrees). */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3827
3828 /*----------------------------------------------------------------------------
3829 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3830 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3831 | and returns the properly rounded 64-bit integer corresponding to the input.
3832 | If `zSign' is 1, the input is negated before being converted to an integer.
3833 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3834 | the inexact exception raised if the input cannot be represented exactly as
3835 | an integer. However, if the fixed-point input is too large, the invalid
3836 | exception is raised and the largest positive or negative integer is
3837 | returned.
3838 *----------------------------------------------------------------------------*/
3839
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    /* absZ1 holds the fraction bits; decide whether to round up. */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Fraction >= one half (top bit of absZ1 set). */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* A tie (fraction exactly one half) rounds back to even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign of the result must agree with the requested sign. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
3889
3890 /*----------------------------------------------------------------------------
3891 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3892 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3893 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3894 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3895 | with the inexact exception raised if the input cannot be represented exactly
3896 | as an integer. However, if the fixed-point input is too large, the invalid
3897 | exception is raised and the largest unsigned integer is returned.
3898 *----------------------------------------------------------------------------*/
3899
/* NOTE(review): the result is an unsigned 64-bit value returned through
 * an int64_t — callers reinterpret the bits; confirm before changing
 * the signature.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    /* absZ1 holds the fraction bits; decide whether to round up. */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Fraction >= one half (top bit of absZ1 set). */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrapped past UINT64_MAX: the value is too large. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* A tie (fraction exactly one half) rounds back to even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
3949
3950 /*----------------------------------------------------------------------------
3951 | Normalizes the subnormal single-precision floating-point value represented
3952 | by the denormalized significand `aSig'. The normalized exponent and
3953 | significand are stored at the locations pointed to by `zExpPtr' and
3954 | `zSigPtr', respectively.
3955 *----------------------------------------------------------------------------*/
3956
3957 static void
3958 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3959 {
3960 int8_t shiftCount;
3961
3962 shiftCount = clz32(aSig) - 8;
3963 *zSigPtr = aSig<<shiftCount;
3964 *zExpPtr = 1 - shiftCount;
3965
3966 }
3967
3968 /*----------------------------------------------------------------------------
3969 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3970 | and significand `zSig', and returns the proper single-precision floating-
3971 | point value corresponding to the abstract input. Ordinarily, the abstract
3972 | value is simply rounded and packed into the single-precision format, with
3973 | the inexact exception raised if the abstract input cannot be represented
3974 | exactly. However, if the abstract value is too large, the overflow and
3975 | inexact exceptions are raised and an infinity or maximal finite value is
3976 | returned. If the abstract value is too small, the input value is rounded to
3977 | a subnormal number, and the underflow and inexact exceptions are raised if
3978 | the abstract input cannot be represented exactly as a subnormal single-
3979 | precision floating-point number.
3980 | The input significand `zSig' has its binary point between bits 30
3981 | and 29, which is 7 bits to the left of the usual location. This shifted
3982 | significand must be normalized or smaller. If `zSig' is not normalized,
3983 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3984 | and it must not require rounding. In the usual case that `zSig' is
3985 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3986 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3987 | Binary Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
3989
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    /* Choose the increment added below the 7 rounding bits. */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the resulting significand would be even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Unsigned compare catches both overflow (zExp too large) and
     * underflow (zExp negative, which wraps to a huge uint16_t).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: produce infinity, or the maximal finite value
             * when the rounding mode forbids crossing to infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess may be detected before or after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* A tie (round bits exactly one half) rounds to even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4066
4067 /*----------------------------------------------------------------------------
4068 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4069 | and significand `zSig', and returns the proper single-precision floating-
4070 | point value corresponding to the abstract input. This routine is just like
4071 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4072 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4073 | floating-point exponent.
4074 *----------------------------------------------------------------------------*/
4075
4076 static float32
4077 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4078 float_status *status)
4079 {
4080 int8_t shiftCount;
4081
4082 shiftCount = clz32(zSig) - 1;
4083 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4084 status);
4085
4086 }
4087
4088 /*----------------------------------------------------------------------------
4089 | Normalizes the subnormal double-precision floating-point value represented
4090 | by the denormalized significand `aSig'. The normalized exponent and
4091 | significand are stored at the locations pointed to by `zExpPtr' and
4092 | `zSigPtr', respectively.
4093 *----------------------------------------------------------------------------*/
4094
4095 static void
4096 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4097 {
4098 int8_t shiftCount;
4099
4100 shiftCount = clz64(aSig) - 11;
4101 *zSigPtr = aSig<<shiftCount;
4102 *zExpPtr = 1 - shiftCount;
4103
4104 }
4105
4106 /*----------------------------------------------------------------------------
4107 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4108 | double-precision floating-point value, returning the result. After being
4109 | shifted into the proper positions, the three fields are simply added
4110 | together to form the result. This means that any integer portion of `zSig'
4111 | will be added into the exponent. Since a properly normalized significand
4112 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4113 | than the desired result exponent whenever `zSig' is a complete, normalized
4114 | significand.
4115 *----------------------------------------------------------------------------*/
4116
4117 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4118 {
4119
4120 return make_float64(
4121 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4122
4123 }
4124
4125 /*----------------------------------------------------------------------------
4126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4127 | and significand `zSig', and returns the proper double-precision floating-
4128 | point value corresponding to the abstract input. Ordinarily, the abstract
4129 | value is simply rounded and packed into the double-precision format, with
4130 | the inexact exception raised if the abstract input cannot be represented
4131 | exactly. However, if the abstract value is too large, the overflow and
4132 | inexact exceptions are raised and an infinity or maximal finite value is
4133 | returned. If the abstract value is too small, the input value is rounded to
4134 | a subnormal number, and the underflow and inexact exceptions are raised if
4135 | the abstract input cannot be represented exactly as a subnormal double-
4136 | precision floating-point number.
4137 | The input significand `zSig' has its binary point between bits 62
4138 | and 61, which is 10 bits to the left of the usual location. This shifted
4139 | significand must be normalized or smaller. If `zSig' is not normalized,
4140 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4141 | and it must not require rounding. In the usual case that `zSig' is
4142 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4143 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4144 | Binary Floating-Point Arithmetic.
4145 *----------------------------------------------------------------------------*/
4146
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Choose the increment added below the 10 guard bits (bits 9..0 of
     * zSig).  0x200 rounds half-way cases up; 0x3ff rounds toward the
     * appropriate infinity; 0 truncates.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the result bit (bit 10) is currently even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Unsigned compare catches both zExp >= 0x7FD and negative zExp. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow.  Round-to-odd and truncating modes yield the
             * largest finite number instead of infinity: the third
             * argument -(!overflow_to_inf) is all-ones in that case,
             * which packFloat64's addition turns into 0x7FE/max-frac.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected per the configured before/after-rounding
             * convention: tiny unless rounding would carry into bit 63. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize; shifted-out bits are jammed into the sticky bit. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Apply the increment and drop the 10 guard bits. */
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact half-way case under nearest-even: clear the LSB (ties-to-even). */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4222
4223 /*----------------------------------------------------------------------------
4224 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4225 | and significand `zSig', and returns the proper double-precision floating-
4226 | point value corresponding to the abstract input. This routine is just like
4227 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4228 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4229 | floating-point exponent.
4230 *----------------------------------------------------------------------------*/
4231
4232 static float64
4233 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4234 float_status *status)
4235 {
4236 int8_t shiftCount;
4237
4238 shiftCount = clz64(zSig) - 1;
4239 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4240 status);
4241
4242 }
4243
4244 /*----------------------------------------------------------------------------
4245 | Normalizes the subnormal extended double-precision floating-point value
4246 | represented by the denormalized significand `aSig'. The normalized exponent
4247 | and significand are stored at the locations pointed to by `zExpPtr' and
4248 | `zSigPtr', respectively.
4249 *----------------------------------------------------------------------------*/
4250
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /*
     * floatx80 keeps the integer bit explicit, so normalizing a denormal
     * is simply shifting the leading one all the way up to bit 63.
     */
    int8_t shift = clz64(aSig);

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4260
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4264 | and returns the proper extended double-precision floating-point value
4265 | corresponding to the abstract input. Ordinarily, the abstract value is
4266 | rounded and packed into the extended double-precision format, with the
4267 | inexact exception raised if the abstract input cannot be represented
4268 | exactly. However, if the abstract value is too large, the overflow and
4269 | inexact exceptions are raised and an infinity or maximal finite value is
4270 | returned. If the abstract value is too small, the input value is rounded to
4271 | a subnormal number, and the underflow and inexact exceptions are raised if
4272 | the abstract input cannot be represented exactly as a subnormal extended
4273 | double-precision floating-point number.
4274 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4275 | number of bits as single or double precision, respectively. Otherwise, the
4276 | result is rounded to the full precision of the extended double-precision
4277 | format.
4278 | The input significand must be normalized or smaller. If the input
4279 | significand is not normalized, `zExp' must be 0; in that case, the result
4280 | returned is a subnormal number, and it must not require rounding. The
4281 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4282 | Floating-Point Arithmetic.
4283 *----------------------------------------------------------------------------*/
4284
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Reduced-precision path: when rounding to 32 or 64 significant
     * bits, choose a half-ulp increment and a guard-bit mask at that
     * precision; anything else rounds at the full 64-bit precision.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low significand word into zSig0's sticky (lowest) bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare catches exponent overflow and zExp <= 0 at once. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny unless rounding carries into the integer bit. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            /* Denormalize with sticky-bit jamming. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 re-normalized the result. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact half-way under nearest-even: also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: value doubled, renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    /* Exact half-way under nearest-even: also clear the result LSB. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full 80-bit precision: zSig1 holds the guard/round/sticky bits and
     * `increment' says whether zSig0's LSB should be bumped.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes deliver the maximal finite value;
             * other modes deliver infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Tiny unless rounding would carry all the way up. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* zSig1 changed; recompute the increment decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact half-way under nearest-even: force even LSB. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* A carry into bit 63 re-normalized the result. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry wrapped the significand; renormalize upward. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Exact half-way under nearest-even: force even LSB. */
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4472
4473 /*----------------------------------------------------------------------------
4474 | Takes an abstract floating-point value having sign `zSign', exponent
4475 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4476 | and returns the proper extended double-precision floating-point value
4477 | corresponding to the abstract input. This routine is just like
4478 | `roundAndPackFloatx80' except that the input significand does not have to be
4479 | normalized.
4480 *----------------------------------------------------------------------------*/
4481
4482 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4483 bool zSign, int32_t zExp,
4484 uint64_t zSig0, uint64_t zSig1,
4485 float_status *status)
4486 {
4487 int8_t shiftCount;
4488
4489 if ( zSig0 == 0 ) {
4490 zSig0 = zSig1;
4491 zSig1 = 0;
4492 zExp -= 64;
4493 }
4494 shiftCount = clz64(zSig0);
4495 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4496 zExp -= shiftCount;
4497 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4498 zSig0, zSig1, status);
4499
4500 }
4501
4502 /*----------------------------------------------------------------------------
4503 | Returns the least-significant 64 fraction bits of the quadruple-precision
4504 | floating-point value `a'.
4505 *----------------------------------------------------------------------------*/
4506
4507 static inline uint64_t extractFloat128Frac1( float128 a )
4508 {
4509
4510 return a.low;
4511
4512 }
4513
4514 /*----------------------------------------------------------------------------
4515 | Returns the most-significant 48 fraction bits of the quadruple-precision
4516 | floating-point value `a'.
4517 *----------------------------------------------------------------------------*/
4518
4519 static inline uint64_t extractFloat128Frac0( float128 a )
4520 {
4521
4522 return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4523
4524 }
4525
4526 /*----------------------------------------------------------------------------
4527 | Returns the exponent bits of the quadruple-precision floating-point value
4528 | `a'.
4529 *----------------------------------------------------------------------------*/
4530
4531 static inline int32_t extractFloat128Exp( float128 a )
4532 {
4533
4534 return ( a.high>>48 ) & 0x7FFF;
4535
4536 }
4537
4538 /*----------------------------------------------------------------------------
4539 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4540 *----------------------------------------------------------------------------*/
4541
4542 static inline bool extractFloat128Sign(float128 a)
4543 {
4544 return a.high >> 63;
4545 }
4546
4547 /*----------------------------------------------------------------------------
4548 | Normalizes the subnormal quadruple-precision floating-point value
4549 | represented by the denormalized significand formed by the concatenation of
4550 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4551 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4552 | significand are stored at the location pointed to by `zSig0Ptr', and the
4553 | least significant 64 bits of the normalized significand are stored at the
4554 | location pointed to by `zSig1Ptr'.
4555 *----------------------------------------------------------------------------*/
4556
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is already in the high word: shift it up to the
         * float128 integer-bit position (bit 48 of the high word). */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: normalize out of the low word alone. */
    shift = clz64(aSig1) - 15;
    if (shift < 0) {
        /* Leading one sits above bit 48: split aSig1 across both words.
         * (shift & 63) keeps the left-shift count in range. */
        *zSig0Ptr = aSig1 >> -shift;
        *zSig1Ptr = aSig1 << (shift & 63);
    } else {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -shift - 63;
}
4587
4588 /*----------------------------------------------------------------------------
4589 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4590 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4591 | floating-point value, returning the result. After being shifted into the
4592 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4593 | added together to form the most significant 32 bits of the result. This
4594 | means that any integer portion of `zSig0' will be added into the exponent.
4595 | Since a properly normalized significand will have an integer portion equal
4596 | to 1, the `zExp' input should be 1 less than the desired result exponent
4597 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4598 | significand.
4599 *----------------------------------------------------------------------------*/
4600
4601 static inline float128
4602 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4603 {
4604 float128 z;
4605
4606 z.low = zSig1;
4607 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4608 return z;
4609 }
4610
4611 /*----------------------------------------------------------------------------
4612 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4613 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4614 | and `zSig2', and returns the proper quadruple-precision floating-point value
4615 | corresponding to the abstract input. Ordinarily, the abstract value is
4616 | simply rounded and packed into the quadruple-precision format, with the
4617 | inexact exception raised if the abstract input cannot be represented
4618 | exactly. However, if the abstract value is too large, the overflow and
4619 | inexact exceptions are raised and an infinity or maximal finite value is
4620 | returned. If the abstract value is too small, the input value is rounded to
4621 | a subnormal number, and the underflow and inexact exceptions are raised if
4622 | the abstract input cannot be represented exactly as a subnormal quadruple-
4623 | precision floating-point number.
4624 | The input significand must be normalized or smaller. If the input
4625 | significand is not normalized, `zExp' must be 0; in that case, the result
4626 | returned is a subnormal number, and it must not require rounding. In the
4627 | usual case that the input significand is normalized, `zExp' must be 1 less
4628 | than the ``true'' floating-point exponent. The handling of underflow and
4629 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 carries the guard/round/sticky bits; decide whether the
     * 128-bit significand's LSB should be incremented.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the current LSB is even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches both exponent overflow and zExp < 0. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes (and round-to-odd) deliver the maximal
             * finite value; other modes deliver infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny unless rounding would carry into the integer bit. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize; shifted-out bits are jammed into zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* zSig1/zSig2 changed; recompute the increment decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact half-way case under nearest-even: force an even LSB. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4743
4744 /*----------------------------------------------------------------------------
4745 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4746 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4747 | returns the proper quadruple-precision floating-point value corresponding
4748 | to the abstract input. This routine is just like `roundAndPackFloat128'
4749 | except that the input significand has fewer bits and does not have to be
4750 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4751 | point exponent.
4752 *----------------------------------------------------------------------------*/
4753
4754 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4755 uint64_t zSig0, uint64_t zSig1,
4756 float_status *status)
4757 {
4758 int8_t shiftCount;
4759 uint64_t zSig2;
4760
4761 if ( zSig0 == 0 ) {
4762 zSig0 = zSig1;
4763 zSig1 = 0;
4764 zExp -= 64;
4765 }
4766 shiftCount = clz64(zSig0) - 15;
4767 if ( 0 <= shiftCount ) {
4768 zSig2 = 0;
4769 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4770 }
4771 else {
4772 shift128ExtraRightJamming(
4773 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4774 }
4775 zExp -= shiftCount;
4776 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4777
4778 }
4779
4780
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of converting the 32-bit two's complement integer `a'
4783 | to the extended double-precision floating-point format. The conversion
4784 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4785 | Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787
4788 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4789 {
4790 bool zSign;
4791 uint32_t absA;
4792 int8_t shiftCount;
4793 uint64_t zSig;
4794
4795 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4796 zSign = ( a < 0 );
4797 absA = zSign ? - a : a;
4798 shiftCount = clz32(absA) + 32;
4799 zSig = absA;
4800 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4801
4802 }
4803
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the 32-bit two's complement integer `a' to
4806 | the quadruple-precision floating-point format. The conversion is performed
4807 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4809
4810 float128 int32_to_float128(int32_t a, float_status *status)
4811 {
4812 bool zSign;
4813 uint32_t absA;
4814 int8_t shiftCount;
4815 uint64_t zSig0;
4816
4817 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4818 zSign = ( a < 0 );
4819 absA = zSign ? - a : a;
4820 shiftCount = clz32(absA) + 17;
4821 zSig0 = absA;
4822 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4823
4824 }
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit two's complement integer `a'
4828 | to the extended double-precision floating-point format. The conversion
4829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4830 | Arithmetic.
4831 *----------------------------------------------------------------------------*/
4832
4833 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4834 {
4835 bool zSign;
4836 uint64_t absA;
4837 int8_t shiftCount;
4838
4839 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4840 zSign = ( a < 0 );
4841 absA = zSign ? - a : a;
4842 shiftCount = clz64(absA);
4843 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4844
4845 }
4846
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of converting the 64-bit two's complement integer `a' to
4849 | the quadruple-precision floating-point format. The conversion is performed
4850 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4851 *----------------------------------------------------------------------------*/
4852
4853 float128 int64_to_float128(int64_t a, float_status *status)
4854 {
4855 bool zSign;
4856 uint64_t absA;
4857 int8_t shiftCount;
4858 int32_t zExp;
4859 uint64_t zSig0, zSig1;
4860
4861 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4862 zSign = ( a < 0 );
4863 absA = zSign ? - a : a;
4864 shiftCount = clz64(absA) + 49;
4865 zExp = 0x406E - shiftCount;
4866 if ( 64 <= shiftCount ) {
4867 zSig1 = 0;
4868 zSig0 = absA;
4869 shiftCount -= 64;
4870 }
4871 else {
4872 zSig1 = absA;
4873 zSig0 = 0;
4874 }
4875 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4876 return packFloat128( zSign, zExp, zSig0, zSig1 );
4877
4878 }
4879
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the 64-bit unsigned integer `a'
4882 | to the quadruple-precision floating-point format. The conversion is performed
4883 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4885
4886 float128 uint64_to_float128(uint64_t a, float_status *status)
4887 {
4888 if (a == 0) {
4889 return float128_zero;
4890 }
4891 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4892 }
4893
4894 /*----------------------------------------------------------------------------
4895 | Returns the result of converting the single-precision floating-point value
4896 | `a' to the extended double-precision floating-point format. The conversion
4897 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4898 | Arithmetic.
4899 *----------------------------------------------------------------------------*/
4900
floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    /* Exponent 0xFF encodes NaN (nonzero fraction) or infinity. */
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Convert the NaN payload, then silence any signaling NaN. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        /* Subnormal input: normalize so the widening below is exact. */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the implicit integer bit explicit (floatx80 stores it), widen
     * the 24-bit significand to 64 bits, and rebias the exponent
     * (0x3F80 = floatx80 bias 0x3FFF - float32 bias 0x7F). */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}
4929
4930 /*----------------------------------------------------------------------------
4931 | Returns the result of converting the single-precision floating-point value
4932 | `a' to the double-precision floating-point format. The conversion is
4933 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4934 | Arithmetic.
4935 *----------------------------------------------------------------------------*/
4936
4937 float128 float32_to_float128(float32 a, float_status *status)
4938 {
4939 bool aSign;
4940 int aExp;
4941 uint32_t aSig;
4942
4943 a = float32_squash_input_denormal(a, status);
4944 aSig = extractFloat32Frac( a );
4945 aExp = extractFloat32Exp( a );
4946 aSign = extractFloat32Sign( a );
4947 if ( aExp == 0xFF ) {
4948 if (aSig) {
4949 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4950 }
4951 return packFloat128( aSign, 0x7FFF, 0, 0 );
4952 }
4953 if ( aExp == 0 ) {
4954 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4955 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4956 --aExp;
4957 }
4958 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4959
4960 }
4961
4962 /*----------------------------------------------------------------------------
4963 | Returns the remainder of the single-precision floating-point value `a'
4964 | with respect to the corresponding value `b'. The operation is performed
4965 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* NaN or infinite `a', or NaN `b': propagate NaNs; rem(inf, x) is
     * an invalid operation. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* rem(x, inf) == x; rem(x, NaN) propagates the NaN. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(x, 0) is an invalid operation. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 32-bit division step suffices.
         * aSig is left with the partial remainder scaled by 2^(expDiff-1),
         * and q with the low quotient bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: the remainder is simply a. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: iterate 62-bit quotient chunks using
         * the 128/64 division estimate until expDiff is small. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* The estimate may be up to 2 too large; bias it low so the
             * remainder stays nonnegative. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        /* Final partial step for the remaining 0..62 bits of quotient. */
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient so aSig goes negative, keeping the last
     * nonnegative remainder in alternateASig. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Pick whichever remainder is nearer to zero; on a tie choose the one
     * matching an even quotient (IEEE remainder is round-to-nearest-even). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5063
5064
5065
5066 /*----------------------------------------------------------------------------
5067 | Returns the binary exponential of the single-precision floating-point value
5068 | `a'. The operation is performed according to the IEC/IEEE Standard for
5069 | Binary Floating-Point Arithmetic.
5070 |
5071 | Uses the following identities:
5072 |
5073 | 1. -------------------------------------------------------------------------
5074 | x x*ln(2)
5075 | 2 = e
5076 |
5077 | 2. -------------------------------------------------------------------------
5078 | 2 3 4 5 n
5079 | x x x x x x x
5080 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5081 | 1! 2! 3! 4! 5! n!
5082 *----------------------------------------------------------------------------*/
5083
/* Taylor-series coefficients for e^x used by float32_exp2():
 * entry [i] holds 1/(i+1)! encoded as an IEEE double constant. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5102
float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    /* Inf or NaN input. */
    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = 0; 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    /* 2^(+/-0) = 1. */
    if (aExp == 0) {
        if (aSig == 0) return float32_one;
    }

    /* For any other finite input the result is inexact. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2^a = e^(a * ln 2); evaluate e^y with its 15-term Taylor series. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        /* r += x^(i+1) * (1/(i+1)!); the table holds the reciprocals. */
        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
5147
5148 /*----------------------------------------------------------------------------
5149 | Returns the binary log of the single-precision floating-point value `a'.
5150 | The operation is performed according to the IEC/IEEE Standard for Binary
5151 | Floating-Point Arithmetic.
5152 *----------------------------------------------------------------------------*/
5153 float32 float32_log2(float32 a, float_status *status)
5154 {
5155 bool aSign, zSign;
5156 int aExp;
5157 uint32_t aSig, zSig, i;
5158
5159 a = float32_squash_input_denormal(a, status);
5160 aSig = extractFloat32Frac( a );
5161 aExp = extractFloat32Exp( a );
5162 aSign = extractFloat32Sign( a );
5163
5164 if ( aExp == 0 ) {
5165 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5166 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5167 }
5168 if ( aSign ) {
5169 float_raise(float_flag_invalid, status);
5170 return float32_default_nan(status);
5171 }
5172 if ( aExp == 0xFF ) {
5173 if (aSig) {
5174 return propagateFloat32NaN(a, float32_zero, status);
5175 }
5176 return a;
5177 }
5178
5179 aExp -= 0x7F;
5180 aSig |= 0x00800000;
5181 zSign = aExp < 0;
5182 zSig = aExp << 23;
5183
5184 for (i = 1 << 22; i > 0; i >>= 1) {
5185 aSig = ( (uint64_t)aSig * aSig ) >> 23;
5186 if ( aSig & 0x01000000 ) {
5187 aSig >>= 1;
5188 zSig |= i;
5189 }
5190 }
5191
5192 if ( zSign )
5193 zSig = -zSig;
5194
5195 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5196 }
5197
5198 /*----------------------------------------------------------------------------
5199 | Returns the result of converting the double-precision floating-point value
5200 | `a' to the extended double-precision floating-point format. The conversion
5201 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5202 | Arithmetic.
5203 *----------------------------------------------------------------------------*/
5204
5205 floatx80 float64_to_floatx80(float64 a, float_status *status)
5206 {
5207 bool aSign;
5208 int aExp;
5209 uint64_t aSig;
5210
5211 a = float64_squash_input_denormal(a, status);
5212 aSig = extractFloat64Frac( a );
5213 aExp = extractFloat64Exp( a );
5214 aSign = extractFloat64Sign( a );
5215 if ( aExp == 0x7FF ) {
5216 if (aSig) {
5217 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5218 status);
5219 return floatx80_silence_nan(res, status);
5220 }
5221 return packFloatx80(aSign,
5222 floatx80_infinity_high,
5223 floatx80_infinity_low);
5224 }
5225 if ( aExp == 0 ) {
5226 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5227 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5228 }
5229 return
5230 packFloatx80(
5231 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5232
5233 }
5234
5235 /*----------------------------------------------------------------------------
5236 | Returns the result of converting the double-precision floating-point value
5237 | `a' to the quadruple-precision floating-point format. The conversion is
5238 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5239 | Arithmetic.
5240 *----------------------------------------------------------------------------*/
5241
5242 float128 float64_to_float128(float64 a, float_status *status)
5243 {
5244 bool aSign;
5245 int aExp;
5246 uint64_t aSig, zSig0, zSig1;
5247
5248 a = float64_squash_input_denormal(a, status);
5249 aSig = extractFloat64Frac( a );
5250 aExp = extractFloat64Exp( a );
5251 aSign = extractFloat64Sign( a );
5252 if ( aExp == 0x7FF ) {
5253 if (aSig) {
5254 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5255 }
5256 return packFloat128( aSign, 0x7FFF, 0, 0 );
5257 }
5258 if ( aExp == 0 ) {
5259 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5260 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5261 --aExp;
5262 }
5263 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5264 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5265
5266 }
5267
5268
5269 /*----------------------------------------------------------------------------
5270 | Returns the remainder of the double-precision floating-point value `a'
5271 | with respect to the corresponding value `b'. The operation is performed
5272 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5274
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    /* Flush input denormals to zero if the status flags request it. */
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is NaN or Inf: NaNs propagate; rem(Inf, y) is invalid. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(x, Inf) = x for finite x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-align both
     * significands in 64 bits. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;   /* |a| < |b|/2: remainder is a */
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce iteratively with estimated 64-bit quotients, peeling off
     * up to 62 exponent bits per step. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;      /* underestimate: keep rest >= 0 */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Final reduction: pick the remainder nearest zero, breaking ties so
     * the implied quotient is even (IEEE remainder semantics). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
}
5356
5357 /*----------------------------------------------------------------------------
5358 | Returns the binary log of the double-precision floating-point value `a'.
5359 | The operation is performed according to the IEC/IEEE Standard for Binary
5360 | Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent. */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    /* Generate fraction bits by repeated squaring of the significand:
     * each squaring doubles the fractional exponent, and a carry into
     * bit 53 contributes a 1 bit to the result. */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    /* zSig is the result scaled by 2^52; exponent 0x408 accounts for it. */
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}
5405
5406 /*----------------------------------------------------------------------------
5407 | Returns the result of converting the extended double-precision floating-
5408 | point value `a' to the 32-bit two's complement integer format. The
5409 | conversion is performed according to the IEC/IEEE Standard for Binary
5410 | Floating-Point Arithmetic---which means in particular that the conversion
5411 | is rounded according to the current rounding mode. If `a' is a NaN, the
5412 | largest positive integer is returned. Otherwise, if the conversion
5413 | overflows, the largest integer with the same sign as `a' is returned.
5414 *----------------------------------------------------------------------------*/
5415
5416 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5417 {
5418 bool aSign;
5419 int32_t aExp, shiftCount;
5420 uint64_t aSig;
5421
5422 if (floatx80_invalid_encoding(a)) {
5423 float_raise(float_flag_invalid, status);
5424 return 1 << 31;
5425 }
5426 aSig = extractFloatx80Frac( a );
5427 aExp = extractFloatx80Exp( a );
5428 aSign = extractFloatx80Sign( a );
5429 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5430 shiftCount = 0x4037 - aExp;
5431 if ( shiftCount <= 0 ) shiftCount = 1;
5432 shift64RightJamming( aSig, shiftCount, &aSig );
5433 return roundAndPackInt32(aSign, aSig, status);
5434
5435 }
5436
5437 /*----------------------------------------------------------------------------
5438 | Returns the result of converting the extended double-precision floating-
5439 | point value `a' to the 32-bit two's complement integer format. The
5440 | conversion is performed according to the IEC/IEEE Standard for Binary
5441 | Floating-Point Arithmetic, except that the conversion is always rounded
5442 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5443 | Otherwise, if the conversion overflows, the largest integer with the same
5444 | sign as `a' is returned.
5445 *----------------------------------------------------------------------------*/
5446
5447 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5448 {
5449 bool aSign;
5450 int32_t aExp, shiftCount;
5451 uint64_t aSig, savedASig;
5452 int32_t z;
5453
5454 if (floatx80_invalid_encoding(a)) {
5455 float_raise(float_flag_invalid, status);
5456 return 1 << 31;
5457 }
5458 aSig = extractFloatx80Frac( a );
5459 aExp = extractFloatx80Exp( a );
5460 aSign = extractFloatx80Sign( a );
5461 if ( 0x401E < aExp ) {
5462 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5463 goto invalid;
5464 }
5465 else if ( aExp < 0x3FFF ) {
5466 if (aExp || aSig) {
5467 float_raise(float_flag_inexact, status);
5468 }
5469 return 0;
5470 }
5471 shiftCount = 0x403E - aExp;
5472 savedASig = aSig;
5473 aSig >>= shiftCount;
5474 z = aSig;
5475 if ( aSign ) z = - z;
5476 if ( ( z < 0 ) ^ aSign ) {
5477 invalid:
5478 float_raise(float_flag_invalid, status);
5479 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5480 }
5481 if ( ( aSig<<shiftCount ) != savedASig ) {
5482 float_raise(float_flag_inexact, status);
5483 }
5484 return z;
5485
5486 }
5487
5488 /*----------------------------------------------------------------------------
5489 | Returns the result of converting the extended double-precision floating-
5490 | point value `a' to the 64-bit two's complement integer format. The
5491 | conversion is performed according to the IEC/IEEE Standard for Binary
5492 | Floating-Point Arithmetic---which means in particular that the conversion
5493 | is rounded according to the current rounding mode. If `a' is a NaN,
5494 | the largest positive integer is returned. Otherwise, if the conversion
5495 | overflows, the largest integer with the same sign as `a' is returned.
5496 *----------------------------------------------------------------------------*/
5497
5498 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5499 {
5500 bool aSign;
5501 int32_t aExp, shiftCount;
5502 uint64_t aSig, aSigExtra;
5503
5504 if (floatx80_invalid_encoding(a)) {
5505 float_raise(float_flag_invalid, status);
5506 return 1ULL << 63;
5507 }
5508 aSig = extractFloatx80Frac( a );
5509 aExp = extractFloatx80Exp( a );
5510 aSign = extractFloatx80Sign( a );
5511 shiftCount = 0x403E - aExp;
5512 if ( shiftCount <= 0 ) {
5513 if ( shiftCount ) {
5514 float_raise(float_flag_invalid, status);
5515 if (!aSign || floatx80_is_any_nan(a)) {
5516 return INT64_MAX;
5517 }
5518 return INT64_MIN;
5519 }
5520 aSigExtra = 0;
5521 }
5522 else {
5523 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5524 }
5525 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5526
5527 }
5528
5529 /*----------------------------------------------------------------------------
5530 | Returns the result of converting the extended double-precision floating-
5531 | point value `a' to the 64-bit two's complement integer format. The
5532 | conversion is performed according to the IEC/IEEE Standard for Binary
5533 | Floating-Point Arithmetic, except that the conversion is always rounded
5534 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5535 | Otherwise, if the conversion overflows, the largest integer with the same
5536 | sign as `a' is returned.
5537 *----------------------------------------------------------------------------*/
5538
5539 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5540 {
5541 bool aSign;
5542 int32_t aExp, shiftCount;
5543 uint64_t aSig;
5544 int64_t z;
5545
5546 if (floatx80_invalid_encoding(a)) {
5547 float_raise(float_flag_invalid, status);
5548 return 1ULL << 63;
5549 }
5550 aSig = extractFloatx80Frac( a );
5551 aExp = extractFloatx80Exp( a );
5552 aSign = extractFloatx80Sign( a );
5553 shiftCount = aExp - 0x403E;
5554 if ( 0 <= shiftCount ) {
5555 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5556 if ( ( a.high != 0xC03E ) || aSig ) {
5557 float_raise(float_flag_invalid, status);
5558 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5559 return INT64_MAX;
5560 }
5561 }
5562 return INT64_MIN;
5563 }
5564 else if ( aExp < 0x3FFF ) {
5565 if (aExp | aSig) {
5566 float_raise(float_flag_inexact, status);
5567 }
5568 return 0;
5569 }
5570 z = aSig>>( - shiftCount );
5571 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5572 float_raise(float_flag_inexact, status);
5573 }
5574 if ( aSign ) z = - z;
5575 return z;
5576
5577 }
5578
5579 /*----------------------------------------------------------------------------
5580 | Returns the result of converting the extended double-precision floating-
5581 | point value `a' to the single-precision floating-point format. The
5582 | conversion is performed according to the IEC/IEEE Standard for Binary
5583 | Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585
5586 float32 floatx80_to_float32(floatx80 a, float_status *status)
5587 {
5588 bool aSign;
5589 int32_t aExp;
5590 uint64_t aSig;
5591
5592 if (floatx80_invalid_encoding(a)) {
5593 float_raise(float_flag_invalid, status);
5594 return float32_default_nan(status);
5595 }
5596 aSig = extractFloatx80Frac( a );
5597 aExp = extractFloatx80Exp( a );
5598 aSign = extractFloatx80Sign( a );
5599 if ( aExp == 0x7FFF ) {
5600 if ( (uint64_t) ( aSig<<1 ) ) {
5601 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5602 status);
5603 return float32_silence_nan(res, status);
5604 }
5605 return packFloat32( aSign, 0xFF, 0 );
5606 }
5607 shift64RightJamming( aSig, 33, &aSig );
5608 if ( aExp || aSig ) aExp -= 0x3F81;
5609 return roundAndPackFloat32(aSign, aExp, aSig, status);
5610
5611 }
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the extended double-precision floating-
5615 | point value `a' to the double-precision floating-point format. The
5616 | conversion is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619
5620 float64 floatx80_to_float64(floatx80 a, float_status *status)
5621 {
5622 bool aSign;
5623 int32_t aExp;
5624 uint64_t aSig, zSig;
5625
5626 if (floatx80_invalid_encoding(a)) {
5627 float_raise(float_flag_invalid, status);
5628 return float64_default_nan(status);
5629 }
5630 aSig = extractFloatx80Frac( a );
5631 aExp = extractFloatx80Exp( a );
5632 aSign = extractFloatx80Sign( a );
5633 if ( aExp == 0x7FFF ) {
5634 if ( (uint64_t) ( aSig<<1 ) ) {
5635 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5636 status);
5637 return float64_silence_nan(res, status);
5638 }
5639 return packFloat64( aSign, 0x7FF, 0 );
5640 }
5641 shift64RightJamming( aSig, 1, &zSig );
5642 if ( aExp || aSig ) aExp -= 0x3C01;
5643 return roundAndPackFloat64(aSign, aExp, zSig, status);
5644
5645 }
5646
5647 /*----------------------------------------------------------------------------
5648 | Returns the result of converting the extended double-precision floating-
5649 | point value `a' to the quadruple-precision floating-point format. The
5650 | conversion is performed according to the IEC/IEEE Standard for Binary
5651 | Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5653
5654 float128 floatx80_to_float128(floatx80 a, float_status *status)
5655 {
5656 bool aSign;
5657 int aExp;
5658 uint64_t aSig, zSig0, zSig1;
5659
5660 if (floatx80_invalid_encoding(a)) {
5661 float_raise(float_flag_invalid, status);
5662 return float128_default_nan(status);
5663 }
5664 aSig = extractFloatx80Frac( a );
5665 aExp = extractFloatx80Exp( a );
5666 aSign = extractFloatx80Sign( a );
5667 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5668 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5669 status);
5670 return float128_silence_nan(res, status);
5671 }
5672 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5673 return packFloat128( aSign, aExp, zSig0, zSig1 );
5674
5675 }
5676
5677 /*----------------------------------------------------------------------------
5678 | Rounds the extended double-precision floating-point value `a'
5679 | to the precision provided by floatx80_rounding_precision and returns the
5680 | result as an extended double-precision floating-point value.
5681 | The operation is performed according to the IEC/IEEE Standard for Binary
5682 | Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5684
5685 floatx80 floatx80_round(floatx80 a, float_status *status)
5686 {
5687 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5688 extractFloatx80Sign(a),
5689 extractFloatx80Exp(a),
5690 extractFloatx80Frac(a), 0, status);
5691 }
5692
5693 /*----------------------------------------------------------------------------
5694 | Rounds the extended double-precision floating-point value `a' to an integer,
5695 | and returns the result as an extended quadruple-precision floating-point
5696 | value. The operation is performed according to the IEC/IEEE Standard for
5697 | Binary Floating-Point Arithmetic.
5698 *----------------------------------------------------------------------------*/
5699
5700 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5701 {
5702 bool aSign;
5703 int32_t aExp;
5704 uint64_t lastBitMask, roundBitsMask;
5705 floatx80 z;
5706
5707 if (floatx80_invalid_encoding(a)) {
5708 float_raise(float_flag_invalid, status);
5709 return floatx80_default_nan(status);
5710 }
5711 aExp = extractFloatx80Exp( a );
5712 if ( 0x403E <= aExp ) {
5713 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5714 return propagateFloatx80NaN(a, a, status);
5715 }
5716 return a;
5717 }
5718 if ( aExp < 0x3FFF ) {
5719 if ( ( aExp == 0 )
5720 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5721 return a;
5722 }
5723 float_raise(float_flag_inexact, status);
5724 aSign = extractFloatx80Sign( a );
5725 switch (status->float_rounding_mode) {
5726 case float_round_nearest_even:
5727 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5728 ) {
5729 return
5730 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5731 }
5732 break;
5733 case float_round_ties_away:
5734 if (aExp == 0x3FFE) {
5735 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5736 }
5737 break;
5738 case float_round_down:
5739 return
5740 aSign ?
5741 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5742 : packFloatx80( 0, 0, 0 );
5743 case float_round_up:
5744 return
5745 aSign ? packFloatx80( 1, 0, 0 )
5746 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5747
5748 case float_round_to_zero:
5749 break;
5750 default:
5751 g_assert_not_reached();
5752 }
5753 return packFloatx80( aSign, 0, 0 );
5754 }
5755 lastBitMask = 1;
5756 lastBitMask <<= 0x403E - aExp;
5757 roundBitsMask = lastBitMask - 1;
5758 z = a;
5759 switch (status->float_rounding_mode) {
5760 case float_round_nearest_even:
5761 z.low += lastBitMask>>1;
5762 if ((z.low & roundBitsMask) == 0) {
5763 z.low &= ~lastBitMask;
5764 }
5765 break;
5766 case float_round_ties_away:
5767 z.low += lastBitMask >> 1;
5768 break;
5769 case float_round_to_zero:
5770 break;
5771 case float_round_up:
5772 if (!extractFloatx80Sign(z)) {
5773 z.low += roundBitsMask;
5774 }
5775 break;
5776 case float_round_down:
5777 if (extractFloatx80Sign(z)) {
5778 z.low += roundBitsMask;
5779 }
5780 break;
5781 default:
5782 abort();
5783 }
5784 z.low &= ~ roundBitsMask;
5785 if ( z.low == 0 ) {
5786 ++z.high;
5787 z.low = UINT64_C(0x8000000000000000);
5788 }
5789 if (z.low != a.low) {
5790 float_raise(float_flag_inexact, status);
5791 }
5792 return z;
5793
5794 }
5795
5796 /*----------------------------------------------------------------------------
5797 | Returns the result of adding the absolute values of the extended double-
5798 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5799 | negated before being returned. `zSign' is ignored if the result is a NaN.
5800 | The addition is performed according to the IEC/IEEE Standard for Binary
5801 | Floating-Point Arithmetic.
5802 *----------------------------------------------------------------------------*/
5803
static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;   /* Inf + finite = Inf */
        }
        /* A denormal's effective exponent is 1, not 0. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result. */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* If the top bit is set the sum kept its explicit integer bit and
     * needs no renormalization; otherwise it carried out, so shift right
     * once and bump the exponent. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
5875
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of subtracting the absolute values of the extended
5878 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5879 | difference is negated before being returned. `zSign' is ignored if the
5880 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5881 | Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5883
static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    /* A denormal's effective exponent is 1, not 0. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: +0, or -0 when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - Inf: result is an infinity of the opposite sign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* result takes the opposite sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;   /* Inf - finite = Inf */
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5944
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of adding the extended double-precision floating-point
5947 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950
5951 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5952 {
5953 bool aSign, bSign;
5954
5955 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5956 float_raise(float_flag_invalid, status);
5957 return floatx80_default_nan(status);
5958 }
5959 aSign = extractFloatx80Sign( a );
5960 bSign = extractFloatx80Sign( b );
5961 if ( aSign == bSign ) {
5962 return addFloatx80Sigs(a, b, aSign, status);
5963 }
5964 else {
5965 return subFloatx80Sigs(a, b, aSign, status);
5966 }
5967
5968 }
5969
5970 /*----------------------------------------------------------------------------
5971 | Returns the result of subtracting the extended double-precision floating-
5972 | point values `a' and `b'. The operation is performed according to the
5973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5974 *----------------------------------------------------------------------------*/
5975
5976 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5977 {
5978 bool aSign, bSign;
5979
5980 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5981 float_raise(float_flag_invalid, status);
5982 return floatx80_default_nan(status);
5983 }
5984 aSign = extractFloatx80Sign( a );
5985 bSign = extractFloatx80Sign( b );
5986 if ( aSign == bSign ) {
5987 return subFloatx80Sigs(a, b, aSign, status);
5988 }
5989 else {
5990 return addFloatx80Sigs(a, b, aSign, status);
5991 }
5992
5993 }
5994
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of multiplying the extended double-precision floating-
5997 | point values `a' and `b'. The operation is performed according to the
5998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5999 *----------------------------------------------------------------------------*/
6000
floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Unsupported x87 encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* Sign of the product is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or infinity.  `sig<<1' discards the explicit
           integer bit, so a nonzero result means a NaN fraction. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* infinity * zero is an invalid operation. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* zero * infinity is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Normalize subnormal operands (or return a signed zero). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* Exponent of the 128-bit significand product; -0x3FFE removes the
       doubled bias (helper below adjusts for rounding precision). */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two normalized significands is in [1, 4); if the top
       bit of zSig0 is clear, shift left once to renormalize. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6056
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of dividing the extended double-precision floating-point
6059 | value `a' by the corresponding value `b'. The operation is performed
6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6061 *----------------------------------------------------------------------------*/
6062
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Unsupported x87 encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is NaN or infinity (sig<<1 drops the explicit integer bit). */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        /* infinity / finite = infinity. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity = signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0/0 is invalid; x/0 signals division by zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Keep the dividend significand strictly below the divisor so the
       estimated quotient digit stays in range. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* Estimate the upper 64 quotient bits, then correct downward until
       the partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Lower 64 quotient bits; only refine when they could affect
       rounding (tiny low part), otherwise the estimate suffices. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any sticky remainder into the last bit for rounding. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6143
6144 /*----------------------------------------------------------------------------
6145 | Returns the remainder of the extended double-precision floating-point value
6146 | `a' with respect to the corresponding value `b'. The operation is performed
6147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6148 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6149 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6150 | the absolute value of the integer quotient.
6151 *----------------------------------------------------------------------------*/
6152
floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Unsupported x87 encodings are invalid operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* Keep the original exponent so the pseudo-denormal case below can
       be distinguished after normalization. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; remainder of infinity is invalid. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* x rem infinity = x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with a zero divisor is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: the remainder is `a' itself (always for the
           truncating 'mod' form; for 'rem' only when expDiff < -1,
           since at -1 the round-to-nearest quotient may still be 1). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit: 1 iff the aligned dividend >= divisor. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Produce quotient digits 62 bits at a time; the estimate is biased
       down by 2 so the partial remainder never goes negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial digit of fewer than 64 bits, then an exact
           correction loop to finish the quotient. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        /* Only the low 64 bits of the quotient are reported. */
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: pick the representative nearest zero, ties to
           even quotient; this may flip the sign of the result. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    /* Result is exact; pack at full 80-bit precision. */
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}
6273
6274 /*----------------------------------------------------------------------------
6275 | Returns the remainder of the extended double-precision floating-point value
6276 | `a' with respect to the corresponding value `b'. The operation is performed
6277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6279
6280 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6281 {
6282 uint64_t quotient;
6283 return floatx80_modrem(a, b, false, &quotient, status);
6284 }
6285
6286 /*----------------------------------------------------------------------------
6287 | Returns the remainder of the extended double-precision floating-point value
6288 | `a' with respect to the corresponding value `b', with the quotient truncated
6289 | toward zero.
6290 *----------------------------------------------------------------------------*/
6291
6292 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6293 {
6294 uint64_t quotient;
6295 return floatx80_modrem(a, b, true, &quotient, status);
6296 }
6297
6298 /*----------------------------------------------------------------------------
6299 | Returns the square root of the extended double-precision floating-point
6300 | value `a'. The operation is performed according to the IEC/IEEE Standard
6301 | for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6303
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unsupported x87 encodings are invalid operands. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+inf) = +inf; sqrt(-inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative argument is invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent is about half the (unbiased) input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* 32-bit seed estimate, refined to 64 bits via one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high 64 bits downward until rem = a - z*z >= 0. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 bits; only refine when they could affect rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any sticky remainder into the last bit for rounding. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Reassemble the 128-bit significand of the root. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6368
6369 /*----------------------------------------------------------------------------
6370 | Returns the result of converting the quadruple-precision floating-point
6371 | value `a' to the 32-bit two's complement integer format. The conversion
6372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6373 | Arithmetic---which means in particular that the conversion is rounded
6374 | according to the current rounding mode. If `a' is a NaN, the largest
6375 | positive integer is returned. Otherwise, if the conversion overflows, the
6376 | largest integer with the same sign as `a' is returned.
6377 *----------------------------------------------------------------------------*/
6378
int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Force NaN to be treated as positive so roundAndPackInt32 yields
       the largest positive integer. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Restore the implicit integer bit of a normal number. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low significand into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    /* Helper applies the current rounding mode and overflow handling. */
    return roundAndPackInt32(aSign, aSig0, status);

}
6397
6398 /*----------------------------------------------------------------------------
6399 | Returns the result of converting the quadruple-precision floating-point
6400 | value `a' to the 32-bit two's complement integer format. The conversion
6401 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6402 | Arithmetic, except that the conversion is always rounded toward zero. If
6403 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6404 | conversion overflows, the largest integer with the same sign as `a' is
6405 | returned.
6406 *----------------------------------------------------------------------------*/
6407
int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low significand into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude too large for int32 (or NaN, treated as positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; nonzero values are inexact. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* Restore the implicit integer bit and truncate. */
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign flip after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the conversion was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
6447
6448 /*----------------------------------------------------------------------------
6449 | Returns the result of converting the quadruple-precision floating-point
6450 | value `a' to the 64-bit two's complement integer format. The conversion
6451 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6452 | Arithmetic---which means in particular that the conversion is rounded
6453 | according to the current rounding mode. If `a' is a NaN, the largest
6454 | positive integer is returned. Otherwise, if the conversion overflows, the
6455 | largest integer with the same sign as `a' is returned.
6456 *----------------------------------------------------------------------------*/
6457
int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit of a normal number. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude exceeds int64: invalid.  NaNs and positive
               values saturate to INT64_MAX; only exactly INT64_MIN
               survives as a negative value. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Right-shift with sticky ("jamming") to preserve inexactness. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    /* Helper applies the current rounding mode and overflow handling. */
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}
6490
6491 /*----------------------------------------------------------------------------
6492 | Returns the result of converting the quadruple-precision floating-point
6493 | value `a' to the 64-bit two's complement integer format. The conversion
6494 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6495 | Arithmetic, except that the conversion is always rounded toward zero.
6496 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6497 | the conversion overflows, the largest integer with the same sign as `a' is
6498 | returned.
6499 *----------------------------------------------------------------------------*/
6500
int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Restore the implicit integer bit of a normal number. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Exactly -2^63 (with a possibly-inexact low part) is the
               one in-range value at this magnitude. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Out of range: invalid; NaNs and positive values
                   saturate to INT64_MAX. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Assemble the integer from both significand halves. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to 0; nonzero values are inexact. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Discarded bits from either half mean inexact. */
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6553
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point value
6556 | `a' to the 64-bit unsigned integer format. The conversion is
6557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode. If `a' is a NaN, the largest
6560 | positive integer is returned. If the conversion overflows, the
6561 | largest unsigned integer is returned. If 'a' is negative, the value is
6562 | rounded and zero is returned; negative values that do not round to zero
6563 | will raise the inexact exception.
6564 *----------------------------------------------------------------------------*/
6565
uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values that cannot round to zero (|a| > 0.5, including
       -inf and NaN) are invalid; NaN saturates high, others to 0. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Restore the implicit integer bit of a normal number. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude exceeds uint64: invalid, saturate. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Right-shift with sticky ("jamming") to preserve inexactness. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    /* Helper applies the current rounding mode and overflow handling. */
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}
6600
6601 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6602 {
6603 uint64_t v;
6604 signed char current_rounding_mode = status->float_rounding_mode;
6605
6606 set_float_rounding_mode(float_round_to_zero, status);
6607 v = float128_to_uint64(a, status);
6608 set_float_rounding_mode(current_rounding_mode, status);
6609
6610 return v;
6611 }
6612
6613 /*----------------------------------------------------------------------------
6614 | Returns the result of converting the quadruple-precision floating-point
6615 | value `a' to the 32-bit unsigned integer format. The conversion
6616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6617 | Arithmetic except that the conversion is always rounded toward zero.
6618 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6619 | if the conversion overflows, the largest unsigned integer is returned.
6620 | If 'a' is negative, the value is rounded and zero is returned; negative
6621 | values that do not round to zero will raise the inexact exception.
6622 *----------------------------------------------------------------------------*/
6623
6624 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6625 {
6626 uint64_t v;
6627 uint32_t res;
6628 int old_exc_flags = get_float_exception_flags(status);
6629
6630 v = float128_to_uint64_round_to_zero(a, status);
6631 if (v > 0xffffffff) {
6632 res = 0xffffffff;
6633 } else {
6634 return v;
6635 }
6636 set_float_exception_flags(old_exc_flags, status);
6637 float_raise(float_flag_invalid, status);
6638 return res;
6639 }
6640
6641 /*----------------------------------------------------------------------------
6642 | Returns the result of converting the quadruple-precision floating-point value
6643 | `a' to the 32-bit unsigned integer format. The conversion is
6644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6645 | Arithmetic---which means in particular that the conversion is rounded
6646 | according to the current rounding mode. If `a' is a NaN, the largest
6647 | positive integer is returned. If the conversion overflows, the
6648 | largest unsigned integer is returned. If 'a' is negative, the value is
6649 | rounded and zero is returned; negative values that do not round to zero
6650 | will raise the inexact exception.
6651 *----------------------------------------------------------------------------*/
6652
6653 uint32_t float128_to_uint32(float128 a, float_status *status)
6654 {
6655 uint64_t v;
6656 uint32_t res;
6657 int old_exc_flags = get_float_exception_flags(status);
6658
6659 v = float128_to_uint64(a, status);
6660 if (v > 0xffffffff) {
6661 res = 0xffffffff;
6662 } else {
6663 return v;
6664 }
6665 set_float_exception_flags(old_exc_flags, status);
6666 float_raise(float_flag_invalid, status);
6667 return res;
6668 }
6669
6670 /*----------------------------------------------------------------------------
6671 | Returns the result of converting the quadruple-precision floating-point
6672 | value `a' to the single-precision floating-point format. The conversion
6673 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6674 | Arithmetic.
6675 *----------------------------------------------------------------------------*/
6676
float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN converts through the common-NaN form; infinity packs
           directly with the float32 max exponent. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low significand into a sticky bit, then narrow the
       fraction to fit the 32-bit path. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    /* For nonzero values, make the integer bit explicit and rebias the
       exponent for roundAndPackFloat32. */
    if ( aExp || zSig ) {
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}
6704
6705 /*----------------------------------------------------------------------------
6706 | Returns the result of converting the quadruple-precision floating-point
6707 | value `a' to the double-precision floating-point format. The conversion
6708 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6709 | Arithmetic.
6710 *----------------------------------------------------------------------------*/
6711
float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN converts through the common-NaN form; infinity packs
           directly with the float64 max exponent. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Left-align the 112-bit fraction, folding the remaining low bits
       into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    /* For nonzero values, make the integer bit explicit and rebias the
       exponent for roundAndPackFloat64. */
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}
6737
6738 /*----------------------------------------------------------------------------
6739 | Returns the result of converting the quadruple-precision floating-point
6740 | value `a' to the extended double-precision floating-point format. The
6741 | conversion is performed according to the IEC/IEEE Standard for Binary
6742 | Floating-Point Arithmetic.
6743 *----------------------------------------------------------------------------*/
6744
floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Convert through the common-NaN form, then silence it
               (the x80 result is a quiet NaN). */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        /* Zero passes through; subnormals are normalized first. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Restore the implicit integer bit of a normal number. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Left-align to the x80 significand position; rounding at full
       80-bit precision makes the explicit integer bit correct. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
6775
6776 /*----------------------------------------------------------------------------
6777 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6778 | returns the result as a quadruple-precision floating-point value. The
6779 | operation is performed according to the IEC/IEEE Standard for Binary
6780 | Floating-Point Arithmetic.
6781 *----------------------------------------------------------------------------*/
6782
6783 float128 float128_round_to_int(float128 a, float_status *status)
6784 {
6785 bool aSign;
6786 int32_t aExp;
6787 uint64_t lastBitMask, roundBitsMask;
6788 float128 z;
6789
6790 aExp = extractFloat128Exp( a );
6791 if ( 0x402F <= aExp ) {
6792 if ( 0x406F <= aExp ) {
6793 if ( ( aExp == 0x7FFF )
6794 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6795 ) {
6796 return propagateFloat128NaN(a, a, status);
6797 }
6798 return a;
6799 }
6800 lastBitMask = 1;
6801 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6802 roundBitsMask = lastBitMask - 1;
6803 z = a;
6804 switch (status->float_rounding_mode) {
6805 case float_round_nearest_even:
6806 if ( lastBitMask ) {
6807 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6808 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6809 }
6810 else {
6811 if ( (int64_t) z.low < 0 ) {
6812 ++z.high;
6813 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6814 }
6815 }
6816 break;
6817 case float_round_ties_away:
6818 if (lastBitMask) {
6819 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6820 } else {
6821 if ((int64_t) z.low < 0) {
6822 ++z.high;
6823 }
6824 }
6825 break;
6826 case float_round_to_zero:
6827 break;
6828 case float_round_up:
6829 if (!extractFloat128Sign(z)) {
6830 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6831 }
6832 break;
6833 case float_round_down:
6834 if (extractFloat128Sign(z)) {
6835 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6836 }
6837 break;
6838 case float_round_to_odd:
6839 /*
6840 * Note that if lastBitMask == 0, the last bit is the lsb
6841 * of high, and roundBitsMask == -1.
6842 */
6843 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6844 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6845 }
6846 break;
6847 default:
6848 abort();
6849 }
6850 z.low &= ~ roundBitsMask;
6851 }
6852 else {
6853 if ( aExp < 0x3FFF ) {
6854 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6855 float_raise(float_flag_inexact, status);
6856 aSign = extractFloat128Sign( a );
6857 switch (status->float_rounding_mode) {
6858 case float_round_nearest_even:
6859 if ( ( aExp == 0x3FFE )
6860 && ( extractFloat128Frac0( a )
6861 | extractFloat128Frac1( a ) )
6862 ) {
6863 return packFloat128( aSign, 0x3FFF, 0, 0 );
6864 }
6865 break;
6866 case float_round_ties_away:
6867 if (aExp == 0x3FFE) {
6868 return packFloat128(aSign, 0x3FFF, 0, 0);
6869 }
6870 break;
6871 case float_round_down:
6872 return
6873 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6874 : packFloat128( 0, 0, 0, 0 );
6875 case float_round_up:
6876 return
6877 aSign ? packFloat128( 1, 0, 0, 0 )
6878 : packFloat128( 0, 0x3FFF, 0, 0 );
6879
6880 case float_round_to_odd:
6881 return packFloat128(aSign, 0x3FFF, 0, 0);
6882
6883 case float_round_to_zero:
6884 break;
6885 }
6886 return packFloat128( aSign, 0, 0, 0 );
6887 }
6888 lastBitMask = 1;
6889 lastBitMask <<= 0x402F - aExp;
6890 roundBitsMask = lastBitMask - 1;
6891 z.low = 0;
6892 z.high = a.high;
6893 switch (status->float_rounding_mode) {
6894 case float_round_nearest_even:
6895 z.high += lastBitMask>>1;
6896 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6897 z.high &= ~ lastBitMask;
6898 }
6899 break;
6900 case float_round_ties_away:
6901 z.high += lastBitMask>>1;
6902 break;
6903 case float_round_to_zero:
6904 break;
6905 case float_round_up:
6906 if (!extractFloat128Sign(z)) {
6907 z.high |= ( a.low != 0 );
6908 z.high += roundBitsMask;
6909 }
6910 break;
6911 case float_round_down:
6912 if (extractFloat128Sign(z)) {
6913 z.high |= (a.low != 0);
6914 z.high += roundBitsMask;
6915 }
6916 break;
6917 case float_round_to_odd:
6918 if ((z.high & lastBitMask) == 0) {
6919 z.high |= (a.low != 0);
6920 z.high += roundBitsMask;
6921 }
6922 break;
6923 default:
6924 abort();
6925 }
6926 z.high &= ~ roundBitsMask;
6927 }
6928 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6929 float_raise(float_flag_inexact, status);
6930 }
6931 return z;
6932
6933 }
6934
6935 /*----------------------------------------------------------------------------
6936 | Returns the result of adding the absolute values of the quadruple-precision
6937 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6938 | before being returned. `zSign' is ignored if the result is a NaN.
6939 | The addition is performed according to the IEC/IEEE Standard for Binary
6940 | Floating-Point Arithmetic.
6941 *----------------------------------------------------------------------------*/
6942
static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            /* a is NaN (propagate) or infinity (return as-is). */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b subnormal: no implicit integer bit; shift one bit less. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);  /* implicit integer bit */
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both inputs subnormal or zero: the sum is exact. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of two normals with equal exponents always carries: the
         * combined integer bits land here (0x0002...). */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the integer bit: result already normalized. */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    /* Carry out of the integer bit: renormalize by one, jamming the
     * shifted-out bit into the sticky word zSig2. */
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7025
7026 /*----------------------------------------------------------------------------
7027 | Returns the result of subtracting the absolute values of the quadruple-
7028 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7029 | difference is negated before being returned. `zSign' is ignored if the
7030 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7031 | Standard for Binary Floating-Point Arithmetic.
7032 *----------------------------------------------------------------------------*/
7033
static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left 14 bits so the subtraction keeps
     * extra precision; undone via 'zExp - 14' when packing below. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity - infinity is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat both subnormals as exponent 1 so zExp below is right. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to pick the subtraction order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite - infinity: result is infinity with flipped sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;      /* subnormal a: no implicit integer bit */
    }
    else {
        aSig0 |= UINT64_C(0x4000000000000000);   /* integer bit, post-shift */
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;         /* result takes b's effective sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* 'zExp - 14' compensates for the 14-bit pre-shift above. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7113
7114 /*----------------------------------------------------------------------------
7115 | Returns the result of adding the quadruple-precision floating-point values
7116 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7117 | for Binary Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7119
7120 float128 float128_add(float128 a, float128 b, float_status *status)
7121 {
7122 bool aSign, bSign;
7123
7124 aSign = extractFloat128Sign( a );
7125 bSign = extractFloat128Sign( b );
7126 if ( aSign == bSign ) {
7127 return addFloat128Sigs(a, b, aSign, status);
7128 }
7129 else {
7130 return subFloat128Sigs(a, b, aSign, status);
7131 }
7132
7133 }
7134
7135 /*----------------------------------------------------------------------------
7136 | Returns the result of subtracting the quadruple-precision floating-point
7137 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7138 | Standard for Binary Floating-Point Arithmetic.
7139 *----------------------------------------------------------------------------*/
7140
7141 float128 float128_sub(float128 a, float128 b, float_status *status)
7142 {
7143 bool aSign, bSign;
7144
7145 aSign = extractFloat128Sign( a );
7146 bSign = extractFloat128Sign( b );
7147 if ( aSign == bSign ) {
7148 return subFloat128Sigs(a, b, aSign, status);
7149 }
7150 else {
7151 return addFloat128Sigs(a, b, aSign, status);
7152 }
7153
7154 }
7155
7156 /*----------------------------------------------------------------------------
7157 | Returns the result of multiplying the quadruple-precision floating-point
7158 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7159 | Standard for Binary Floating-Point Arithmetic.
7160 *----------------------------------------------------------------------------*/
7161
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is NaN or infinity. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * 0 is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* 0 * infinity is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);    /* a's implicit integer bit */
    /* Pre-shift b's fraction left 16 before the 256-bit multiply. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* Add aSig once more to account for b's implicit integer bit. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );    /* fold the lowest word into the sticky bit */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product carried past the integer bit: renormalize by one. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7218
7219 /*----------------------------------------------------------------------------
7220 | Returns the result of dividing the quadruple-precision floating-point value
7221 | `a' by the corresponding value `b'. The operation is performed according to
7222 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7223 *----------------------------------------------------------------------------*/
7224
float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity == signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 is invalid; x / 0 raises divide-by-zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Restore implicit integer bits and pre-shift left 15 for precision. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure aSig < bSig so the quotient estimates stay in range. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward until the
     * remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low quotient word: only refine when the estimate is close enough
     * to matter for rounding. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky */
    }
    /* Undo the 15-bit pre-shift, keeping the guard/sticky word zSig2. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7305
7306 /*----------------------------------------------------------------------------
7307 | Returns the remainder of the quadruple-precision floating-point value `a'
7308 | with respect to the corresponding value `b'. The operation is performed
7309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7310 *----------------------------------------------------------------------------*/
7311
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* rem(NaN, x), rem(x, NaN) propagate; rem(inf, x) is invalid. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(finite, inf) == a unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(x, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce a by 61 bits of quotient per iteration; the estimate is
     * biased low by 4 so the partial remainder stays non-negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial quotient, scaled to the remaining exponent gap. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Step past zero: find the first negative partial remainder, keeping
     * the previous (non-negative) one in alternateASig. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Pick whichever of the two candidates is nearer to zero; ties go to
     * the even quotient (round-to-nearest-even remainder semantics). */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7412
7413 /*----------------------------------------------------------------------------
7414 | Returns the square root of the quadruple-precision floating-point value `a'.
7415 | The operation is performed according to the IEC/IEEE Standard for Binary
7416 | Floating-Point Arithmetic.
7417 *----------------------------------------------------------------------------*/
7418
float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;    /* sqrt(+inf) == +inf */
        goto invalid;               /* sqrt(-inf) is invalid */
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;   /* sqrt(-0) == -0 */
 invalid:
        /* Square root of a negative (nonzero) number. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Seed a 32-bit root estimate, refine to 64 bits, then correct the
     * high root word until the remainder is non-negative. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word: estimate and refine only when it is close enough
     * to a rounding boundary to affect the result. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7480
/* Compare two extended-precision values; 'is_quiet' selects whether
 * quiet NaNs raise the invalid exception. */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN operand: max exponent with a nonzero fraction (ignoring the
     * explicit integer bit, hence the <<1). */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        /* Signaling compare, or any signaling NaN, raises invalid. */
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            /* Different signs: the positive one (aSign == 0) is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison. */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: compare bit patterns; flip the result when
             * both operands are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7528
7529 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7530 {
7531 return floatx80_compare_internal(a, b, 0, status);
7532 }
7533
7534 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7535 float_status *status)
7536 {
7537 return floatx80_compare_internal(a, b, 1, status);
7538 }
7539
/* Compare two quadruple-precision values; 'is_quiet' selects whether
 * quiet NaNs raise the invalid exception. */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN operand: max exponent with a nonzero fraction. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        /* Signaling compare, or any signaling NaN, raises invalid. */
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            /* Different signs: the positive one (aSign == 0) is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: compare bit patterns; flip the result when
             * both operands are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}
7574
7575 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7576 {
7577 return float128_compare_internal(a, b, 0, status);
7578 }
7579
7580 FloatRelation float128_compare_quiet(float128 a, float128 b,
7581 float_status *status)
7582 {
7583 return float128_compare_internal(a, b, 1, status);
7584 }
7585
/* Return a * 2**n, rounding per 'status'. */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity is unchanged by scaling. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;       /* zero is unchanged by scaling */
        }
        aExp++;             /* subnormal: effective exponent is 1 */
    }

    /* Clamp n: beyond +/-0x10000 the result saturates to overflow or
     * underflow anyway, and clamping avoids integer overflow in aExp. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}
7624
/* Return a * 2**n, rounding per 'status'. */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity is unchanged by scaling. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        aSig0 |= UINT64_C(0x0001000000000000);   /* implicit integer bit */
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;           /* zero is unchanged by scaling */
    } else {
        aExp++;             /* subnormal: effective exponent is 1 */
    }

    /* Clamp n: beyond +/-0x10000 the result saturates to overflow or
     * underflow anyway, and clamping avoids integer overflow in aExp. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                          , status);

}
7660
7661 static void __attribute__((constructor)) softfloat_init(void)
7662 {
7663 union_float64 ua, ub, uc, ur;
7664
7665 if (QEMU_NO_HARDFLOAT) {
7666 return;
7667 }
7668 /*
7669 * Test that the host's FMA is not obviously broken. For example,
7670 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7671 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7672 */
7673 ua.s = 0x0020000000000001ULL;
7674 ub.s = 0x3ca0000000000000ULL;
7675 uc.s = 0x0020000000000000ULL;
7676 ur.h = fma(ua.h, ub.h, uc.h);
7677 if (ur.s != 0x0020000000000001ULL) {
7678 force_soft_fma = true;
7679 }
7680 }