fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             s->float_exception_flags |= float_flag_input_denormal;      \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
  256 typedef union {
          /* One 32-bit value viewed two ways (type punning through a union). */
  257     float32 s;    /* softfloat representation */
  258     float h;      /* host float, handed to the host FPU */
  259 } union_float32;
  260
  261 typedef union {
          /* One 64-bit value viewed two ways (type punning through a union). */
  262     float64 s;    /* softfloat representation */
  263     double h;     /* host double, handed to the host FPU */
  264 } union_float64;
  265
      /* Predicates over the two operands, used as pre/post checks by *_gen2. */
  266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
  267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
  268
      /* Two-operand implementations: softfloat (takes the status) and host. */
  269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
  270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
  271 typedef float   (*hard_f32_op2_fn)(float a, float b);
  272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 static inline float32
 343 float32_gen2(float32 xa, float32 xb, float_status *s,
 344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 345              f32_check_fn pre, f32_check_fn post)
 346 {
 347     union_float32 ua, ub, ur;
 348
 349     ua.s = xa;
 350     ub.s = xb;
 351
 352     if (unlikely(!can_use_fpu(s))) {
 353         goto soft;
 354     }
 355
 356     float32_input_flush2(&ua.s, &ub.s, s);
 357     if (unlikely(!pre(ua, ub))) {
 358         goto soft;
 359     }
 360
 361     ur.h = hard(ua.h, ub.h);
 362     if (unlikely(f32_is_inf(ur))) {
 363         s->float_exception_flags |= float_flag_overflow;
 364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
 365         goto soft;
 366     }
 367     return ur.s;
 368
 369  soft:
 370     return soft(ua.s, ub.s, s);
 371 }
 372
 373 static inline float64
 374 float64_gen2(float64 xa, float64 xb, float_status *s,
 375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 376              f64_check_fn pre, f64_check_fn post)
 377 {
 378     union_float64 ua, ub, ur;
 379
 380     ua.s = xa;
 381     ub.s = xb;
 382
 383     if (unlikely(!can_use_fpu(s))) {
 384         goto soft;
 385     }
 386
 387     float64_input_flush2(&ua.s, &ub.s, s);
 388     if (unlikely(!pre(ua, ub))) {
 389         goto soft;
 390     }
 391
 392     ur.h = hard(ua.h, ub.h);
 393     if (unlikely(f64_is_inf(ur))) {
 394         s->float_exception_flags |= float_flag_overflow;
 395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
 396         goto soft;
 397     }
 398     return ur.s;
 399
 400  soft:
 401     return soft(ua.s, ub.s, s);
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the fraction bits of the single-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline uint32_t extractFloat32Frac(float32 a)
 409 {
 410     return float32_val(a) & 0x007FFFFF;
 411 }
 412
 413 /*----------------------------------------------------------------------------
 414 | Returns the exponent bits of the single-precision floating-point value `a'.
 415 *----------------------------------------------------------------------------*/
 416
 417 static inline int extractFloat32Exp(float32 a)
 418 {
 419     return (float32_val(a) >> 23) & 0xFF;
 420 }
 421
 422 /*----------------------------------------------------------------------------
 423 | Returns the sign bit of the single-precision floating-point value `a'.
 424 *----------------------------------------------------------------------------*/
 425
 426 static inline bool extractFloat32Sign(float32 a)
 427 {
 428     return float32_val(a) >> 31;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the fraction bits of the double-precision floating-point value `a'.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline uint64_t extractFloat64Frac(float64 a)
 436 {
 437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 438 }
 439
 440 /*----------------------------------------------------------------------------
 441 | Returns the exponent bits of the double-precision floating-point value `a'.
 442 *----------------------------------------------------------------------------*/
 443
 444 static inline int extractFloat64Exp(float64 a)
 445 {
 446     return (float64_val(a) >> 52) & 0x7FF;
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Returns the sign bit of the double-precision floating-point value `a'.
 451 *----------------------------------------------------------------------------*/
 452
 453 static inline bool extractFloat64Sign(float64 a)
 454 {
 455     return float64_val(a) >> 63;
 456 }
 457
 458 /*
 459  * Classify a floating point number. Everything above float_class_qnan
 460  * is a NaN so cls >= float_class_qnan is any NaN.
 461  */
 462
  463 typedef enum __attribute__ ((__packed__)) {
          /* Enumerator order matters: is_nan() tests c >= float_class_qnan. */
  464     float_class_unclassified,  /* as produced by unpack_raw() */
  465     float_class_zero,
  466     float_class_normal,
  467     float_class_inf,
  468     float_class_qnan,  /* all NaNs from here */
  469     float_class_snan,
  470 } FloatClass;
 471
 472 /* Simple helpers for checking if, or what kind of, NaN we have */
 473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 474 {
 475     return unlikely(c >= float_class_qnan);
 476 }
 477
 478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 479 {
 480     return c == float_class_snan;
 481 }
 482
 483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 484 {
 485     return c == float_class_qnan;
 486 }
 487
 488 /*
 489  * Structure holding all of the decomposed parts of a float. The
 490  * exponent is unbiased and the fraction is normalized. All
 491  * calculations are done with a 64 bit fraction and then rounded as
 492  * appropriate for the final format.
 493  *
 494  * Thanks to the packed FloatClass a decent compiler should be able to
 495  * fit the whole structure into registers and avoid using the stack
 496  * for parameter passing.
 497  */
 498
  499 typedef struct {
  500     uint64_t frac;   /* fraction; raw field bits until canonicalized */
  501     int32_t  exp;    /* exponent; raw (biased) field until canonicalized */
  502     FloatClass cls;  /* float_class_unclassified until canonicalized */
  503     bool sign;       /* true when the sign bit is set */
  504 } FloatParts;
 505
 506 #define DECOMPOSED_BINARY_POINT    63
 507 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 508
 509 /* Structure holding all of the relevant parameters for a format.
 510  *   exp_size: the size of the exponent field
 511  *   exp_bias: the offset applied to the exponent field
 512  *   exp_max: the maximum normalised exponent
 513  *   frac_size: the size of the fraction field
 514  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
  515  * The following are computed based on the size of the fraction
 516  *   frac_lsb: least significant bit of fraction
 517  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 518  *   round_mask/roundeven_mask: masks used for rounding
 519  * The following optional modifiers are available:
 520  *   arm_althp: handle ARM Alternative Half Precision
 521  */
  522 typedef struct {
  523     int exp_size;             /* width of the exponent field, in bits */
  524     int exp_bias;             /* ((1 << exp_size) - 1) >> 1 */
  525     int exp_max;              /* (1 << exp_size) - 1: all-ones exponent field */
  526     int frac_size;            /* width of the fraction field, in bits */
  527     int frac_shift;           /* DECOMPOSED_BINARY_POINT - frac_size */
  528     uint64_t frac_lsb;        /* 1 << frac_shift */
  529     uint64_t frac_lsbm1;      /* 1 << (frac_shift - 1): the rounding bit */
  530     uint64_t round_mask;      /* frac_lsb - 1: all bits discarded by rounding */
  531     uint64_t roundeven_mask;  /* round_mask plus the frac_lsb bit */
  532     bool arm_althp;           /* ARM Alternative Half Precision: no Inf/NaN */
  533 } FloatFmt;
 534
 535 /* Expand fields based on the size of exponent and fraction */
 536 #define FLOAT_PARAMS(E, F)                                           \
 537     .exp_size       = E,                                             \
 538     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 539     .exp_max        = (1 << E) - 1,                                  \
 540     .frac_size      = F,                                             \
 541     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 542     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 543     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 544     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 545     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 546
  547 static const FloatFmt float16_params = {
  548     FLOAT_PARAMS(5, 10)        /* 5 exponent bits, 10 fraction bits */
  549 };
  550
  551 static const FloatFmt float16_params_ahp = {
  552     FLOAT_PARAMS(5, 10),       /* same field layout as float16_params */
  553     .arm_althp = true          /* ARM Alternative Half Precision */
  554 };
  555
  556 static const FloatFmt bfloat16_params = {
  557     FLOAT_PARAMS(8, 7)         /* 8 exponent bits, 7 fraction bits */
  558 };
  559
  560 static const FloatFmt float32_params = {
  561     FLOAT_PARAMS(8, 23)        /* 8 exponent bits, 23 fraction bits */
  562 };
  563
  564 static const FloatFmt float64_params = {
  565     FLOAT_PARAMS(11, 52)       /* 11 exponent bits, 52 fraction bits */
  566 };
 567
 568 /* Unpack a float to parts, but do not canonicalize.  */
 569 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 570 {
 571     const int sign_pos = fmt.frac_size + fmt.exp_size;
 572
 573     return (FloatParts) {
 574         .cls = float_class_unclassified,
 575         .sign = extract64(raw, sign_pos, 1),
 576         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 577         .frac = extract64(raw, 0, fmt.frac_size),
 578     };
 579 }
 580
 581 static inline FloatParts float16_unpack_raw(float16 f)
 582 {
 583     return unpack_raw(float16_params, f);
 584 }
 585
 586 static inline FloatParts bfloat16_unpack_raw(bfloat16 f)
 587 {
 588     return unpack_raw(bfloat16_params, f);
 589 }
 590
 591 static inline FloatParts float32_unpack_raw(float32 f)
 592 {
 593     return unpack_raw(float32_params, f);
 594 }
 595
 596 static inline FloatParts float64_unpack_raw(float64 f)
 597 {
 598     return unpack_raw(float64_params, f);
 599 }
 600
 601 /* Pack a float from parts, but do not canonicalize.  */
 602 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 603 {
 604     const int sign_pos = fmt.frac_size + fmt.exp_size;
 605     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 606     return deposit64(ret, sign_pos, 1, p.sign);
 607 }
 608
 609 static inline float16 float16_pack_raw(FloatParts p)
 610 {
 611     return make_float16(pack_raw(float16_params, p));
 612 }
 613
 614 static inline bfloat16 bfloat16_pack_raw(FloatParts p)
 615 {
 616     return pack_raw(bfloat16_params, p);
 617 }
 618
 619 static inline float32 float32_pack_raw(FloatParts p)
 620 {
 621     return make_float32(pack_raw(float32_params, p));
 622 }
 623
 624 static inline float64 float64_pack_raw(FloatParts p)
 625 {
 626     return make_float64(pack_raw(float64_params, p));
 627 }
 628
 629 /*----------------------------------------------------------------------------
 630 | Functions and definitions to determine:  (1) whether tininess for underflow
 631 | is detected before or after rounding by default, (2) what (if anything)
 632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 634 | are propagated from function inputs to output.  These details are target-
 635 | specific.
 636 *----------------------------------------------------------------------------*/
 637 #include "softfloat-specialize.c.inc"
 638
 639 /* Canonicalize EXP and FRAC, setting CLS.  */
 640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 641                                   float_status *status)
 642 {
 643     if (part.exp == parm->exp_max && !parm->arm_althp) {
 644         if (part.frac == 0) {
 645             part.cls = float_class_inf;
 646         } else {
 647             part.frac <<= parm->frac_shift;
 648             part.cls = (parts_is_snan_frac(part.frac, status)
 649                         ? float_class_snan : float_class_qnan);
 650         }
 651     } else if (part.exp == 0) {
 652         if (likely(part.frac == 0)) {
 653             part.cls = float_class_zero;
 654         } else if (status->flush_inputs_to_zero) {
 655             float_raise(float_flag_input_denormal, status);
 656             part.cls = float_class_zero;
 657             part.frac = 0;
 658         } else {
 659             int shift = clz64(part.frac);
 660             part.cls = float_class_normal;
 661             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 662             part.frac <<= shift;
 663         }
 664     } else {
 665         part.cls = float_class_normal;
 666         part.exp -= parm->exp_bias;
 667         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 668     }
 669     return part;
 670 }
 671
  672 /* Round and uncanonicalize a floating-point number by parts. There
  673  * are FRAC_SHIFT bits that may require rounding at the bottom of the
  674  * fraction; these bits will be removed. The exponent will be biased
  675  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
       *
       *   p:    canonical parts to round; p.cls selects the handling
       *   s:    float status; supplies the rounding mode, flush_to_zero and
       *         tininess_before_rounding, and receives the exception flags
       *   parm: target format
       *
       * Returns p with exp and frac rewritten as raw field values.
       * p.cls is updated when the result becomes zero or infinity.
  676  */
  677
  678 static FloatParts round_canonical(FloatParts p, float_status *s,
  679                                   const FloatFmt *parm)
  680 {
  681     const uint64_t frac_lsb = parm->frac_lsb;
  682     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
  683     const uint64_t round_mask = parm->round_mask;
  684     const uint64_t roundeven_mask = parm->roundeven_mask;
  685     const int exp_max = parm->exp_max;
  686     const int frac_shift = parm->frac_shift;
  687     uint64_t frac, inc;
  688     int exp, flags = 0;
  689     bool overflow_norm;
  690
  691     frac = p.frac;
  692     exp = p.exp;
  693
  694     switch (p.cls) {
  695     case float_class_normal:
              /*
               * Choose the rounding increment (inc) and the overflow policy.
               * overflow_norm true: overflow gives the largest finite value;
               * false: overflow gives infinity.
               */
  696         switch (s->float_rounding_mode) {
  697         case float_round_nearest_even:
  698             overflow_norm = false;
                  /* An exact tie on an even lsb is the only case not bumped. */
  699             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
  700             break;
  701         case float_round_ties_away:
  702             overflow_norm = false;
  703             inc = frac_lsbm1;
  704             break;
  705         case float_round_to_zero:
  706             overflow_norm = true;
  707             inc = 0;
  708             break;
  709         case float_round_up:
  710             inc = p.sign ? 0 : round_mask;
  711             overflow_norm = p.sign;
  712             break;
  713         case float_round_down:
  714             inc = p.sign ? round_mask : 0;
  715             overflow_norm = !p.sign;
  716             break;
  717         case float_round_to_odd:
  718             overflow_norm = true;
  719             inc = frac & frac_lsb ? 0 : round_mask;
  720             break;
  721         default:
  722             g_assert_not_reached();
  723         }
  724
  725         exp += parm->exp_bias;    /* from here on exp is the biased field */
  726         if (likely(exp > 0)) {
                  /* Normal range (possibly overflowing). */
  727             if (frac & round_mask) {
  728                 flags |= float_flag_inexact;
                      /* Carry out of the 64-bit sum: renormalize, bump exp. */
  729                 if (uadd64_overflow(frac, inc, &frac)) {
  730                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
  731                     exp++;
  732                 }
  733             }
  734             frac >>= frac_shift;
  735
  736             if (parm->arm_althp) {
  737                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
  738                 if (unlikely(exp > exp_max)) {
  739                     /* Overflow.  Return the maximum normal.  */
                          /* Plain assignment: drops any inexact flag set above. */
  740                     flags = float_flag_invalid;
  741                     exp = exp_max;
  742                     frac = -1;
  743                 }
  744             } else if (unlikely(exp >= exp_max)) {
  745                 flags |= float_flag_overflow | float_flag_inexact;
  746                 if (overflow_norm) {
                          /* Largest finite: exponent exp_max - 1, all-ones frac. */
  747                     exp = exp_max - 1;
  748                     frac = -1;
  749                 } else {
  750                     p.cls = float_class_inf;
  751                     goto do_inf;
  752                 }
  753             }
  754         } else if (s->flush_to_zero) {
                  /* Biased exponent <= 0 and flush_to_zero set: return zero. */
  755             flags |= float_flag_output_denormal;
  756             p.cls = float_class_zero;
  757             goto do_zero;
  758         } else {
                  /*
                   * Denormal result.  It is tiny if tininess is detected before
                   * rounding or exp < 0.  Otherwise (exp == 0, detection after
                   * rounding) it is tiny unless adding inc carries out of frac.
                   */
  759             bool is_tiny = s->tininess_before_rounding || (exp < 0);
  760
  761             if (!is_tiny) {
  762                 uint64_t discard;
  763                 is_tiny = !uadd64_overflow(frac, inc, &discard);
  764             }
  765
                  /*
                   * Shift right by 1 - exp.  NOTE(review): the "jamming" variant
                   * presumably folds the shifted-out bits into a sticky bit;
                   * confirm against softfloat-macros.h.
                   */
  766             shift64RightJamming(frac, 1 - exp, &frac);
  767             if (frac & round_mask) {
  768                 /* Need to recompute round-to-even.  */
                      /* Other modes keep the inc chosen before the shift. */
  769                 switch (s->float_rounding_mode) {
  770                 case float_round_nearest_even:
  771                     inc = ((frac & roundeven_mask) != frac_lsbm1
  772                            ? frac_lsbm1 : 0);
  773                     break;
  774                 case float_round_to_odd:
  775                     inc = frac & frac_lsb ? 0 : round_mask;
  776                     break;
  777                 default:
  778                     break;
  779                 }
  780                 flags |= float_flag_inexact;
  781                 frac += inc;
  782             }
  783
                  /* Implicit bit set after rounding: exponent field 1, else 0. */
  784             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
  785             frac >>= frac_shift;
  786
                  /* Underflow is raised only when the result is tiny and inexact. */
  787             if (is_tiny && (flags & float_flag_inexact)) {
  788                 flags |= float_flag_underflow;
  789             }
  790             if (exp == 0 && frac == 0) {
  791                 p.cls = float_class_zero;
  792             }
  793         }
  794         break;
  795
  796     case float_class_zero:
  797     do_zero:    /* also reached from the flush_to_zero path above */
  798         exp = 0;
  799         frac = 0;
  800         break;
  801
  802     case float_class_inf:
  803     do_inf:     /* also reached from the overflow path above */
  804         assert(!parm->arm_althp);
  805         exp = exp_max;
  806         frac = 0;
  807         break;
  808
  809     case float_class_qnan:
  810     case float_class_snan:
  811         assert(!parm->arm_althp);
              /* All-ones exponent; fraction moved back down, no rounding. */
  812         exp = exp_max;
  813         frac >>= parm->frac_shift;
  814         break;
  815
  816     default:
  817         g_assert_not_reached();
  818     }
  819
          /* Hand the flags collected above to float_raise(). */
  820     float_raise(flags, s);
  821     p.exp = exp;
  822     p.frac = frac;
  823     return p;
  824 }
 825
 826 /* Explicit FloatFmt version */
 827 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 828                                             const FloatFmt *params)
 829 {
 830     return sf_canonicalize(float16_unpack_raw(f), params, s);
 831 }
 832
 833 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 834 {
 835     return float16a_unpack_canonical(f, s, &float16_params);
 836 }
 837
 838 static FloatParts bfloat16_unpack_canonical(bfloat16 f, float_status *s)
 839 {
 840     return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
 841 }
 842
 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 844                                              const FloatFmt *params)
 845 {
 846     return float16_pack_raw(round_canonical(p, s, params));
 847 }
 848
 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 850 {
 851     return float16a_round_pack_canonical(p, s, &float16_params);
 852 }
 853
 854 static bfloat16 bfloat16_round_pack_canonical(FloatParts p, float_status *s)
 855 {
 856     return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
 857 }
 858
 859 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 860 {
 861     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 862 }
 863
 864 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 865 {
 866     return float32_pack_raw(round_canonical(p, s, &float32_params));
 867 }
 868
 869 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 870 {
 871     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 872 }
 873
 874 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 875 {
 876     return float64_pack_raw(round_canonical(p, s, &float64_params));
 877 }
 878
 879 static FloatParts return_nan(FloatParts a, float_status *s)
 880 {
 881     switch (a.cls) {
 882     case float_class_snan:
 883         s->float_exception_flags |= float_flag_invalid;
 884         a = parts_silence_nan(a, s);
 885         /* fall through */
 886     case float_class_qnan:
 887         if (s->default_nan_mode) {
 888             return parts_default_nan(s);
 889         }
 890         break;
 891
 892     default:
 893         g_assert_not_reached();
 894     }
 895     return a;
 896 }
 897
 898 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 899 {
 900     if (is_snan(a.cls) || is_snan(b.cls)) {
 901         s->float_exception_flags |= float_flag_invalid;
 902     }
 903
 904     if (s->default_nan_mode) {
 905         return parts_default_nan(s);
 906     } else {
 907         if (pickNaN(a.cls, b.cls,
 908                     a.frac > b.frac ||
 909                     (a.frac == b.frac && a.sign < b.sign), s)) {
 910             a = b;
 911         }
 912         if (is_snan(a.cls)) {
 913             return parts_silence_nan(a, s);
 914         }
 915     }
 916     return a;
 917 }
 918
 919 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 920                                   bool inf_zero, float_status *s)
 921 {
 922     int which;
 923
 924     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 925         s->float_exception_flags |= float_flag_invalid;
 926     }
 927
 928     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 929
 930     if (s->default_nan_mode) {
 931         /* Note that this check is after pickNaNMulAdd so that function
 932          * has an opportunity to set the Invalid flag.
 933          */
 934         which = 3;
 935     }
 936
 937     switch (which) {
 938     case 0:
 939         break;
 940     case 1:
 941         a = b;
 942         break;
 943     case 2:
 944         a = c;
 945         break;
 946     case 3:
 947         return parts_default_nan(s);
 948     default:
 949         g_assert_not_reached();
 950     }
 951
 952     if (is_snan(a.cls)) {
 953         return parts_silence_nan(a, s);
 954     }
 955     return a;
 956 }
 957
/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * `subtract' selects a - b instead of a + b.  The result is returned in
 * decomposed form and is not rounded here.  Exception flags (invalid for
 * Inf - Inf, and whatever pick_nan() raises) are recorded in `s'.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Effective sign of b once a requested subtraction is folded in. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Always subtract the smaller magnitude from the larger one,
             * after aligning the smaller operand to the larger exponent;
             * the result sign flips when b is the larger.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the top bit of the fraction is set again. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* Zeros of opposite effective sign: sign depends on rounding mode. */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign, i.e. the opposite of a's. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent to the larger one. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /*
                 * The sum carried out of 64 bits: shift it down by one,
                 * put the carried-out bit back on top and bump the exponent.
                 */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination is handled above. */
    g_assert_not_reached();
}
1047
1048 /*
1049  * Returns the result of adding or subtracting the floating-point
1050  * values `a' and `b'. The operation is performed according to the
1051  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1052  */
1053
1054 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1055 {
1056     FloatParts pa = float16_unpack_canonical(a, status);
1057     FloatParts pb = float16_unpack_canonical(b, status);
1058     FloatParts pr = addsub_floats(pa, pb, false, status);
1059
1060     return float16_round_pack_canonical(pr, status);
1061 }
1062
1063 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1064 {
1065     FloatParts pa = float16_unpack_canonical(a, status);
1066     FloatParts pb = float16_unpack_canonical(b, status);
1067     FloatParts pr = addsub_floats(pa, pb, true, status);
1068
1069     return float16_round_pack_canonical(pr, status);
1070 }
1071
1072 static float32 QEMU_SOFTFLOAT_ATTR
1073 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1074 {
1075     FloatParts pa = float32_unpack_canonical(a, status);
1076     FloatParts pb = float32_unpack_canonical(b, status);
1077     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1078
1079     return float32_round_pack_canonical(pr, status);
1080 }
1081
1082 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1083 {
1084     return soft_f32_addsub(a, b, false, status);
1085 }
1086
1087 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1088 {
1089     return soft_f32_addsub(a, b, true, status);
1090 }
1091
1092 static float64 QEMU_SOFTFLOAT_ATTR
1093 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1094 {
1095     FloatParts pa = float64_unpack_canonical(a, status);
1096     FloatParts pb = float64_unpack_canonical(b, status);
1097     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1098
1099     return float64_round_pack_canonical(pr, status);
1100 }
1101
1102 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1103 {
1104     return soft_f64_addsub(a, b, false, status);
1105 }
1106
1107 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1108 {
1109     return soft_f64_addsub(a, b, true, status);
1110 }
1111
/* Host single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1116
/* Host single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1121
/* Host double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1126
/* Host double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1131
1132 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1133 {
1134     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1135         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1136     }
1137     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1138 }
1139
1140 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1141 {
1142     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1143         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1144     } else {
1145         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1146     }
1147 }
1148
1149 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1150                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1151 {
1152     return float32_gen2(a, b, s, hard, soft,
1153                         f32_is_zon2, f32_addsubmul_post);
1154 }
1155
1156 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1157                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1158 {
1159     return float64_gen2(a, b, s, hard, soft,
1160                         f64_is_zon2, f64_addsubmul_post);
1161 }
1162
1163 float32 QEMU_FLATTEN
1164 float32_add(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1167 }
1168
1169 float32 QEMU_FLATTEN
1170 float32_sub(float32 a, float32 b, float_status *s)
1171 {
1172     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1173 }
1174
1175 float64 QEMU_FLATTEN
1176 float64_add(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1179 }
1180
1181 float64 QEMU_FLATTEN
1182 float64_sub(float64 a, float64 b, float_status *s)
1183 {
1184     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1185 }
1186
1187 /*
1188  * Returns the result of adding or subtracting the bfloat16
1189  * values `a' and `b'.
1190  */
1191 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1192 {
1193     FloatParts pa = bfloat16_unpack_canonical(a, status);
1194     FloatParts pb = bfloat16_unpack_canonical(b, status);
1195     FloatParts pr = addsub_floats(pa, pb, false, status);
1196
1197     return bfloat16_round_pack_canonical(pr, status);
1198 }
1199
1200 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1201 {
1202     FloatParts pa = bfloat16_unpack_canonical(a, status);
1203     FloatParts pb = bfloat16_unpack_canonical(b, status);
1204     FloatParts pr = addsub_floats(pa, pb, true, status);
1205
1206     return bfloat16_round_pack_canonical(pr, status);
1207 }
1208
1209 /*
1210  * Returns the result of multiplying the floating-point values `a' and
1211  * `b'. The operation is performed according to the IEC/IEEE Standard
1212  * for Binary Floating-Point Arithmetic.
1213  */
1214
1215 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1216 {
1217     bool sign = a.sign ^ b.sign;
1218
1219     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1220         uint64_t hi, lo;
1221         int exp = a.exp + b.exp;
1222
1223         mul64To128(a.frac, b.frac, &hi, &lo);
1224         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1225             exp += 1;
1226         } else {
1227             hi <<= 1;
1228         }
1229         hi |= (lo != 0);
1230
1231         /* Re-use a */
1232         a.exp = exp;
1233         a.sign = sign;
1234         a.frac = hi;
1235         return a;
1236     }
1237     /* handle all the NaN cases */
1238     if (is_nan(a.cls) || is_nan(b.cls)) {
1239         return pick_nan(a, b, s);
1240     }
1241     /* Inf * Zero == NaN */
1242     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1243         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1244         s->float_exception_flags |= float_flag_invalid;
1245         return parts_default_nan(s);
1246     }
1247     /* Multiply by 0 or Inf */
1248     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1249         a.sign = sign;
1250         return a;
1251     }
1252     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1253         b.sign = sign;
1254         return b;
1255     }
1256     g_assert_not_reached();
1257 }
1258
1259 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1260 {
1261     FloatParts pa = float16_unpack_canonical(a, status);
1262     FloatParts pb = float16_unpack_canonical(b, status);
1263     FloatParts pr = mul_floats(pa, pb, status);
1264
1265     return float16_round_pack_canonical(pr, status);
1266 }
1267
1268 static float32 QEMU_SOFTFLOAT_ATTR
1269 soft_f32_mul(float32 a, float32 b, float_status *status)
1270 {
1271     FloatParts pa = float32_unpack_canonical(a, status);
1272     FloatParts pb = float32_unpack_canonical(b, status);
1273     FloatParts pr = mul_floats(pa, pb, status);
1274
1275     return float32_round_pack_canonical(pr, status);
1276 }
1277
1278 static float64 QEMU_SOFTFLOAT_ATTR
1279 soft_f64_mul(float64 a, float64 b, float_status *status)
1280 {
1281     FloatParts pa = float64_unpack_canonical(a, status);
1282     FloatParts pb = float64_unpack_canonical(b, status);
1283     FloatParts pr = mul_floats(pa, pb, status);
1284
1285     return float64_round_pack_canonical(pr, status);
1286 }
1287
/* Host single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float prod = a * b;

    return prod;
}
1292
/* Host double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double prod = a * b;

    return prod;
}
1297
1298 float32 QEMU_FLATTEN
1299 float32_mul(float32 a, float32 b, float_status *s)
1300 {
1301     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1302                         f32_is_zon2, f32_addsubmul_post);
1303 }
1304
1305 float64 QEMU_FLATTEN
1306 float64_mul(float64 a, float64 b, float_status *s)
1307 {
1308     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1309                         f64_is_zon2, f64_addsubmul_post);
1310 }
1311
1312 /*
1313  * Returns the result of multiplying the bfloat16
1314  * values `a' and `b'.
1315  */
1316
1317 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1318 {
1319     FloatParts pa = bfloat16_unpack_canonical(a, status);
1320     FloatParts pb = bfloat16_unpack_canonical(b, status);
1321     FloatParts pr = mul_floats(pa, pb, status);
1322
1323     return bfloat16_round_pack_canonical(pr, status);
1324 }
1325
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * float_muladd_halve_result in `flags' additionally lowers the result
 * exponent by one.  The value is returned in decomposed form and is not
 * rounded here; invalid is raised in `s' for Inf * 0 and Inf - Inf.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one multiplicand is Inf and the other is Zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b; Inf * 0 was rejected above. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) is invalid. */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* The product is zero, so the result is (a possibly adjusted) c. */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Zeros of opposite sign: sign depends on rounding mode. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full 128-bit product of the two fractions, held in hi:lo. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /*
                 * c has the larger (or equal) exponent: align the high
                 * word of the product to it.  `lo' is left untouched and
                 * is folded in as a sticky bit at the end.
                 */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* The product has the larger exponent: align c as 128 bits. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Same exponent and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* c is the larger magnitude: c - product, sign flips. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero is negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /*
                 * Shift the 128-bit difference left until its most
                 * significant set bit is the top bit of `hi', and lower
                 * the exponent by the same count.
                 * NOTE(review): the earlier wording here spoke of a
                 * binary point of 124 and adjusting the exponent "as if
                 * leaving 3 bits"; that does not match the code below,
                 * where the shift and the exponent adjustment are equal.
                 * It looks like a leftover from an older fraction layout
                 * -- confirm.
                 */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold any non-zero low word into the lsb of hi as a sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1508
1509 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1510                                                 int flags, float_status *status)
1511 {
1512     FloatParts pa = float16_unpack_canonical(a, status);
1513     FloatParts pb = float16_unpack_canonical(b, status);
1514     FloatParts pc = float16_unpack_canonical(c, status);
1515     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1516
1517     return float16_round_pack_canonical(pr, status);
1518 }
1519
1520 static float32 QEMU_SOFTFLOAT_ATTR
1521 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1522                 float_status *status)
1523 {
1524     FloatParts pa = float32_unpack_canonical(a, status);
1525     FloatParts pb = float32_unpack_canonical(b, status);
1526     FloatParts pc = float32_unpack_canonical(c, status);
1527     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1528
1529     return float32_round_pack_canonical(pr, status);
1530 }
1531
1532 static float64 QEMU_SOFTFLOAT_ATTR
1533 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1534                 float_status *status)
1535 {
1536     FloatParts pa = float64_unpack_canonical(a, status);
1537     FloatParts pb = float64_unpack_canonical(b, status);
1538     FloatParts pc = float64_unpack_canonical(c, status);
1539     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1540
1541     return float64_round_pack_canonical(pr, status);
1542 }
1543
/*
 * When true, float32_muladd() and float64_muladd() skip the host fma
 * and take their softfloat path.  Nothing in this part of the file
 * assigns it -- NOTE(review): presumably set once at start-up; confirm.
 */
static bool force_soft_fma;
1545
/*
 * Single-precision fused multiply-add, (xa * xb) + xc, with the negation
 * and halving options selected by `flags'.
 *
 * The host fmaf() is used when can_use_fpu() allows it, halving is not
 * requested, f32_is_zon3() accepts all three (input-flushed) operands and
 * force_soft_fma is clear.  Otherwise, or when the host result lands at
 * or below FLT_MIN in magnitude, the work is done by soft_f32_muladd().
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The host path below has no handling for halving the result. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed zero product by hand, then add the addend. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Keep the un-negated operands in case we must retry in softfloat. */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            s->float_exception_flags |= float_flag_overflow;
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /*
             * Result at or below the smallest normal: redo it in
             * softfloat.  The soft path applies `flags' itself, so the
             * operands are restored to their pre-negation values first.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1616
1617 float64 QEMU_FLATTEN
1618 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1619 {
1620     union_float64 ua, ub, uc, ur;
1621
1622     ua.s = xa;
1623     ub.s = xb;
1624     uc.s = xc;
1625
1626     if (unlikely(!can_use_fpu(s))) {
1627         goto soft;
1628     }
1629     if (unlikely(flags & float_muladd_halve_result)) {
1630         goto soft;
1631     }
1632
1633     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1634     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1635         goto soft;
1636     }
1637
1638     if (unlikely(force_soft_fma)) {
1639         goto soft;
1640     }
1641
1642     /*
1643      * When (a || b) == 0, there's no need to check for under/over flow,
1644      * since we know the addend is (normal || 0) and the product is 0.
1645      */
1646     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1647         union_float64 up;
1648         bool prod_sign;
1649
1650         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1651         prod_sign ^= !!(flags & float_muladd_negate_product);
1652         up.s = float64_set_sign(float64_zero, prod_sign);
1653
1654         if (flags & float_muladd_negate_c) {
1655             uc.h = -uc.h;
1656         }
1657         ur.h = up.h + uc.h;
1658     } else {
1659         union_float64 ua_orig = ua;
1660         union_float64 uc_orig = uc;
1661
1662         if (flags & float_muladd_negate_product) {
1663             ua.h = -ua.h;
1664         }
1665         if (flags & float_muladd_negate_c) {
1666             uc.h = -uc.h;
1667         }
1668
1669         ur.h = fma(ua.h, ub.h, uc.h);
1670
1671         if (unlikely(f64_is_inf(ur))) {
1672             s->float_exception_flags |= float_flag_overflow;
1673         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1674             ua = ua_orig;
1675             uc = uc_orig;
1676             goto soft;
1677         }
1678     }
1679     if (flags & float_muladd_negate_result) {
1680         return float64_chs(ur.s);
1681     }
1682     return ur.s;
1683
1684  soft:
1685     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1686 }
1687
1688 /*
1689  * Returns the result of multiplying the bfloat16 values `a'
1690  * and `b' then adding 'c', with no intermediate rounding step after the
1691  * multiplication.
1692  */
1693
1694 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1695                                       int flags, float_status *status)
1696 {
1697     FloatParts pa = bfloat16_unpack_canonical(a, status);
1698     FloatParts pb = bfloat16_unpack_canonical(b, status);
1699     FloatParts pc = bfloat16_unpack_canonical(c, status);
1700     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1701
1702     return bfloat16_round_pack_canonical(pr, status);
1703 }
1704
1705 /*
1706  * Returns the result of dividing the floating-point value `a' by the
1707  * corresponding value `b'. The operation is performed according to
1708  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1709  */
1710
1711 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1712 {
1713     bool sign = a.sign ^ b.sign;
1714
1715     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1716         uint64_t n0, n1, q, r;
1717         int exp = a.exp - b.exp;
1718
1719         /*
1720          * We want a 2*N / N-bit division to produce exactly an N-bit
1721          * result, so that we do not lose any precision and so that we
1722          * do not have to renormalize afterward.  If A.frac < B.frac,
1723          * then division would produce an (N-1)-bit result; shift A left
1724          * by one to produce the an N-bit result, and decrement the
1725          * exponent to match.
1726          *
1727          * The udiv_qrnnd algorithm that we're using requires normalization,
1728          * i.e. the msb of the denominator must be set, which is already true.
1729          */
1730         if (a.frac < b.frac) {
1731             exp -= 1;
1732             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1733         } else {
1734             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1735         }
1736         q = udiv_qrnnd(&r, n1, n0, b.frac);
1737
1738         /* Set lsb if there is a remainder, to set inexact. */
1739         a.frac = q | (r != 0);
1740         a.sign = sign;
1741         a.exp = exp;
1742         return a;
1743     }
1744     /* handle all the NaN cases */
1745     if (is_nan(a.cls) || is_nan(b.cls)) {
1746         return pick_nan(a, b, s);
1747     }
1748     /* 0/0 or Inf/Inf */
1749     if (a.cls == b.cls
1750         &&
1751         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1752         s->float_exception_flags |= float_flag_invalid;
1753         return parts_default_nan(s);
1754     }
1755     /* Inf / x or 0 / x */
1756     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1757         a.sign = sign;
1758         return a;
1759     }
1760     /* Div 0 => Inf */
1761     if (b.cls == float_class_zero) {
1762         s->float_exception_flags |= float_flag_divbyzero;
1763         a.cls = float_class_inf;
1764         a.sign = sign;
1765         return a;
1766     }
1767     /* Div by Inf */
1768     if (b.cls == float_class_inf) {
1769         a.cls = float_class_zero;
1770         a.sign = sign;
1771         return a;
1772     }
1773     g_assert_not_reached();
1774 }
1775
1776 float16 float16_div(float16 a, float16 b, float_status *status)
1777 {
1778     FloatParts pa = float16_unpack_canonical(a, status);
1779     FloatParts pb = float16_unpack_canonical(b, status);
1780     FloatParts pr = div_floats(pa, pb, status);
1781
1782     return float16_round_pack_canonical(pr, status);
1783 }
1784
1785 static float32 QEMU_SOFTFLOAT_ATTR
1786 soft_f32_div(float32 a, float32 b, float_status *status)
1787 {
1788     FloatParts pa = float32_unpack_canonical(a, status);
1789     FloatParts pb = float32_unpack_canonical(b, status);
1790     FloatParts pr = div_floats(pa, pb, status);
1791
1792     return float32_round_pack_canonical(pr, status);
1793 }
1794
1795 static float64 QEMU_SOFTFLOAT_ATTR
1796 soft_f64_div(float64 a, float64 b, float_status *status)
1797 {
1798     FloatParts pa = float64_unpack_canonical(a, status);
1799     FloatParts pb = float64_unpack_canonical(b, status);
1800     FloatParts pr = div_floats(pa, pb, status);
1801
1802     return float64_round_pack_canonical(pr, status);
1803 }
1804
/* Single-precision division performed with the host's `/' operator. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1809
/* Double-precision division performed with the host's `/' operator. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1814
1815 static bool f32_div_pre(union_float32 a, union_float32 b)
1816 {
1817     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1818         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1819                fpclassify(b.h) == FP_NORMAL;
1820     }
1821     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1822 }
1823
1824 static bool f64_div_pre(union_float64 a, union_float64 b)
1825 {
1826     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1827         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1828                fpclassify(b.h) == FP_NORMAL;
1829     }
1830     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1831 }
1832
1833 static bool f32_div_post(union_float32 a, union_float32 b)
1834 {
1835     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1836         return fpclassify(a.h) != FP_ZERO;
1837     }
1838     return !float32_is_zero(a.s);
1839 }
1840
1841 static bool f64_div_post(union_float64 a, union_float64 b)
1842 {
1843     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1844         return fpclassify(a.h) != FP_ZERO;
1845     }
1846     return !float64_is_zero(a.s);
1847 }
1848
1849 float32 QEMU_FLATTEN
1850 float32_div(float32 a, float32 b, float_status *s)
1851 {
1852     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1853                         f32_div_pre, f32_div_post);
1854 }
1855
1856 float64 QEMU_FLATTEN
1857 float64_div(float64 a, float64 b, float_status *s)
1858 {
1859     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1860                         f64_div_pre, f64_div_post);
1861 }
1862
1863 /*
1864  * Returns the result of dividing the bfloat16
1865  * value `a' by the corresponding value `b'.
1866  */
1867
1868 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1869 {
1870     FloatParts pa = bfloat16_unpack_canonical(a, status);
1871     FloatParts pb = bfloat16_unpack_canonical(b, status);
1872     FloatParts pr = div_floats(pa, pb, status);
1873
1874     return bfloat16_round_pack_canonical(pr, status);
1875 }
1876
1877 /*
1878  * Float to Float conversions
1879  *
1880  * Returns the result of converting one float format to another. The
1881  * conversion is performed according to the IEC/IEEE Standard for
1882  * Binary Floating-Point Arithmetic.
1883  *
1884  * The float_to_float helper only needs to take care of raising
1885  * invalid exceptions and handling the conversion on NaNs.
1886  */
1887
1888 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1889                                  float_status *s)
1890 {
1891     if (dstf->arm_althp) {
1892         switch (a.cls) {
1893         case float_class_qnan:
1894         case float_class_snan:
1895             /* There is no NaN in the destination format.  Raise Invalid
1896              * and return a zero with the sign of the input NaN.
1897              */
1898             s->float_exception_flags |= float_flag_invalid;
1899             a.cls = float_class_zero;
1900             a.frac = 0;
1901             a.exp = 0;
1902             break;
1903
1904         case float_class_inf:
1905             /* There is no Inf in the destination format.  Raise Invalid
1906              * and return the maximum normal with the correct sign.
1907              */
1908             s->float_exception_flags |= float_flag_invalid;
1909             a.cls = float_class_normal;
1910             a.exp = dstf->exp_max;
1911             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1912             break;
1913
1914         default:
1915             break;
1916         }
1917     } else if (is_nan(a.cls)) {
1918         if (is_snan(a.cls)) {
1919             s->float_exception_flags |= float_flag_invalid;
1920             a = parts_silence_nan(a, s);
1921         }
1922         if (s->default_nan_mode) {
1923             return parts_default_nan(s);
1924         }
1925     }
1926     return a;
1927 }
1928
1929 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1930 {
1931     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1932     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1933     FloatParts pr = float_to_float(p, &float32_params, s);
1934     return float32_round_pack_canonical(pr, s);
1935 }
1936
1937 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1938 {
1939     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1940     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1941     FloatParts pr = float_to_float(p, &float64_params, s);
1942     return float64_round_pack_canonical(pr, s);
1943 }
1944
1945 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1946 {
1947     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1948     FloatParts p = float32_unpack_canonical(a, s);
1949     FloatParts pr = float_to_float(p, fmt16, s);
1950     return float16a_round_pack_canonical(pr, s, fmt16);
1951 }
1952
1953 static float64 QEMU_SOFTFLOAT_ATTR
1954 soft_float32_to_float64(float32 a, float_status *s)
1955 {
1956     FloatParts p = float32_unpack_canonical(a, s);
1957     FloatParts pr = float_to_float(p, &float64_params, s);
1958     return float64_round_pack_canonical(pr, s);
1959 }
1960
1961 float64 float32_to_float64(float32 a, float_status *s)
1962 {
1963     if (likely(float32_is_normal(a))) {
1964         /* Widening conversion can never produce inexact results.  */
1965         union_float32 uf;
1966         union_float64 ud;
1967         uf.s = a;
1968         ud.h = uf.h;
1969         return ud.s;
1970     } else if (float32_is_zero(a)) {
1971         return float64_set_sign(float64_zero, float32_is_neg(a));
1972     } else {
1973         return soft_float32_to_float64(a, s);
1974     }
1975 }
1976
1977 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1978 {
1979     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1980     FloatParts p = float64_unpack_canonical(a, s);
1981     FloatParts pr = float_to_float(p, fmt16, s);
1982     return float16a_round_pack_canonical(pr, s, fmt16);
1983 }
1984
1985 float32 float64_to_float32(float64 a, float_status *s)
1986 {
1987     FloatParts p = float64_unpack_canonical(a, s);
1988     FloatParts pr = float_to_float(p, &float32_params, s);
1989     return float32_round_pack_canonical(pr, s);
1990 }
1991
1992 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
1993 {
1994     FloatParts p = bfloat16_unpack_canonical(a, s);
1995     FloatParts pr = float_to_float(p, &float32_params, s);
1996     return float32_round_pack_canonical(pr, s);
1997 }
1998
1999 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2000 {
2001     FloatParts p = bfloat16_unpack_canonical(a, s);
2002     FloatParts pr = float_to_float(p, &float64_params, s);
2003     return float64_round_pack_canonical(pr, s);
2004 }
2005
2006 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2007 {
2008     FloatParts p = float32_unpack_canonical(a, s);
2009     FloatParts pr = float_to_float(p, &bfloat16_params, s);
2010     return bfloat16_round_pack_canonical(pr, s);
2011 }
2012
2013 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2014 {
2015     FloatParts p = float64_unpack_canonical(a, s);
2016     FloatParts pr = float_to_float(p, &bfloat16_params, s);
2017     return bfloat16_round_pack_canonical(pr, s);
2018 }
2019
2020 /*
2021  * Rounds the floating-point value `a' to an integer, and returns the
2022  * result as a floating-point value. The operation is performed
2023  * according to the IEC/IEEE Standard for Binary Floating-Point
2024  * Arithmetic.
2025  */
2026
2027 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
2028                                int scale, float_status *s)
2029 {
2030     switch (a.cls) {
2031     case float_class_qnan:
2032     case float_class_snan:
2033         return return_nan(a, s);
2034
2035     case float_class_zero:
2036     case float_class_inf:
2037         /* already "integral" */
2038         break;
2039
2040     case float_class_normal:
2041         scale = MIN(MAX(scale, -0x10000), 0x10000);
2042         a.exp += scale;
2043
2044         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2045             /* already integral */
2046             break;
2047         }
2048         if (a.exp < 0) {
2049             bool one;
2050             /* all fractional */
2051             s->float_exception_flags |= float_flag_inexact;
2052             switch (rmode) {
2053             case float_round_nearest_even:
2054                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2055                 break;
2056             case float_round_ties_away:
2057                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2058                 break;
2059             case float_round_to_zero:
2060                 one = false;
2061                 break;
2062             case float_round_up:
2063                 one = !a.sign;
2064                 break;
2065             case float_round_down:
2066                 one = a.sign;
2067                 break;
2068             case float_round_to_odd:
2069                 one = true;
2070                 break;
2071             default:
2072                 g_assert_not_reached();
2073             }
2074
2075             if (one) {
2076                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2077                 a.exp = 0;
2078             } else {
2079                 a.cls = float_class_zero;
2080             }
2081         } else {
2082             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2083             uint64_t frac_lsbm1 = frac_lsb >> 1;
2084             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2085             uint64_t rnd_mask = rnd_even_mask >> 1;
2086             uint64_t inc;
2087
2088             switch (rmode) {
2089             case float_round_nearest_even:
2090                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2091                 break;
2092             case float_round_ties_away:
2093                 inc = frac_lsbm1;
2094                 break;
2095             case float_round_to_zero:
2096                 inc = 0;
2097                 break;
2098             case float_round_up:
2099                 inc = a.sign ? 0 : rnd_mask;
2100                 break;
2101             case float_round_down:
2102                 inc = a.sign ? rnd_mask : 0;
2103                 break;
2104             case float_round_to_odd:
2105                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2106                 break;
2107             default:
2108                 g_assert_not_reached();
2109             }
2110
2111             if (a.frac & rnd_mask) {
2112                 s->float_exception_flags |= float_flag_inexact;
2113                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2114                     a.frac >>= 1;
2115                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2116                     a.exp++;
2117                 }
2118                 a.frac &= ~rnd_mask;
2119             }
2120         }
2121         break;
2122     default:
2123         g_assert_not_reached();
2124     }
2125     return a;
2126 }
2127
2128 float16 float16_round_to_int(float16 a, float_status *s)
2129 {
2130     FloatParts pa = float16_unpack_canonical(a, s);
2131     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2132     return float16_round_pack_canonical(pr, s);
2133 }
2134
2135 float32 float32_round_to_int(float32 a, float_status *s)
2136 {
2137     FloatParts pa = float32_unpack_canonical(a, s);
2138     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2139     return float32_round_pack_canonical(pr, s);
2140 }
2141
2142 float64 float64_round_to_int(float64 a, float_status *s)
2143 {
2144     FloatParts pa = float64_unpack_canonical(a, s);
2145     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2146     return float64_round_pack_canonical(pr, s);
2147 }
2148
2149 /*
2150  * Rounds the bfloat16 value `a' to an integer, and returns the
2151  * result as a bfloat16 value.
2152  */
2153
2154 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2155 {
2156     FloatParts pa = bfloat16_unpack_canonical(a, s);
2157     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2158     return bfloat16_round_pack_canonical(pr, s);
2159 }
2160
2161 /*
2162  * Returns the result of converting the floating-point value `a' to
2163  * the two's complement integer format. The conversion is performed
2164  * according to the IEC/IEEE Standard for Binary Floating-Point
2165  * Arithmetic---which means in particular that the conversion is
2166  * rounded according to the current rounding mode. If `a' is a NaN,
2167  * the largest positive integer is returned. Otherwise, if the
2168  * conversion overflows, the largest integer with the same sign as `a'
2169  * is returned.
2170 */
2171
2172 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2173                                      int scale, int64_t min, int64_t max,
2174                                      float_status *s)
2175 {
2176     uint64_t r;
2177     int orig_flags = get_float_exception_flags(s);
2178     FloatParts p = round_to_int(in, rmode, scale, s);
2179
2180     switch (p.cls) {
2181     case float_class_snan:
2182     case float_class_qnan:
2183         s->float_exception_flags = orig_flags | float_flag_invalid;
2184         return max;
2185     case float_class_inf:
2186         s->float_exception_flags = orig_flags | float_flag_invalid;
2187         return p.sign ? min : max;
2188     case float_class_zero:
2189         return 0;
2190     case float_class_normal:
2191         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2192             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2193         } else {
2194             r = UINT64_MAX;
2195         }
2196         if (p.sign) {
2197             if (r <= -(uint64_t) min) {
2198                 return -r;
2199             } else {
2200                 s->float_exception_flags = orig_flags | float_flag_invalid;
2201                 return min;
2202             }
2203         } else {
2204             if (r <= max) {
2205                 return r;
2206             } else {
2207                 s->float_exception_flags = orig_flags | float_flag_invalid;
2208                 return max;
2209             }
2210         }
2211     default:
2212         g_assert_not_reached();
2213     }
2214 }
2215
2216 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2217                               float_status *s)
2218 {
2219     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2220                                  rmode, scale, INT8_MIN, INT8_MAX, s);
2221 }
2222
2223 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2224                                 float_status *s)
2225 {
2226     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2227                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2228 }
2229
2230 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2231                                 float_status *s)
2232 {
2233     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2234                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2235 }
2236
2237 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2238                                 float_status *s)
2239 {
2240     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2241                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2242 }
2243
2244 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2245                                 float_status *s)
2246 {
2247     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2248                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2249 }
2250
2251 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2252                                 float_status *s)
2253 {
2254     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2255                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2256 }
2257
2258 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2259                                 float_status *s)
2260 {
2261     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2262                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2263 }
2264
2265 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2266                                 float_status *s)
2267 {
2268     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2269                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2270 }
2271
2272 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2273                                 float_status *s)
2274 {
2275     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2276                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2277 }
2278
2279 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2280                                 float_status *s)
2281 {
2282     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2283                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2284 }
2285
2286 int8_t float16_to_int8(float16 a, float_status *s)
2287 {
2288     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2289 }
2290
2291 int16_t float16_to_int16(float16 a, float_status *s)
2292 {
2293     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2294 }
2295
2296 int32_t float16_to_int32(float16 a, float_status *s)
2297 {
2298     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2299 }
2300
2301 int64_t float16_to_int64(float16 a, float_status *s)
2302 {
2303     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2304 }
2305
2306 int16_t float32_to_int16(float32 a, float_status *s)
2307 {
2308     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2309 }
2310
2311 int32_t float32_to_int32(float32 a, float_status *s)
2312 {
2313     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2314 }
2315
2316 int64_t float32_to_int64(float32 a, float_status *s)
2317 {
2318     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2319 }
2320
2321 int16_t float64_to_int16(float64 a, float_status *s)
2322 {
2323     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2324 }
2325
2326 int32_t float64_to_int32(float64 a, float_status *s)
2327 {
2328     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2329 }
2330
2331 int64_t float64_to_int64(float64 a, float_status *s)
2332 {
2333     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2334 }
2335
2336 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2337 {
2338     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2339 }
2340
2341 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2342 {
2343     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2344 }
2345
2346 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2347 {
2348     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2349 }
2350
2351 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2352 {
2353     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2354 }
2355
2356 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2357 {
2358     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2359 }
2360
2361 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2362 {
2363     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2364 }
2365
2366 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2367 {
2368     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2369 }
2370
2371 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2372 {
2373     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2374 }
2375
2376 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2377 {
2378     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2379 }
2380
2381 /*
2382  * Returns the result of converting the floating-point value `a' to
2383  * the two's complement integer format.
2384  */
2385
2386 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2387                                  float_status *s)
2388 {
2389     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2390                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2391 }
2392
2393 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2394                                  float_status *s)
2395 {
2396     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2397                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2398 }
2399
2400 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2401                                  float_status *s)
2402 {
2403     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2404                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2405 }
2406
2407 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2408 {
2409     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2410 }
2411
2412 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2413 {
2414     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2415 }
2416
2417 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2418 {
2419     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2420 }
2421
2422 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2423 {
2424     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2425 }
2426
2427 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2428 {
2429     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2430 }
2431
2432 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2433 {
2434     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2435 }
2436
2437 /*
2438  *  Returns the result of converting the floating-point value `a' to
2439  *  the unsigned integer format. The conversion is performed according
2440  *  to the IEC/IEEE Standard for Binary Floating-Point
2441  *  Arithmetic---which means in particular that the conversion is
2442  *  rounded according to the current rounding mode. If `a' is a NaN,
2443  *  the largest unsigned integer is returned. Otherwise, if the
2444  *  conversion overflows, the largest unsigned integer is returned. If
2445  *  the 'a' is negative, the result is rounded and zero is returned;
2446  *  values that do not round to zero will raise the inexact exception
2447  *  flag.
2448  */
2449
2450 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2451                                        int scale, uint64_t max,
2452                                        float_status *s)
2453 {
2454     int orig_flags = get_float_exception_flags(s);
2455     FloatParts p = round_to_int(in, rmode, scale, s);
2456     uint64_t r;
2457
2458     switch (p.cls) {
2459     case float_class_snan:
2460     case float_class_qnan:
2461         s->float_exception_flags = orig_flags | float_flag_invalid;
2462         return max;
2463     case float_class_inf:
2464         s->float_exception_flags = orig_flags | float_flag_invalid;
2465         return p.sign ? 0 : max;
2466     case float_class_zero:
2467         return 0;
2468     case float_class_normal:
2469         if (p.sign) {
2470             s->float_exception_flags = orig_flags | float_flag_invalid;
2471             return 0;
2472         }
2473
2474         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2475             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2476         } else {
2477             s->float_exception_flags = orig_flags | float_flag_invalid;
2478             return max;
2479         }
2480
2481         /* For uint64 this will never trip, but if p.exp is too large
2482          * to shift a decomposed fraction we shall have exited via the
2483          * 3rd leg above.
2484          */
2485         if (r > max) {
2486             s->float_exception_flags = orig_flags | float_flag_invalid;
2487             return max;
2488         }
2489         return r;
2490     default:
2491         g_assert_not_reached();
2492     }
2493 }
2494
2495 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2496                                 float_status *s)
2497 {
2498     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2499                                   rmode, scale, UINT8_MAX, s);
2500 }
2501
2502 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2503                                   float_status *s)
2504 {
2505     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2506                                   rmode, scale, UINT16_MAX, s);
2507 }
2508
2509 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2510                                   float_status *s)
2511 {
2512     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2513                                   rmode, scale, UINT32_MAX, s);
2514 }
2515
2516 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2517                                   float_status *s)
2518 {
2519     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2520                                   rmode, scale, UINT64_MAX, s);
2521 }
2522
2523 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2524                                   float_status *s)
2525 {
2526     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2527                                   rmode, scale, UINT16_MAX, s);
2528 }
2529
2530 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2531                                   float_status *s)
2532 {
2533     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2534                                   rmode, scale, UINT32_MAX, s);
2535 }
2536
2537 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2538                                   float_status *s)
2539 {
2540     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2541                                   rmode, scale, UINT64_MAX, s);
2542 }
2543
2544 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2545                                   float_status *s)
2546 {
2547     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2548                                   rmode, scale, UINT16_MAX, s);
2549 }
2550
2551 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2552                                   float_status *s)
2553 {
2554     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2555                                   rmode, scale, UINT32_MAX, s);
2556 }
2557
2558 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2559                                   float_status *s)
2560 {
2561     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2562                                   rmode, scale, UINT64_MAX, s);
2563 }
2564
2565 uint8_t float16_to_uint8(float16 a, float_status *s)
2566 {
2567     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2568 }
2569
2570 uint16_t float16_to_uint16(float16 a, float_status *s)
2571 {
2572     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2573 }
2574
2575 uint32_t float16_to_uint32(float16 a, float_status *s)
2576 {
2577     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2578 }
2579
2580 uint64_t float16_to_uint64(float16 a, float_status *s)
2581 {
2582     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2583 }
2584
2585 uint16_t float32_to_uint16(float32 a, float_status *s)
2586 {
2587     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2588 }
2589
2590 uint32_t float32_to_uint32(float32 a, float_status *s)
2591 {
2592     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2593 }
2594
2595 uint64_t float32_to_uint64(float32 a, float_status *s)
2596 {
2597     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2598 }
2599
2600 uint16_t float64_to_uint16(float64 a, float_status *s)
2601 {
2602     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2603 }
2604
2605 uint32_t float64_to_uint32(float64 a, float_status *s)
2606 {
2607     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2608 }
2609
2610 uint64_t float64_to_uint64(float64 a, float_status *s)
2611 {
2612     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2613 }
2614
2615 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2616 {
2617     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2618 }
2619
2620 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2621 {
2622     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2623 }
2624
2625 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2626 {
2627     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2628 }
2629
2630 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2631 {
2632     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2633 }
2634
2635 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2636 {
2637     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2638 }
2639
2640 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2641 {
2642     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2643 }
2644
2645 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2646 {
2647     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2648 }
2649
2650 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2651 {
2652     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2653 }
2654
2655 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2656 {
2657     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2658 }
2659
2660 /*
2661  *  Returns the result of converting the bfloat16 value `a' to
2662  *  the unsigned integer format.
2663  */
2664
2665 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2666                                    int scale, float_status *s)
2667 {
2668     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2669                                   rmode, scale, UINT16_MAX, s);
2670 }
2671
2672 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2673                                    int scale, float_status *s)
2674 {
2675     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2676                                   rmode, scale, UINT32_MAX, s);
2677 }
2678
2679 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2680                                    int scale, float_status *s)
2681 {
2682     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2683                                   rmode, scale, UINT64_MAX, s);
2684 }
2685
2686 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2687 {
2688     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2689 }
2690
2691 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2692 {
2693     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2694 }
2695
2696 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2697 {
2698     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2699 }
2700
2701 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2702 {
2703     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2704 }
2705
2706 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2707 {
2708     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2709 }
2710
2711 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2712 {
2713     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2714 }
2715
2716 /*
2717  * Integer to float conversions
2718  *
2719  * Returns the result of converting the two's complement integer `a'
2720  * to the floating-point format. The conversion is performed according
2721  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2722  */
2723
2724 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2725 {
2726     FloatParts r = { .sign = false };
2727
2728     if (a == 0) {
2729         r.cls = float_class_zero;
2730     } else {
2731         uint64_t f = a;
2732         int shift;
2733
2734         r.cls = float_class_normal;
2735         if (a < 0) {
2736             f = -f;
2737             r.sign = true;
2738         }
2739         shift = clz64(f);
2740         scale = MIN(MAX(scale, -0x10000), 0x10000);
2741
2742         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2743         r.frac = f << shift;
2744     }
2745
2746     return r;
2747 }
2748
2749 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2750 {
2751     FloatParts pa = int_to_float(a, scale, status);
2752     return float16_round_pack_canonical(pa, status);
2753 }
2754
2755 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2756 {
2757     return int64_to_float16_scalbn(a, scale, status);
2758 }
2759
2760 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2761 {
2762     return int64_to_float16_scalbn(a, scale, status);
2763 }
2764
2765 float16 int64_to_float16(int64_t a, float_status *status)
2766 {
2767     return int64_to_float16_scalbn(a, 0, status);
2768 }
2769
2770 float16 int32_to_float16(int32_t a, float_status *status)
2771 {
2772     return int64_to_float16_scalbn(a, 0, status);
2773 }
2774
2775 float16 int16_to_float16(int16_t a, float_status *status)
2776 {
2777     return int64_to_float16_scalbn(a, 0, status);
2778 }
2779
2780 float16 int8_to_float16(int8_t a, float_status *status)
2781 {
2782     return int64_to_float16_scalbn(a, 0, status);
2783 }
2784
2785 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2786 {
2787     FloatParts pa = int_to_float(a, scale, status);
2788     return float32_round_pack_canonical(pa, status);
2789 }
2790
2791 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2792 {
2793     return int64_to_float32_scalbn(a, scale, status);
2794 }
2795
2796 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2797 {
2798     return int64_to_float32_scalbn(a, scale, status);
2799 }
2800
2801 float32 int64_to_float32(int64_t a, float_status *status)
2802 {
2803     return int64_to_float32_scalbn(a, 0, status);
2804 }
2805
2806 float32 int32_to_float32(int32_t a, float_status *status)
2807 {
2808     return int64_to_float32_scalbn(a, 0, status);
2809 }
2810
2811 float32 int16_to_float32(int16_t a, float_status *status)
2812 {
2813     return int64_to_float32_scalbn(a, 0, status);
2814 }
2815
2816 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2817 {
2818     FloatParts pa = int_to_float(a, scale, status);
2819     return float64_round_pack_canonical(pa, status);
2820 }
2821
2822 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2823 {
2824     return int64_to_float64_scalbn(a, scale, status);
2825 }
2826
2827 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2828 {
2829     return int64_to_float64_scalbn(a, scale, status);
2830 }
2831
2832 float64 int64_to_float64(int64_t a, float_status *status)
2833 {
2834     return int64_to_float64_scalbn(a, 0, status);
2835 }
2836
2837 float64 int32_to_float64(int32_t a, float_status *status)
2838 {
2839     return int64_to_float64_scalbn(a, 0, status);
2840 }
2841
2842 float64 int16_to_float64(int16_t a, float_status *status)
2843 {
2844     return int64_to_float64_scalbn(a, 0, status);
2845 }
2846
2847 /*
2848  * Returns the result of converting the two's complement integer `a'
2849  * to the bfloat16 format.
2850  */
2851
2852 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2853 {
2854     FloatParts pa = int_to_float(a, scale, status);
2855     return bfloat16_round_pack_canonical(pa, status);
2856 }
2857
2858 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2859 {
2860     return int64_to_bfloat16_scalbn(a, scale, status);
2861 }
2862
2863 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2864 {
2865     return int64_to_bfloat16_scalbn(a, scale, status);
2866 }
2867
2868 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2869 {
2870     return int64_to_bfloat16_scalbn(a, 0, status);
2871 }
2872
2873 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2874 {
2875     return int64_to_bfloat16_scalbn(a, 0, status);
2876 }
2877
2878 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2879 {
2880     return int64_to_bfloat16_scalbn(a, 0, status);
2881 }
2882
2883 /*
2884  * Unsigned Integer to float conversions
2885  *
2886  * Returns the result of converting the unsigned integer `a' to the
2887  * floating-point format. The conversion is performed according to the
2888  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2889  */
2890
2891 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2892 {
2893     FloatParts r = { .sign = false };
2894     int shift;
2895
2896     if (a == 0) {
2897         r.cls = float_class_zero;
2898     } else {
2899         scale = MIN(MAX(scale, -0x10000), 0x10000);
2900         shift = clz64(a);
2901         r.cls = float_class_normal;
2902         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2903         r.frac = a << shift;
2904     }
2905
2906     return r;
2907 }
2908
2909 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2910 {
2911     FloatParts pa = uint_to_float(a, scale, status);
2912     return float16_round_pack_canonical(pa, status);
2913 }
2914
2915 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2916 {
2917     return uint64_to_float16_scalbn(a, scale, status);
2918 }
2919
2920 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2921 {
2922     return uint64_to_float16_scalbn(a, scale, status);
2923 }
2924
2925 float16 uint64_to_float16(uint64_t a, float_status *status)
2926 {
2927     return uint64_to_float16_scalbn(a, 0, status);
2928 }
2929
2930 float16 uint32_to_float16(uint32_t a, float_status *status)
2931 {
2932     return uint64_to_float16_scalbn(a, 0, status);
2933 }
2934
2935 float16 uint16_to_float16(uint16_t a, float_status *status)
2936 {
2937     return uint64_to_float16_scalbn(a, 0, status);
2938 }
2939
2940 float16 uint8_to_float16(uint8_t a, float_status *status)
2941 {
2942     return uint64_to_float16_scalbn(a, 0, status);
2943 }
2944
2945 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2946 {
2947     FloatParts pa = uint_to_float(a, scale, status);
2948     return float32_round_pack_canonical(pa, status);
2949 }
2950
2951 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2952 {
2953     return uint64_to_float32_scalbn(a, scale, status);
2954 }
2955
2956 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2957 {
2958     return uint64_to_float32_scalbn(a, scale, status);
2959 }
2960
2961 float32 uint64_to_float32(uint64_t a, float_status *status)
2962 {
2963     return uint64_to_float32_scalbn(a, 0, status);
2964 }
2965
2966 float32 uint32_to_float32(uint32_t a, float_status *status)
2967 {
2968     return uint64_to_float32_scalbn(a, 0, status);
2969 }
2970
2971 float32 uint16_to_float32(uint16_t a, float_status *status)
2972 {
2973     return uint64_to_float32_scalbn(a, 0, status);
2974 }
2975
2976 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2977 {
2978     FloatParts pa = uint_to_float(a, scale, status);
2979     return float64_round_pack_canonical(pa, status);
2980 }
2981
2982 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2983 {
2984     return uint64_to_float64_scalbn(a, scale, status);
2985 }
2986
2987 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2988 {
2989     return uint64_to_float64_scalbn(a, scale, status);
2990 }
2991
2992 float64 uint64_to_float64(uint64_t a, float_status *status)
2993 {
2994     return uint64_to_float64_scalbn(a, 0, status);
2995 }
2996
2997 float64 uint32_to_float64(uint32_t a, float_status *status)
2998 {
2999     return uint64_to_float64_scalbn(a, 0, status);
3000 }
3001
3002 float64 uint16_to_float64(uint16_t a, float_status *status)
3003 {
3004     return uint64_to_float64_scalbn(a, 0, status);
3005 }
3006
3007 /*
3008  * Returns the result of converting the unsigned integer `a' to the
3009  * bfloat16 format.
3010  */
3011
3012 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3013 {
3014     FloatParts pa = uint_to_float(a, scale, status);
3015     return bfloat16_round_pack_canonical(pa, status);
3016 }
3017
3018 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3019 {
3020     return uint64_to_bfloat16_scalbn(a, scale, status);
3021 }
3022
3023 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3024 {
3025     return uint64_to_bfloat16_scalbn(a, scale, status);
3026 }
3027
3028 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3029 {
3030     return uint64_to_bfloat16_scalbn(a, 0, status);
3031 }
3032
3033 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3034 {
3035     return uint64_to_bfloat16_scalbn(a, 0, status);
3036 }
3037
3038 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3039 {
3040     return uint64_to_bfloat16_scalbn(a, 0, status);
3041 }
3042
3043 /* Float Min/Max */
3044 /* min() and max() functions. These can't be implemented as
3045  * 'compare and pick one input' because that would mishandle
3046  * NaNs and +0 vs -0.
3047  *
3048  * minnum() and maxnum() functions. These are similar to the min()
3049  * and max() functions but if one of the arguments is a QNaN and
3050  * the other is numerical then the numerical argument is returned.
3051  * SNaNs will get quietened before being returned.
3052  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3053  * and maxNum() operations. min() and max() are the typical min/max
3054  * semantics provided by many CPUs which predate that specification.
3055  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
3058  */
3059 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
3060                                 bool ieee, bool ismag, float_status *s)
3061 {
3062     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3063         if (ieee) {
3064             /* Takes two floating-point values `a' and `b', one of
3065              * which is a NaN, and returns the appropriate NaN
3066              * result. If either `a' or `b' is a signaling NaN,
3067              * the invalid exception is raised.
3068              */
3069             if (is_snan(a.cls) || is_snan(b.cls)) {
3070                 return pick_nan(a, b, s);
3071             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3072                 return b;
3073             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3074                 return a;
3075             }
3076         }
3077         return pick_nan(a, b, s);
3078     } else {
3079         int a_exp, b_exp;
3080
3081         switch (a.cls) {
3082         case float_class_normal:
3083             a_exp = a.exp;
3084             break;
3085         case float_class_inf:
3086             a_exp = INT_MAX;
3087             break;
3088         case float_class_zero:
3089             a_exp = INT_MIN;
3090             break;
3091         default:
3092             g_assert_not_reached();
3093             break;
3094         }
3095         switch (b.cls) {
3096         case float_class_normal:
3097             b_exp = b.exp;
3098             break;
3099         case float_class_inf:
3100             b_exp = INT_MAX;
3101             break;
3102         case float_class_zero:
3103             b_exp = INT_MIN;
3104             break;
3105         default:
3106             g_assert_not_reached();
3107             break;
3108         }
3109
3110         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3111             bool a_less = a_exp < b_exp;
3112             if (a_exp == b_exp) {
3113                 a_less = a.frac < b.frac;
3114             }
3115             return a_less ^ ismin ? b : a;
3116         }
3117
3118         if (a.sign == b.sign) {
3119             bool a_less = a_exp < b_exp;
3120             if (a_exp == b_exp) {
3121                 a_less = a.frac < b.frac;
3122             }
3123             return a.sign ^ a_less ^ ismin ? b : a;
3124         } else {
3125             return a.sign ^ ismin ? b : a;
3126         }
3127     }
3128 }
3129
3130 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3131 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3132                                      float_status *s)                   \
3133 {                                                                       \
3134     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
3135     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
3136     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3137                                                                         \
3138     return float ## sz ## _round_pack_canonical(pr, s);                 \
3139 }
3140
3141 MINMAX(16, min, true, false, false)
3142 MINMAX(16, minnum, true, true, false)
3143 MINMAX(16, minnummag, true, true, true)
3144 MINMAX(16, max, false, false, false)
3145 MINMAX(16, maxnum, false, true, false)
3146 MINMAX(16, maxnummag, false, true, true)
3147
3148 MINMAX(32, min, true, false, false)
3149 MINMAX(32, minnum, true, true, false)
3150 MINMAX(32, minnummag, true, true, true)
3151 MINMAX(32, max, false, false, false)
3152 MINMAX(32, maxnum, false, true, false)
3153 MINMAX(32, maxnummag, false, true, true)
3154
3155 MINMAX(64, min, true, false, false)
3156 MINMAX(64, minnum, true, true, false)
3157 MINMAX(64, minnummag, true, true, true)
3158 MINMAX(64, max, false, false, false)
3159 MINMAX(64, maxnum, false, true, false)
3160 MINMAX(64, maxnummag, false, true, true)
3161
3162 #undef MINMAX
3163
3164 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3165 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3166 {                                                                       \
3167     FloatParts pa = bfloat16_unpack_canonical(a, s);                    \
3168     FloatParts pb = bfloat16_unpack_canonical(b, s);                    \
3169     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3170                                                                         \
3171     return bfloat16_round_pack_canonical(pr, s);                        \
3172 }
3173
3174 BF16_MINMAX(min, true, false, false)
3175 BF16_MINMAX(minnum, true, true, false)
3176 BF16_MINMAX(minnummag, true, true, true)
3177 BF16_MINMAX(max, false, false, false)
3178 BF16_MINMAX(maxnum, false, true, false)
3179 BF16_MINMAX(maxnummag, false, true, true)
3180
3181 #undef BF16_MINMAX
3182
3183 /* Floating point compare */
3184 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
3185                                     float_status *s)
3186 {
3187     if (is_nan(a.cls) || is_nan(b.cls)) {
3188         if (!is_quiet ||
3189             a.cls == float_class_snan ||
3190             b.cls == float_class_snan) {
3191             s->float_exception_flags |= float_flag_invalid;
3192         }
3193         return float_relation_unordered;
3194     }
3195
3196     if (a.cls == float_class_zero) {
3197         if (b.cls == float_class_zero) {
3198             return float_relation_equal;
3199         }
3200         return b.sign ? float_relation_greater : float_relation_less;
3201     } else if (b.cls == float_class_zero) {
3202         return a.sign ? float_relation_less : float_relation_greater;
3203     }
3204
3205     /* The only really important thing about infinity is its sign. If
3206      * both are infinities the sign marks the smallest of the two.
3207      */
3208     if (a.cls == float_class_inf) {
3209         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3210             return float_relation_equal;
3211         }
3212         return a.sign ? float_relation_less : float_relation_greater;
3213     } else if (b.cls == float_class_inf) {
3214         return b.sign ? float_relation_greater : float_relation_less;
3215     }
3216
3217     if (a.sign != b.sign) {
3218         return a.sign ? float_relation_less : float_relation_greater;
3219     }
3220
3221     if (a.exp == b.exp) {
3222         if (a.frac == b.frac) {
3223             return float_relation_equal;
3224         }
3225         if (a.sign) {
3226             return a.frac > b.frac ?
3227                 float_relation_less : float_relation_greater;
3228         } else {
3229             return a.frac > b.frac ?
3230                 float_relation_greater : float_relation_less;
3231         }
3232     } else {
3233         if (a.sign) {
3234             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3235         } else {
3236             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3237         }
3238     }
3239 }
3240
3241 #define COMPARE(name, attr, sz)                                         \
3242 static int attr                                                         \
3243 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3244 {                                                                       \
3245     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
3246     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
3247     return compare_floats(pa, pb, is_quiet, s);                         \
3248 }
3249
3250 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3251 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3252 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3253
3254 #undef COMPARE
3255
3256 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3257 {
3258     return soft_f16_compare(a, b, false, s);
3259 }
3260
3261 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3262 {
3263     return soft_f16_compare(a, b, true, s);
3264 }
3265
3266 static FloatRelation QEMU_FLATTEN
3267 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3268 {
3269     union_float32 ua, ub;
3270
3271     ua.s = xa;
3272     ub.s = xb;
3273
3274     if (QEMU_NO_HARDFLOAT) {
3275         goto soft;
3276     }
3277
3278     float32_input_flush2(&ua.s, &ub.s, s);
3279     if (isgreaterequal(ua.h, ub.h)) {
3280         if (isgreater(ua.h, ub.h)) {
3281             return float_relation_greater;
3282         }
3283         return float_relation_equal;
3284     }
3285     if (likely(isless(ua.h, ub.h))) {
3286         return float_relation_less;
3287     }
3288     /* The only condition remaining is unordered.
3289      * Fall through to set flags.
3290      */
3291  soft:
3292     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3293 }
3294
3295 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3296 {
3297     return f32_compare(a, b, false, s);
3298 }
3299
3300 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3301 {
3302     return f32_compare(a, b, true, s);
3303 }
3304
3305 static FloatRelation QEMU_FLATTEN
3306 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3307 {
3308     union_float64 ua, ub;
3309
3310     ua.s = xa;
3311     ub.s = xb;
3312
3313     if (QEMU_NO_HARDFLOAT) {
3314         goto soft;
3315     }
3316
3317     float64_input_flush2(&ua.s, &ub.s, s);
3318     if (isgreaterequal(ua.h, ub.h)) {
3319         if (isgreater(ua.h, ub.h)) {
3320             return float_relation_greater;
3321         }
3322         return float_relation_equal;
3323     }
3324     if (likely(isless(ua.h, ub.h))) {
3325         return float_relation_less;
3326     }
3327     /* The only condition remaining is unordered.
3328      * Fall through to set flags.
3329      */
3330  soft:
3331     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3332 }
3333
3334 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3335 {
3336     return f64_compare(a, b, false, s);
3337 }
3338
3339 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3340 {
3341     return f64_compare(a, b, true, s);
3342 }
3343
3344 static FloatRelation QEMU_FLATTEN
3345 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3346 {
3347     FloatParts pa = bfloat16_unpack_canonical(a, s);
3348     FloatParts pb = bfloat16_unpack_canonical(b, s);
3349     return compare_floats(pa, pb, is_quiet, s);
3350 }
3351
3352 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3353 {
3354     return soft_bf16_compare(a, b, false, s);
3355 }
3356
3357 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3358 {
3359     return soft_bf16_compare(a, b, true, s);
3360 }
3361
3362 /* Multiply A by 2 raised to the power N.  */
3363 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3364 {
3365     if (unlikely(is_nan(a.cls))) {
3366         return return_nan(a, s);
3367     }
3368     if (a.cls == float_class_normal) {
3369         /* The largest float type (even though not supported by FloatParts)
3370          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3371          * still allows rounding to infinity, without allowing overflow
3372          * within the int32_t that backs FloatParts.exp.
3373          */
3374         n = MIN(MAX(n, -0x10000), 0x10000);
3375         a.exp += n;
3376     }
3377     return a;
3378 }
3379
3380 float16 float16_scalbn(float16 a, int n, float_status *status)
3381 {
3382     FloatParts pa = float16_unpack_canonical(a, status);
3383     FloatParts pr = scalbn_decomposed(pa, n, status);
3384     return float16_round_pack_canonical(pr, status);
3385 }
3386
3387 float32 float32_scalbn(float32 a, int n, float_status *status)
3388 {
3389     FloatParts pa = float32_unpack_canonical(a, status);
3390     FloatParts pr = scalbn_decomposed(pa, n, status);
3391     return float32_round_pack_canonical(pr, status);
3392 }
3393
3394 float64 float64_scalbn(float64 a, int n, float_status *status)
3395 {
3396     FloatParts pa = float64_unpack_canonical(a, status);
3397     FloatParts pr = scalbn_decomposed(pa, n, status);
3398     return float64_round_pack_canonical(pr, status);
3399 }
3400
3401 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3402 {
3403     FloatParts pa = bfloat16_unpack_canonical(a, status);
3404     FloatParts pr = scalbn_decomposed(pa, n, status);
3405     return bfloat16_round_pack_canonical(pr, status);
3406 }
3407
3408 /*
3409  * Square Root
3410  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3413  * square root by iterating down from the implicit bit to enough extra
3414  * bits to ensure we get a correctly rounded result.
3415  *
3416  * This does mean however the calculation is slower than before,
3417  * especially for 64 bit floats.
3418  */
3419
3420 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3421 {
3422     uint64_t a_frac, r_frac, s_frac;
3423     int bit, last_bit;
3424
3425     if (is_nan(a.cls)) {
3426         return return_nan(a, s);
3427     }
3428     if (a.cls == float_class_zero) {
3429         return a;  /* sqrt(+-0) = +-0 */
3430     }
3431     if (a.sign) {
3432         s->float_exception_flags |= float_flag_invalid;
3433         return parts_default_nan(s);
3434     }
3435     if (a.cls == float_class_inf) {
3436         return a;  /* sqrt(+inf) = +inf */
3437     }
3438
3439     assert(a.cls == float_class_normal);
3440
3441     /* We need two overflow bits at the top. Adding room for that is a
3442      * right shift. If the exponent is odd, we can discard the low bit
3443      * by multiplying the fraction by 2; that's a left shift. Combine
3444      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3445      */
3446     a_frac = a.frac >> (2 - (a.exp & 1));
3447     a.exp >>= 1;
3448
3449     /* Bit-by-bit computation of sqrt.  */
3450     r_frac = 0;
3451     s_frac = 0;
3452
3453     /* Iterate from implicit bit down to the 3 extra bits to compute a
3454      * properly rounded result. Remember we've inserted two more bits
3455      * at the top, so these positions are two less.
3456      */
3457     bit = DECOMPOSED_BINARY_POINT - 2;
3458     last_bit = MAX(p->frac_shift - 4, 0);
3459     do {
3460         uint64_t q = 1ULL << bit;
3461         uint64_t t_frac = s_frac + q;
3462         if (t_frac <= a_frac) {
3463             s_frac = t_frac + q;
3464             a_frac -= t_frac;
3465             r_frac += q;
3466         }
3467         a_frac <<= 1;
3468     } while (--bit >= last_bit);
3469
3470     /* Undo the right shift done above. If there is any remaining
3471      * fraction, the result is inexact. Set the sticky bit.
3472      */
3473     a.frac = (r_frac << 2) + (a_frac != 0);
3474
3475     return a;
3476 }
3477
3478 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3479 {
3480     FloatParts pa = float16_unpack_canonical(a, status);
3481     FloatParts pr = sqrt_float(pa, status, &float16_params);
3482     return float16_round_pack_canonical(pr, status);
3483 }
3484
3485 static float32 QEMU_SOFTFLOAT_ATTR
3486 soft_f32_sqrt(float32 a, float_status *status)
3487 {
3488     FloatParts pa = float32_unpack_canonical(a, status);
3489     FloatParts pr = sqrt_float(pa, status, &float32_params);
3490     return float32_round_pack_canonical(pr, status);
3491 }
3492
3493 static float64 QEMU_SOFTFLOAT_ATTR
3494 soft_f64_sqrt(float64 a, float_status *status)
3495 {
3496     FloatParts pa = float64_unpack_canonical(a, status);
3497     FloatParts pr = sqrt_float(pa, status, &float64_params);
3498     return float64_round_pack_canonical(pr, status);
3499 }
3500
3501 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3502 {
3503     union_float32 ua, ur;
3504
3505     ua.s = xa;
3506     if (unlikely(!can_use_fpu(s))) {
3507         goto soft;
3508     }
3509
3510     float32_input_flush1(&ua.s, s);
3511     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3512         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3513                        fpclassify(ua.h) == FP_ZERO) ||
3514                      signbit(ua.h))) {
3515             goto soft;
3516         }
3517     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3518                         float32_is_neg(ua.s))) {
3519         goto soft;
3520     }
3521     ur.h = sqrtf(ua.h);
3522     return ur.s;
3523
3524  soft:
3525     return soft_f32_sqrt(ua.s, s);
3526 }
3527
3528 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3529 {
3530     union_float64 ua, ur;
3531
3532     ua.s = xa;
3533     if (unlikely(!can_use_fpu(s))) {
3534         goto soft;
3535     }
3536
3537     float64_input_flush1(&ua.s, s);
3538     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3539         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3540                        fpclassify(ua.h) == FP_ZERO) ||
3541                      signbit(ua.h))) {
3542             goto soft;
3543         }
3544     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3545                         float64_is_neg(ua.s))) {
3546         goto soft;
3547     }
3548     ur.h = sqrt(ua.h);
3549     return ur.s;
3550
3551  soft:
3552     return soft_f64_sqrt(ua.s, s);
3553 }
3554
3555 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3556 {
3557     FloatParts pa = bfloat16_unpack_canonical(a, status);
3558     FloatParts pr = sqrt_float(pa, status, &bfloat16_params);
3559     return bfloat16_round_pack_canonical(pr, status);
3560 }
3561
3562 /*----------------------------------------------------------------------------
3563 | The pattern for a default generated NaN.
3564 *----------------------------------------------------------------------------*/
3565
3566 float16 float16_default_nan(float_status *status)
3567 {
3568     FloatParts p = parts_default_nan(status);
3569     p.frac >>= float16_params.frac_shift;
3570     return float16_pack_raw(p);
3571 }
3572
3573 float32 float32_default_nan(float_status *status)
3574 {
3575     FloatParts p = parts_default_nan(status);
3576     p.frac >>= float32_params.frac_shift;
3577     return float32_pack_raw(p);
3578 }
3579
3580 float64 float64_default_nan(float_status *status)
3581 {
3582     FloatParts p = parts_default_nan(status);
3583     p.frac >>= float64_params.frac_shift;
3584     return float64_pack_raw(p);
3585 }
3586
3587 float128 float128_default_nan(float_status *status)
3588 {
3589     FloatParts p = parts_default_nan(status);
3590     float128 r;
3591
3592     /* Extrapolate from the choices made by parts_default_nan to fill
3593      * in the quad-floating format.  If the low bit is set, assume we
3594      * want to set all non-snan bits.
3595      */
3596     r.low = -(p.frac & 1);
3597     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3598     r.high |= UINT64_C(0x7FFF000000000000);
3599     r.high |= (uint64_t)p.sign << 63;
3600
3601     return r;
3602 }
3603
3604 bfloat16 bfloat16_default_nan(float_status *status)
3605 {
3606     FloatParts p = parts_default_nan(status);
3607     p.frac >>= bfloat16_params.frac_shift;
3608     return bfloat16_pack_raw(p);
3609 }
3610
3611 /*----------------------------------------------------------------------------
3612 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3613 *----------------------------------------------------------------------------*/
3614
3615 float16 float16_silence_nan(float16 a, float_status *status)
3616 {
3617     FloatParts p = float16_unpack_raw(a);
3618     p.frac <<= float16_params.frac_shift;
3619     p = parts_silence_nan(p, status);
3620     p.frac >>= float16_params.frac_shift;
3621     return float16_pack_raw(p);
3622 }
3623
3624 float32 float32_silence_nan(float32 a, float_status *status)
3625 {
3626     FloatParts p = float32_unpack_raw(a);
3627     p.frac <<= float32_params.frac_shift;
3628     p = parts_silence_nan(p, status);
3629     p.frac >>= float32_params.frac_shift;
3630     return float32_pack_raw(p);
3631 }
3632
3633 float64 float64_silence_nan(float64 a, float_status *status)
3634 {
3635     FloatParts p = float64_unpack_raw(a);
3636     p.frac <<= float64_params.frac_shift;
3637     p = parts_silence_nan(p, status);
3638     p.frac >>= float64_params.frac_shift;
3639     return float64_pack_raw(p);
3640 }
3641
3642 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3643 {
3644     FloatParts p = bfloat16_unpack_raw(a);
3645     p.frac <<= bfloat16_params.frac_shift;
3646     p = parts_silence_nan(p, status);
3647     p.frac >>= bfloat16_params.frac_shift;
3648     return bfloat16_pack_raw(p);
3649 }
3650
3651 /*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-inputs-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
3654 *----------------------------------------------------------------------------*/
3655
3656 static bool parts_squash_denormal(FloatParts p, float_status *status)
3657 {
3658     if (p.exp == 0 && p.frac != 0) {
3659         float_raise(float_flag_input_denormal, status);
3660         return true;
3661     }
3662
3663     return false;
3664 }
3665
3666 float16 float16_squash_input_denormal(float16 a, float_status *status)
3667 {
3668     if (status->flush_inputs_to_zero) {
3669         FloatParts p = float16_unpack_raw(a);
3670         if (parts_squash_denormal(p, status)) {
3671             return float16_set_sign(float16_zero, p.sign);
3672         }
3673     }
3674     return a;
3675 }
3676
3677 float32 float32_squash_input_denormal(float32 a, float_status *status)
3678 {
3679     if (status->flush_inputs_to_zero) {
3680         FloatParts p = float32_unpack_raw(a);
3681         if (parts_squash_denormal(p, status)) {
3682             return float32_set_sign(float32_zero, p.sign);
3683         }
3684     }
3685     return a;
3686 }
3687
3688 float64 float64_squash_input_denormal(float64 a, float_status *status)
3689 {
3690     if (status->flush_inputs_to_zero) {
3691         FloatParts p = float64_unpack_raw(a);
3692         if (parts_squash_denormal(p, status)) {
3693             return float64_set_sign(float64_zero, p.sign);
3694         }
3695     }
3696     return a;
3697 }
3698
3699 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3700 {
3701     if (status->flush_inputs_to_zero) {
3702         FloatParts p = bfloat16_unpack_raw(a);
3703         if (parts_squash_denormal(p, status)) {
3704             return bfloat16_set_sign(bfloat16_zero, p.sign);
3705         }
3706     }
3707     return a;
3708 }
3709
3710 /*----------------------------------------------------------------------------
3711 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3712 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3713 | input.  If `zSign' is 1, the input is negated before being converted to an
3714 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3715 | is simply rounded to an integer, with the inexact exception raised if the
3716 | input cannot be represented exactly as an integer.  However, if the fixed-
3717 | point input is too large, the invalid exception is raised and the largest
3718 | positive or negative integer is returned.
3719 *----------------------------------------------------------------------------*/
3720
3721 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3722                                  float_status *status)
3723 {
3724     int8_t roundingMode;
3725     bool roundNearestEven;
3726     int8_t roundIncrement, roundBits;
3727     int32_t z;
3728
3729     roundingMode = status->float_rounding_mode;
3730     roundNearestEven = ( roundingMode == float_round_nearest_even );
3731     switch (roundingMode) {
3732     case float_round_nearest_even:
3733     case float_round_ties_away:
3734         roundIncrement = 0x40;
3735         break;
3736     case float_round_to_zero:
3737         roundIncrement = 0;
3738         break;
3739     case float_round_up:
3740         roundIncrement = zSign ? 0 : 0x7f;
3741         break;
3742     case float_round_down:
3743         roundIncrement = zSign ? 0x7f : 0;
3744         break;
3745     case float_round_to_odd:
3746         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3747         break;
3748     default:
3749         abort();
3750     }
3751     roundBits = absZ & 0x7F;
3752     absZ = ( absZ + roundIncrement )>>7;
3753     if (!(roundBits ^ 0x40) && roundNearestEven) {
3754         absZ &= ~1;
3755     }
3756     z = absZ;
3757     if ( zSign ) z = - z;
3758     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3759         float_raise(float_flag_invalid, status);
3760         return zSign ? INT32_MIN : INT32_MAX;
3761     }
3762     if (roundBits) {
3763         status->float_exception_flags |= float_flag_inexact;
3764     }
3765     return z;
3766
3767 }
3768
3769 /*----------------------------------------------------------------------------
3770 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3771 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3772 | and returns the properly rounded 64-bit integer corresponding to the input.
3773 | If `zSign' is 1, the input is negated before being converted to an integer.
3774 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3775 | the inexact exception raised if the input cannot be represented exactly as
3776 | an integer.  However, if the fixed-point input is too large, the invalid
3777 | exception is raised and the largest positive or negative integer is
3778 | returned.
3779 *----------------------------------------------------------------------------*/
3780
3781 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3782                                float_status *status)
3783 {
3784     int8_t roundingMode;
3785     bool roundNearestEven, increment;
3786     int64_t z;
3787
3788     roundingMode = status->float_rounding_mode;
3789     roundNearestEven = ( roundingMode == float_round_nearest_even );
3790     switch (roundingMode) {
3791     case float_round_nearest_even:
3792     case float_round_ties_away:
3793         increment = ((int64_t) absZ1 < 0);
3794         break;
3795     case float_round_to_zero:
3796         increment = 0;
3797         break;
3798     case float_round_up:
3799         increment = !zSign && absZ1;
3800         break;
3801     case float_round_down:
3802         increment = zSign && absZ1;
3803         break;
3804     case float_round_to_odd:
3805         increment = !(absZ0 & 1) && absZ1;
3806         break;
3807     default:
3808         abort();
3809     }
3810     if ( increment ) {
3811         ++absZ0;
3812         if ( absZ0 == 0 ) goto overflow;
3813         if (!(absZ1 << 1) && roundNearestEven) {
3814             absZ0 &= ~1;
3815         }
3816     }
3817     z = absZ0;
3818     if ( zSign ) z = - z;
3819     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3820  overflow:
3821         float_raise(float_flag_invalid, status);
3822         return zSign ? INT64_MIN : INT64_MAX;
3823     }
3824     if (absZ1) {
3825         status->float_exception_flags |= float_flag_inexact;
3826     }
3827     return z;
3828
3829 }
3830
3831 /*----------------------------------------------------------------------------
3832 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3833 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3834 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3835 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3836 | with the inexact exception raised if the input cannot be represented exactly
3837 | as an integer.  However, if the fixed-point input is too large, the invalid
3838 | exception is raised and the largest unsigned integer is returned.
3839 *----------------------------------------------------------------------------*/
3840
3841 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3842                                 uint64_t absZ1, float_status *status)
3843 {
3844     int8_t roundingMode;
3845     bool roundNearestEven, increment;
3846
3847     roundingMode = status->float_rounding_mode;
3848     roundNearestEven = (roundingMode == float_round_nearest_even);
3849     switch (roundingMode) {
3850     case float_round_nearest_even:
3851     case float_round_ties_away:
3852         increment = ((int64_t)absZ1 < 0);
3853         break;
3854     case float_round_to_zero:
3855         increment = 0;
3856         break;
3857     case float_round_up:
3858         increment = !zSign && absZ1;
3859         break;
3860     case float_round_down:
3861         increment = zSign && absZ1;
3862         break;
3863     case float_round_to_odd:
3864         increment = !(absZ0 & 1) && absZ1;
3865         break;
3866     default:
3867         abort();
3868     }
3869     if (increment) {
3870         ++absZ0;
3871         if (absZ0 == 0) {
3872             float_raise(float_flag_invalid, status);
3873             return UINT64_MAX;
3874         }
3875         if (!(absZ1 << 1) && roundNearestEven) {
3876             absZ0 &= ~1;
3877         }
3878     }
3879
3880     if (zSign && absZ0) {
3881         float_raise(float_flag_invalid, status);
3882         return 0;
3883     }
3884
3885     if (absZ1) {
3886         status->float_exception_flags |= float_flag_inexact;
3887     }
3888     return absZ0;
3889 }
3890
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig': it is shifted
| left by (clz32(aSig) - 8) bits.  The shifted significand is stored through
| `zSigPtr' and the matching exponent, 1 minus the shift count, through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3908
3909 /*----------------------------------------------------------------------------
3910 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3911 | and significand `zSig', and returns the proper single-precision floating-
3912 | point value corresponding to the abstract input.  Ordinarily, the abstract
3913 | value is simply rounded and packed into the single-precision format, with
3914 | the inexact exception raised if the abstract input cannot be represented
3915 | exactly.  However, if the abstract value is too large, the overflow and
3916 | inexact exceptions are raised and an infinity or maximal finite value is
3917 | returned.  If the abstract value is too small, the input value is rounded to
3918 | a subnormal number, and the underflow and inexact exceptions are raised if
3919 | the abstract input cannot be represented exactly as a subnormal single-
3920 | precision floating-point number.
3921 |     The input significand `zSig' has its binary point between bits 30
3922 | and 29, which is 7 bits to the left of the usual location.  This shifted
3923 | significand must be normalized or smaller.  If `zSig' is not normalized,
3924 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3925 | and it must not require rounding.  In the usual case that `zSig' is
3926 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3927 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3928 | Binary Floating-Point Arithmetic.
3929 *----------------------------------------------------------------------------*/
3930
/*
 * Round the abstract value (zSign, zExp, zSig) according to
 * status->float_rounding_mode and pack it as a float32.  The low 7 bits of
 * zSig are the bits to be rounded away.  Raises overflow, underflow,
 * inexact and output-denormal flags as the individual paths below show.
 */
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Choose the value added to the 7 rounding bits so that the final
     * truncating shift implements the selected mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Bit 7 becomes the result LSB: only round up when it is clear. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /*
     * The unsigned 16-bit view turns a negative zExp into a large value,
     * so this single test selects both the overflow and the underflow
     * candidates.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0xFD with the rounding
         * increment carrying into bit 31 of the significand.
         */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * The fraction argument below is 0 when overflow_to_inf is
             * true and -1 otherwise.
             * NOTE(review): packFloat32 is not visible here; presumably it
             * adds its fields like packFloat64, so -1 produces the largest
             * finite value -- confirm.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, if the
             * exponent is below -1, or if adding the rounding increment
             * does not carry into bit 31.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, then recompute the rounding bits. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            /* Underflow is only signalled when the result is also inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the LSB so the result is even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    /* A significand that rounded to zero is packed with a zero exponent. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
4007
4008 /*----------------------------------------------------------------------------
4009 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4010 | and significand `zSig', and returns the proper single-precision floating-
4011 | point value corresponding to the abstract input.  This routine is just like
4012 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4013 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4014 | floating-point exponent.
4015 *----------------------------------------------------------------------------*/
4016
4017 static float32
4018  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4019                               float_status *status)
4020 {
4021     int8_t shiftCount;
4022
4023     shiftCount = clz32(zSig) - 1;
4024     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4025                                status);
4026
4027 }
4028
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig': it is shifted
| left by (clz64(aSig) - 11) bits.  The shifted significand is stored through
| `zSigPtr' and the matching exponent, 1 minus the shift count, through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4046
4047 /*----------------------------------------------------------------------------
4048 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4049 | double-precision floating-point value, returning the result.  After being
4050 | shifted into the proper positions, the three fields are simply added
4051 | together to form the result.  This means that any integer portion of `zSig'
4052 | will be added into the exponent.  Since a properly normalized significand
4053 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4054 | than the desired result exponent whenever `zSig' is a complete, normalized
4055 | significand.
4056 *----------------------------------------------------------------------------*/
4057
4058 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4059 {
4060
4061     return make_float64(
4062         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4063
4064 }
4065
4066 /*----------------------------------------------------------------------------
4067 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4068 | and significand `zSig', and returns the proper double-precision floating-
4069 | point value corresponding to the abstract input.  Ordinarily, the abstract
4070 | value is simply rounded and packed into the double-precision format, with
4071 | the inexact exception raised if the abstract input cannot be represented
4072 | exactly.  However, if the abstract value is too large, the overflow and
4073 | inexact exceptions are raised and an infinity or maximal finite value is
4074 | returned.  If the abstract value is too small, the input value is rounded to
4075 | a subnormal number, and the underflow and inexact exceptions are raised if
4076 | the abstract input cannot be represented exactly as a subnormal double-
4077 | precision floating-point number.
4078 |     The input significand `zSig' has its binary point between bits 62
4079 | and 61, which is 10 bits to the left of the usual location.  This shifted
4080 | significand must be normalized or smaller.  If `zSig' is not normalized,
4081 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4082 | and it must not require rounding.  In the usual case that `zSig' is
4083 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4084 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4085 | Binary Floating-Point Arithmetic.
4086 *----------------------------------------------------------------------------*/
4087
/*
 * Round the abstract value (zSign, zExp, zSig) according to
 * status->float_rounding_mode and pack it as a float64.  The low 10 bits of
 * zSig are the bits to be rounded away.  Raises overflow, underflow,
 * inexact and output-denormal flags as the individual paths below show.
 */
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Choose the value added to the 10 rounding bits so that the final
     * truncating shift implements the selected mode.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 becomes the result LSB: only round up when it is clear. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * The unsigned 16-bit view turns a negative zExp into a large value,
     * so this single test selects both the overflow and the underflow
     * candidates.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FD with the rounding
         * increment carrying into bit 63 of the significand.
         */
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * packFloat64 adds its fields, so a fraction argument of 0
             * gives exponent 0x7FF with a zero fraction (infinity), while
             * -1 borrows from the exponent field and gives exponent 0x7FE
             * with an all-ones fraction (largest finite value).
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, if the
             * exponent is below -1, or if adding the rounding increment
             * does not carry into bit 63.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize, then recompute the rounding bits. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            /* Underflow is only signalled when the result is also inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie under nearest-even: clear the LSB so the result is even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    /* A significand that rounded to zero is packed with a zero exponent. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
4163
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and significand `zSig', and returns the proper double-precision floating-
4167 | point value corresponding to the abstract input.  This routine is just like
4168 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4169 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4170 | floating-point exponent.
4171 *----------------------------------------------------------------------------*/
4172
4173 static float64
4174  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4175                               float_status *status)
4176 {
4177     int8_t shiftCount;
4178
4179     shiftCount = clz64(zSig) - 1;
4180     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4181                                status);
4182
4183 }
4184
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig': it
| is shifted left by clz64(aSig) bits.  The shifted significand is stored
| through `zSigPtr' and the matching exponent, 1 minus the shift count,
| through `zExpPtr'.
|     NOTE(review): `aSig' is presumably required to be non-zero, since the
| shift count comes straight from clz64() -- confirm against callers.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4201
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4205 | and returns the proper extended double-precision floating-point value
4206 | corresponding to the abstract input.  Ordinarily, the abstract value is
4207 | rounded and packed into the extended double-precision format, with the
4208 | inexact exception raised if the abstract input cannot be represented
4209 | exactly.  However, if the abstract value is too large, the overflow and
4210 | inexact exceptions are raised and an infinity or maximal finite value is
4211 | returned.  If the abstract value is too small, the input value is rounded to
4212 | a subnormal number, and the underflow and inexact exceptions are raised if
4213 | the abstract input cannot be represented exactly as a subnormal extended
4214 | double-precision floating-point number.
4215 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4216 | number of bits as single or double precision, respectively.  Otherwise, the
4217 | result is rounded to the full precision of the extended double-precision
4218 | format.
4219 |     The input significand must be normalized or smaller.  If the input
4220 | significand is not normalized, `zExp' must be 0; in that case, the result
4221 | returned is a subnormal number, and it must not require rounding.  The
4222 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4223 | Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4225
4226 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4227                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4228                               float_status *status)
4229 {
4230     int8_t roundingMode;
4231     bool roundNearestEven, increment, isTiny;
4232     int64_t roundIncrement, roundMask, roundBits;
4233
4234     roundingMode = status->float_rounding_mode;
4235     roundNearestEven = ( roundingMode == float_round_nearest_even );
4236     if ( roundingPrecision == 80 ) goto precision80;
4237     if ( roundingPrecision == 64 ) {
4238         roundIncrement = UINT64_C(0x0000000000000400);
4239         roundMask = UINT64_C(0x00000000000007FF);
4240     }
4241     else if ( roundingPrecision == 32 ) {
4242         roundIncrement = UINT64_C(0x0000008000000000);
4243         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4244     }
4245     else {
4246         goto precision80;
4247     }
4248     zSig0 |= ( zSig1 != 0 );
4249     switch (roundingMode) {
4250     case float_round_nearest_even:
4251     case float_round_ties_away:
4252         break;
4253     case float_round_to_zero:
4254         roundIncrement = 0;
4255         break;
4256     case float_round_up:
4257         roundIncrement = zSign ? 0 : roundMask;
4258         break;
4259     case float_round_down:
4260         roundIncrement = zSign ? roundMask : 0;
4261         break;
4262     default:
4263         abort();
4264     }
4265     roundBits = zSig0 & roundMask;
4266     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4267         if (    ( 0x7FFE < zExp )
4268              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4269            ) {
4270             goto overflow;
4271         }
4272         if ( zExp <= 0 ) {
4273             if (status->flush_to_zero) {
4274                 float_raise(float_flag_output_denormal, status);
4275                 return packFloatx80(zSign, 0, 0);
4276             }
4277             isTiny = status->tininess_before_rounding
4278                   || (zExp < 0 )
4279                   || (zSig0 <= zSig0 + roundIncrement);
4280             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4281             zExp = 0;
4282             roundBits = zSig0 & roundMask;
4283             if (isTiny && roundBits) {
4284                 float_raise(float_flag_underflow, status);
4285             }
4286             if (roundBits) {
4287                 status->float_exception_flags |= float_flag_inexact;
4288             }
4289             zSig0 += roundIncrement;
4290             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4291             roundIncrement = roundMask + 1;
4292             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4293                 roundMask |= roundIncrement;
4294             }
4295             zSig0 &= ~ roundMask;
4296             return packFloatx80( zSign, zExp, zSig0 );
4297         }
4298     }
4299     if (roundBits) {
4300         status->float_exception_flags |= float_flag_inexact;
4301     }
4302     zSig0 += roundIncrement;
4303     if ( zSig0 < roundIncrement ) {
4304         ++zExp;
4305         zSig0 = UINT64_C(0x8000000000000000);
4306     }
4307     roundIncrement = roundMask + 1;
4308     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4309         roundMask |= roundIncrement;
4310     }
4311     zSig0 &= ~ roundMask;
4312     if ( zSig0 == 0 ) zExp = 0;
4313     return packFloatx80( zSign, zExp, zSig0 );
4314  precision80:
4315     switch (roundingMode) {
4316     case float_round_nearest_even:
4317     case float_round_ties_away:
4318         increment = ((int64_t)zSig1 < 0);
4319         break;
4320     case float_round_to_zero:
4321         increment = 0;
4322         break;
4323     case float_round_up:
4324         increment = !zSign && zSig1;
4325         break;
4326     case float_round_down:
4327         increment = zSign && zSig1;
4328         break;
4329     default:
4330         abort();
4331     }
4332     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4333         if (    ( 0x7FFE < zExp )
4334              || (    ( zExp == 0x7FFE )
4335                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4336                   && increment
4337                 )
4338            ) {
4339             roundMask = 0;
4340  overflow:
4341             float_raise(float_flag_overflow | float_flag_inexact, status);
4342             if (    ( roundingMode == float_round_to_zero )
4343                  || ( zSign && ( roundingMode == float_round_up ) )
4344                  || ( ! zSign && ( roundingMode == float_round_down ) )
4345                ) {
4346                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4347             }
4348             return packFloatx80(zSign,
4349                                 floatx80_infinity_high,
4350                                 floatx80_infinity_low);
4351         }
4352         if ( zExp <= 0 ) {
4353             isTiny = status->tininess_before_rounding
4354                   || (zExp < 0)
4355                   || !increment
4356                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4357             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4358             zExp = 0;
4359             if (isTiny && zSig1) {
4360                 float_raise(float_flag_underflow, status);
4361             }
4362             if (zSig1) {
4363                 status->float_exception_flags |= float_flag_inexact;
4364             }
4365             switch (roundingMode) {
4366             case float_round_nearest_even:
4367             case float_round_ties_away:
4368                 increment = ((int64_t)zSig1 < 0);
4369                 break;
4370             case float_round_to_zero:
4371                 increment = 0;
4372                 break;
4373             case float_round_up:
4374                 increment = !zSign && zSig1;
4375                 break;
4376             case float_round_down:
4377                 increment = zSign && zSig1;
4378                 break;
4379             default:
4380                 abort();
4381             }
4382             if ( increment ) {
4383                 ++zSig0;
4384                 if (!(zSig1 << 1) && roundNearestEven) {
4385                     zSig0 &= ~1;
4386                 }
4387                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4388             }
4389             return packFloatx80( zSign, zExp, zSig0 );
4390         }
4391     }
4392     if (zSig1) {
4393         status->float_exception_flags |= float_flag_inexact;
4394     }
4395     if ( increment ) {
4396         ++zSig0;
4397         if ( zSig0 == 0 ) {
4398             ++zExp;
4399             zSig0 = UINT64_C(0x8000000000000000);
4400         }
4401         else {
4402             if (!(zSig1 << 1) && roundNearestEven) {
4403                 zSig0 &= ~1;
4404             }
4405         }
4406     }
4407     else {
4408         if ( zSig0 == 0 ) zExp = 0;
4409     }
4410     return packFloatx80( zSign, zExp, zSig0 );
4411
4412 }
4413
4414 /*----------------------------------------------------------------------------
4415 | Takes an abstract floating-point value having sign `zSign', exponent
4416 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4417 | and returns the proper extended double-precision floating-point value
4418 | corresponding to the abstract input.  This routine is just like
4419 | `roundAndPackFloatx80' except that the input significand does not have to be
4420 | normalized.
4421 *----------------------------------------------------------------------------*/
4422
4423 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4424                                        bool zSign, int32_t zExp,
4425                                        uint64_t zSig0, uint64_t zSig1,
4426                                        float_status *status)
4427 {
4428     int8_t shiftCount;
4429
4430     if ( zSig0 == 0 ) {
4431         zSig0 = zSig1;
4432         zSig1 = 0;
4433         zExp -= 64;
4434     }
4435     shiftCount = clz64(zSig0);
4436     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4437     zExp -= shiftCount;
4438     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4439                                 zSig0, zSig1, status);
4440
4441 }
4442
4443 /*----------------------------------------------------------------------------
4444 | Returns the least-significant 64 fraction bits of the quadruple-precision
4445 | floating-point value `a'.
4446 *----------------------------------------------------------------------------*/
4447
4448 static inline uint64_t extractFloat128Frac1( float128 a )
4449 {
4450
4451     return a.low;
4452
4453 }
4454
4455 /*----------------------------------------------------------------------------
4456 | Returns the most-significant 48 fraction bits of the quadruple-precision
4457 | floating-point value `a'.
4458 *----------------------------------------------------------------------------*/
4459
4460 static inline uint64_t extractFloat128Frac0( float128 a )
4461 {
4462
4463     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4464
4465 }
4466
4467 /*----------------------------------------------------------------------------
4468 | Returns the exponent bits of the quadruple-precision floating-point value
4469 | `a'.
4470 *----------------------------------------------------------------------------*/
4471
4472 static inline int32_t extractFloat128Exp( float128 a )
4473 {
4474
4475     return ( a.high>>48 ) & 0x7FFF;
4476
4477 }
4478
4479 /*----------------------------------------------------------------------------
4480 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4481 *----------------------------------------------------------------------------*/
4482
4483 static inline bool extractFloat128Sign(float128 a)
4484 {
4485     return a.high >> 63;
4486 }
4487
4488 /*----------------------------------------------------------------------------
4489 | Normalizes the subnormal quadruple-precision floating-point value
4490 | represented by the denormalized significand formed by the concatenation of
4491 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4492 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4493 | significand are stored at the location pointed to by `zSig0Ptr', and the
4494 | least significant 64 bits of the normalized significand are stored at the
4495 | location pointed to by `zSig1Ptr'.
4496 *----------------------------------------------------------------------------*/
4497
static void
normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1, int32_t *zExpPtr,
                           uint64_t *zSig0Ptr, uint64_t *zSig1Ptr)
{
    int8_t shift;

    /* Upper word populated: one 128-bit left shift does the whole job. */
    if (aSig0 != 0) {
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /*
     * Upper word empty: the lower word supplies every significant bit.
     * A negative shift means the lower word has fewer than 15 leading
     * zeros, so it is split across both output words.
     */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4528
4529 /*----------------------------------------------------------------------------
4530 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4531 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4532 | floating-point value, returning the result.  After being shifted into the
4533 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4534 | added together to form the most significant 64 bits of the result.  This
4535 | means that any integer portion of `zSig0' will be added into the exponent.
4536 | Since a properly normalized significand will have an integer portion equal
4537 | to 1, the `zExp' input should be 1 less than the desired result exponent
4538 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4539 | significand.
4540 *----------------------------------------------------------------------------*/
4541
4542 static inline float128
4543 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4544 {
4545     float128 z;
4546
4547     z.low = zSig1;
4548     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4549     return z;
4550 }
4551
4552 /*----------------------------------------------------------------------------
4553 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4554 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4555 | and `zSig2', and returns the proper quadruple-precision floating-point value
4556 | corresponding to the abstract input.  Ordinarily, the abstract value is
4557 | simply rounded and packed into the quadruple-precision format, with the
4558 | inexact exception raised if the abstract input cannot be represented
4559 | exactly.  However, if the abstract value is too large, the overflow and
4560 | inexact exceptions are raised and an infinity or maximal finite value is
4561 | returned.  If the abstract value is too small, the input value is rounded to
4562 | a subnormal number, and the underflow and inexact exceptions are raised if
4563 | the abstract input cannot be represented exactly as a subnormal quadruple-
4564 | precision floating-point number.
4565 |     The input significand must be normalized or smaller.  If the input
4566 | significand is not normalized, `zExp' must be 0; in that case, the result
4567 | returned is a subnormal number, and it must not require rounding.  In the
4568 | usual case that the input significand is normalized, `zExp' must be 1 less
4569 | than the ``true'' floating-point exponent.  The handling of underflow and
4570 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572
4573 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4574                                      uint64_t zSig0, uint64_t zSig1,
4575                                      uint64_t zSig2, float_status *status)
4576 {
4577     int8_t roundingMode;
4578     bool roundNearestEven, increment, isTiny;
4579
4580     roundingMode = status->float_rounding_mode;
4581     roundNearestEven = ( roundingMode == float_round_nearest_even );
4582     switch (roundingMode) {
4583     case float_round_nearest_even:
4584     case float_round_ties_away:
4585         increment = ((int64_t)zSig2 < 0);
4586         break;
4587     case float_round_to_zero:
4588         increment = 0;
4589         break;
4590     case float_round_up:
4591         increment = !zSign && zSig2;
4592         break;
4593     case float_round_down:
4594         increment = zSign && zSig2;
4595         break;
4596     case float_round_to_odd:
4597         increment = !(zSig1 & 0x1) && zSig2;
4598         break;
4599     default:
4600         abort();
4601     }
4602     if ( 0x7FFD <= (uint32_t) zExp ) {
4603         if (    ( 0x7FFD < zExp )
4604              || (    ( zExp == 0x7FFD )
4605                   && eq128(
4606                          UINT64_C(0x0001FFFFFFFFFFFF),
4607                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4608                          zSig0,
4609                          zSig1
4610                      )
4611                   && increment
4612                 )
4613            ) {
4614             float_raise(float_flag_overflow | float_flag_inexact, status);
4615             if (    ( roundingMode == float_round_to_zero )
4616                  || ( zSign && ( roundingMode == float_round_up ) )
4617                  || ( ! zSign && ( roundingMode == float_round_down ) )
4618                  || (roundingMode == float_round_to_odd)
4619                ) {
4620                 return
4621                     packFloat128(
4622                         zSign,
4623                         0x7FFE,
4624                         UINT64_C(0x0000FFFFFFFFFFFF),
4625                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4626                     );
4627             }
4628             return packFloat128( zSign, 0x7FFF, 0, 0 );
4629         }
4630         if ( zExp < 0 ) {
4631             if (status->flush_to_zero) {
4632                 float_raise(float_flag_output_denormal, status);
4633                 return packFloat128(zSign, 0, 0, 0);
4634             }
4635             isTiny = status->tininess_before_rounding
4636                   || (zExp < -1)
4637                   || !increment
4638                   || lt128(zSig0, zSig1,
4639                            UINT64_C(0x0001FFFFFFFFFFFF),
4640                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4641             shift128ExtraRightJamming(
4642                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4643             zExp = 0;
4644             if (isTiny && zSig2) {
4645                 float_raise(float_flag_underflow, status);
4646             }
4647             switch (roundingMode) {
4648             case float_round_nearest_even:
4649             case float_round_ties_away:
4650                 increment = ((int64_t)zSig2 < 0);
4651                 break;
4652             case float_round_to_zero:
4653                 increment = 0;
4654                 break;
4655             case float_round_up:
4656                 increment = !zSign && zSig2;
4657                 break;
4658             case float_round_down:
4659                 increment = zSign && zSig2;
4660                 break;
4661             case float_round_to_odd:
4662                 increment = !(zSig1 & 0x1) && zSig2;
4663                 break;
4664             default:
4665                 abort();
4666             }
4667         }
4668     }
4669     if (zSig2) {
4670         status->float_exception_flags |= float_flag_inexact;
4671     }
4672     if ( increment ) {
4673         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4674         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4675             zSig1 &= ~1;
4676         }
4677     }
4678     else {
4679         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4680     }
4681     return packFloat128( zSign, zExp, zSig0, zSig1 );
4682
4683 }
4684
4685 /*----------------------------------------------------------------------------
4686 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4687 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4688 | returns the proper quadruple-precision floating-point value corresponding
4689 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4690 | except that the input significand has fewer bits and does not have to be
4691 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4692 | point exponent.
4693 *----------------------------------------------------------------------------*/
4694
4695 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4696                                               uint64_t zSig0, uint64_t zSig1,
4697                                               float_status *status)
4698 {
4699     int8_t shiftCount;
4700     uint64_t zSig2;
4701
4702     if ( zSig0 == 0 ) {
4703         zSig0 = zSig1;
4704         zSig1 = 0;
4705         zExp -= 64;
4706     }
4707     shiftCount = clz64(zSig0) - 15;
4708     if ( 0 <= shiftCount ) {
4709         zSig2 = 0;
4710         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4711     }
4712     else {
4713         shift128ExtraRightJamming(
4714             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4715     }
4716     zExp -= shiftCount;
4717     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4718
4719 }
4720
4721
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the 32-bit two's complement integer `a'
4724 | to the extended double-precision floating-point format.  The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4726 | Arithmetic.
4727 *----------------------------------------------------------------------------*/
4728
4729 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4730 {
4731     bool zSign;
4732     uint32_t absA;
4733     int8_t shiftCount;
4734     uint64_t zSig;
4735
4736     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4737     zSign = ( a < 0 );
4738     absA = zSign ? - a : a;
4739     shiftCount = clz32(absA) + 32;
4740     zSig = absA;
4741     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4742
4743 }
4744
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of converting the 32-bit two's complement integer `a' to
4747 | the quadruple-precision floating-point format.  The conversion is performed
4748 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4750
4751 float128 int32_to_float128(int32_t a, float_status *status)
4752 {
4753     bool zSign;
4754     uint32_t absA;
4755     int8_t shiftCount;
4756     uint64_t zSig0;
4757
4758     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4759     zSign = ( a < 0 );
4760     absA = zSign ? - a : a;
4761     shiftCount = clz32(absA) + 17;
4762     zSig0 = absA;
4763     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4764
4765 }
4766
4767 /*----------------------------------------------------------------------------
4768 | Returns the result of converting the 64-bit two's complement integer `a'
4769 | to the extended double-precision floating-point format.  The conversion
4770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4771 | Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773
4774 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4775 {
4776     bool zSign;
4777     uint64_t absA;
4778     int8_t shiftCount;
4779
4780     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4781     zSign = ( a < 0 );
4782     absA = zSign ? - a : a;
4783     shiftCount = clz64(absA);
4784     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4785
4786 }
4787
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the 64-bit two's complement integer `a' to
4790 | the quadruple-precision floating-point format.  The conversion is performed
4791 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4793
4794 float128 int64_to_float128(int64_t a, float_status *status)
4795 {
4796     bool zSign;
4797     uint64_t absA;
4798     int8_t shiftCount;
4799     int32_t zExp;
4800     uint64_t zSig0, zSig1;
4801
4802     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4803     zSign = ( a < 0 );
4804     absA = zSign ? - a : a;
4805     shiftCount = clz64(absA) + 49;
4806     zExp = 0x406E - shiftCount;
4807     if ( 64 <= shiftCount ) {
4808         zSig1 = 0;
4809         zSig0 = absA;
4810         shiftCount -= 64;
4811     }
4812     else {
4813         zSig1 = absA;
4814         zSig0 = 0;
4815     }
4816     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4817     return packFloat128( zSign, zExp, zSig0, zSig1 );
4818
4819 }
4820
4821 /*----------------------------------------------------------------------------
4822 | Returns the result of converting the 64-bit unsigned integer `a'
4823 | to the quadruple-precision floating-point format.  The conversion is performed
4824 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826
4827 float128 uint64_to_float128(uint64_t a, float_status *status)
4828 {
4829     if (a == 0) {
4830         return float128_zero;
4831     }
4832     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4833 }
4834
4835 /*----------------------------------------------------------------------------
4836 | Returns the result of converting the single-precision floating-point value
4837 | `a' to the extended double-precision floating-point format.  The conversion
4838 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4839 | Arithmetic.
4840 *----------------------------------------------------------------------------*/
4841
4842 floatx80 float32_to_floatx80(float32 a, float_status *status)
4843 {
4844     bool aSign;
4845     int aExp;
4846     uint32_t aSig;
4847
4848     a = float32_squash_input_denormal(a, status);
4849     aSig = extractFloat32Frac( a );
4850     aExp = extractFloat32Exp( a );
4851     aSign = extractFloat32Sign( a );
4852     if ( aExp == 0xFF ) {
4853         if (aSig) {
4854             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4855                                                status);
4856             return floatx80_silence_nan(res, status);
4857         }
4858         return packFloatx80(aSign,
4859                             floatx80_infinity_high,
4860                             floatx80_infinity_low);
4861     }
4862     if ( aExp == 0 ) {
4863         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4864         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4865     }
4866     aSig |= 0x00800000;
4867     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4868
4869 }
4870
4871 /*----------------------------------------------------------------------------
4872 | Returns the result of converting the single-precision floating-point value
4873 | `a' to the quadruple-precision floating-point format.  The conversion is
4874 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4875 | Arithmetic.
4876 *----------------------------------------------------------------------------*/
4877
4878 float128 float32_to_float128(float32 a, float_status *status)
4879 {
4880     bool aSign;
4881     int aExp;
4882     uint32_t aSig;
4883
4884     a = float32_squash_input_denormal(a, status);
4885     aSig = extractFloat32Frac( a );
4886     aExp = extractFloat32Exp( a );
4887     aSign = extractFloat32Sign( a );
4888     if ( aExp == 0xFF ) {
4889         if (aSig) {
4890             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4891         }
4892         return packFloat128( aSign, 0x7FFF, 0, 0 );
4893     }
4894     if ( aExp == 0 ) {
4895         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4896         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4897         --aExp;
4898     }
4899     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4900
4901 }
4902
4903 /*----------------------------------------------------------------------------
4904 | Returns the remainder of the single-precision floating-point value `a'
4905 | with respect to the corresponding value `b'.  The operation is performed
4906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4907 *----------------------------------------------------------------------------*/
4908
4909 float32 float32_rem(float32 a, float32 b, float_status *status)
4910 {
4911     bool aSign, zSign;
4912     int aExp, bExp, expDiff;
4913     uint32_t aSig, bSig;
4914     uint32_t q;
4915     uint64_t aSig64, bSig64, q64;
4916     uint32_t alternateASig;
4917     int32_t sigMean;
4918     a = float32_squash_input_denormal(a, status);
4919     b = float32_squash_input_denormal(b, status);
4920
4921     aSig = extractFloat32Frac( a );
4922     aExp = extractFloat32Exp( a );
4923     aSign = extractFloat32Sign( a );
4924     bSig = extractFloat32Frac( b );
4925     bExp = extractFloat32Exp( b );
4926     if ( aExp == 0xFF ) {
4927         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4928             return propagateFloat32NaN(a, b, status);
4929         }
4930         float_raise(float_flag_invalid, status);
4931         return float32_default_nan(status);
4932     }
4933     if ( bExp == 0xFF ) {
4934         if (bSig) {
4935             return propagateFloat32NaN(a, b, status);
4936         }
4937         return a;
4938     }
4939     if ( bExp == 0 ) {
4940         if ( bSig == 0 ) {
4941             float_raise(float_flag_invalid, status);
4942             return float32_default_nan(status);
4943         }
4944         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4945     }
4946     if ( aExp == 0 ) {
4947         if ( aSig == 0 ) return a;
4948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4949     }
4950     expDiff = aExp - bExp;
4951     aSig |= 0x00800000;
4952     bSig |= 0x00800000;
4953     if ( expDiff < 32 ) {
4954         aSig <<= 8;
4955         bSig <<= 8;
4956         if ( expDiff < 0 ) {
4957             if ( expDiff < -1 ) return a;
4958             aSig >>= 1;
4959         }
4960         q = ( bSig <= aSig );
4961         if ( q ) aSig -= bSig;
4962         if ( 0 < expDiff ) {
4963             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4964             q >>= 32 - expDiff;
4965             bSig >>= 2;
4966             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4967         }
4968         else {
4969             aSig >>= 2;
4970             bSig >>= 2;
4971         }
4972     }
4973     else {
4974         if ( bSig <= aSig ) aSig -= bSig;
4975         aSig64 = ( (uint64_t) aSig )<<40;
4976         bSig64 = ( (uint64_t) bSig )<<40;
4977         expDiff -= 64;
4978         while ( 0 < expDiff ) {
4979             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4980             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4981             aSig64 = - ( ( bSig * q64 )<<38 );
4982             expDiff -= 62;
4983         }
4984         expDiff += 64;
4985         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4986         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4987         q = q64>>( 64 - expDiff );
4988         bSig <<= 6;
4989         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4990     }
4991     do {
4992         alternateASig = aSig;
4993         ++q;
4994         aSig -= bSig;
4995     } while ( 0 <= (int32_t) aSig );
4996     sigMean = aSig + alternateASig;
4997     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4998         aSig = alternateASig;
4999     }
5000     zSign = ( (int32_t) aSig < 0 );
5001     if ( zSign ) aSig = - aSig;
5002     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5003 }
5004
5005
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns the binary exponential of the single-precision floating-point value
5009 | `a'. The operation is performed according to the IEC/IEEE Standard for
5010 | Binary Floating-Point Arithmetic.
5011 |
5012 | Uses the following identities:
5013 |
5014 | 1. -------------------------------------------------------------------------
5015 |      x    x*ln(2)
5016 |     2  = e
5017 |
5018 | 2. -------------------------------------------------------------------------
5019 |                      2     3     4     5           n
5020 |      x        x     x     x     x     x           x
5021 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5022 |               1!    2!    3!    4!    5!          n!
5023 *----------------------------------------------------------------------------*/
5024
5025 static const float64 float32_exp2_coefficients[15] =
5026 {
5027     const_float64( 0x3ff0000000000000ll ), /*  1 */
5028     const_float64( 0x3fe0000000000000ll ), /*  2 */
5029     const_float64( 0x3fc5555555555555ll ), /*  3 */
5030     const_float64( 0x3fa5555555555555ll ), /*  4 */
5031     const_float64( 0x3f81111111111111ll ), /*  5 */
5032     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5033     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5034     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5035     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5036     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5037     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5038     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5039     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5040     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5041     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5042 };
5043
5044 float32 float32_exp2(float32 a, float_status *status)
5045 {
5046     bool aSign;
5047     int aExp;
5048     uint32_t aSig;
5049     float64 r, x, xn;
5050     int i;
5051     a = float32_squash_input_denormal(a, status);
5052
5053     aSig = extractFloat32Frac( a );
5054     aExp = extractFloat32Exp( a );
5055     aSign = extractFloat32Sign( a );
5056
5057     if ( aExp == 0xFF) {
5058         if (aSig) {
5059             return propagateFloat32NaN(a, float32_zero, status);
5060         }
5061         return (aSign) ? float32_zero : a;
5062     }
5063     if (aExp == 0) {
5064         if (aSig == 0) return float32_one;
5065     }
5066
5067     float_raise(float_flag_inexact, status);
5068
5069     /* ******************************* */
5070     /* using float64 for approximation */
5071     /* ******************************* */
5072     x = float32_to_float64(a, status);
5073     x = float64_mul(x, float64_ln2, status);
5074
5075     xn = x;
5076     r = float64_one;
5077     for (i = 0 ; i < 15 ; i++) {
5078         float64 f;
5079
5080         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5081         r = float64_add(r, f, status);
5082
5083         xn = float64_mul(xn, x, status);
5084     }
5085
5086     return float64_to_float32(r, status);
5087 }
5088
5089 /*----------------------------------------------------------------------------
5090 | Returns the binary log of the single-precision floating-point value `a'.
5091 | The operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094 float32 float32_log2(float32 a, float_status *status)
5095 {
5096     bool aSign, zSign;
5097     int aExp;
5098     uint32_t aSig, zSig, i;
5099
5100     a = float32_squash_input_denormal(a, status);
5101     aSig = extractFloat32Frac( a );
5102     aExp = extractFloat32Exp( a );
5103     aSign = extractFloat32Sign( a );
5104
5105     if ( aExp == 0 ) {
5106         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5107         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5108     }
5109     if ( aSign ) {
5110         float_raise(float_flag_invalid, status);
5111         return float32_default_nan(status);
5112     }
5113     if ( aExp == 0xFF ) {
5114         if (aSig) {
5115             return propagateFloat32NaN(a, float32_zero, status);
5116         }
5117         return a;
5118     }
5119
5120     aExp -= 0x7F;
5121     aSig |= 0x00800000;
5122     zSign = aExp < 0;
5123     zSig = aExp << 23;
5124
5125     for (i = 1 << 22; i > 0; i >>= 1) {
5126         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5127         if ( aSig & 0x01000000 ) {
5128             aSig >>= 1;
5129             zSig |= i;
5130         }
5131     }
5132
5133     if ( zSign )
5134         zSig = -zSig;
5135
5136     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5137 }
5138
5139 /*----------------------------------------------------------------------------
5140 | Returns the result of converting the double-precision floating-point value
5141 | `a' to the extended double-precision floating-point format.  The conversion
5142 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5143 | Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145
5146 floatx80 float64_to_floatx80(float64 a, float_status *status)
5147 {
5148     bool aSign;
5149     int aExp;
5150     uint64_t aSig;
5151
5152     a = float64_squash_input_denormal(a, status);
5153     aSig = extractFloat64Frac( a );
5154     aExp = extractFloat64Exp( a );
5155     aSign = extractFloat64Sign( a );
5156     if ( aExp == 0x7FF ) {
5157         if (aSig) {
5158             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5159                                                status);
5160             return floatx80_silence_nan(res, status);
5161         }
5162         return packFloatx80(aSign,
5163                             floatx80_infinity_high,
5164                             floatx80_infinity_low);
5165     }
5166     if ( aExp == 0 ) {
5167         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5168         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5169     }
5170     return
5171         packFloatx80(
5172             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5173
5174 }
5175
5176 /*----------------------------------------------------------------------------
5177 | Returns the result of converting the double-precision floating-point value
5178 | `a' to the quadruple-precision floating-point format.  The conversion is
5179 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5180 | Arithmetic.
5181 *----------------------------------------------------------------------------*/
5182
5183 float128 float64_to_float128(float64 a, float_status *status)
5184 {
5185     bool aSign;
5186     int aExp;
5187     uint64_t aSig, zSig0, zSig1;
5188
5189     a = float64_squash_input_denormal(a, status);
5190     aSig = extractFloat64Frac( a );
5191     aExp = extractFloat64Exp( a );
5192     aSign = extractFloat64Sign( a );
5193     if ( aExp == 0x7FF ) {
5194         if (aSig) {
5195             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5196         }
5197         return packFloat128( aSign, 0x7FFF, 0, 0 );
5198     }
5199     if ( aExp == 0 ) {
5200         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5201         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5202         --aExp;
5203     }
5204     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5205     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5206
5207 }
5208
5209
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the main reduction:
|   - either operand is a NaN: the result comes from propagateFloat64NaN().
|   - `a' is infinite, or `b' is zero: the invalid flag is raised and the
|     default NaN is returned.
|   - `b' is infinite, or `a' is zero: `a' (after input-denormal squashing)
|     is returned.
| The sign of the result is the sign of `a', flipped when the reduced
| significand ends up negative.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is a NaN or an infinity (and `a' is finite). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    /* `b' is zero (invalid) or subnormal (normalize it). */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    /* `a' is zero (returned as is) or subnormal (normalize it). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Add the implicit leading bit and left-align both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* The exponent of `a' is at least two below that of `b', so `a' is
         * smaller in magnitude than half of `b' and is its own remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit: subtract bSig once if it fits. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce the exponent difference 62 bits at a time.  The estimate is
     * lowered by 2 (clamped at 0) before use; the new aSig relies on unsigned
     * wrap-around.
     * NOTE(review): presumably estimateDiv128To64() may over-estimate
     * slightly, hence the adjustment -- confirm against its definition. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: keep only the top expDiff quotient bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        /* Both paths leave aSig and bSig at the same scale (bSig >> 2). */
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Keep subtracting bSig until aSig goes negative; alternateASig holds the
     * last non-negative value and q counts the subtractions. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Choose between the negative candidate (aSig) and the non-negative one
     * (alternateASig): the one of smaller magnitude wins, and on a tie the
     * low bit of q decides. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* Convert the signed candidate into sign + magnitude. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5297
5298 /*----------------------------------------------------------------------------
5299 | Returns the binary log of the double-precision floating-point value `a'.
5300 | The operation is performed according to the IEC/IEEE Standard for Binary
5301 | Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 float64 float64_log2(float64 a, float_status *status)
5304 {
5305     bool aSign, zSign;
5306     int aExp;
5307     uint64_t aSig, aSig0, aSig1, zSig, i;
5308     a = float64_squash_input_denormal(a, status);
5309
5310     aSig = extractFloat64Frac( a );
5311     aExp = extractFloat64Exp( a );
5312     aSign = extractFloat64Sign( a );
5313
5314     if ( aExp == 0 ) {
5315         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5316         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5317     }
5318     if ( aSign ) {
5319         float_raise(float_flag_invalid, status);
5320         return float64_default_nan(status);
5321     }
5322     if ( aExp == 0x7FF ) {
5323         if (aSig) {
5324             return propagateFloat64NaN(a, float64_zero, status);
5325         }
5326         return a;
5327     }
5328
5329     aExp -= 0x3FF;
5330     aSig |= UINT64_C(0x0010000000000000);
5331     zSign = aExp < 0;
5332     zSig = (uint64_t)aExp << 52;
5333     for (i = 1LL << 51; i > 0; i >>= 1) {
5334         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5335         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5336         if ( aSig & UINT64_C(0x0020000000000000) ) {
5337             aSig >>= 1;
5338             zSig |= i;
5339         }
5340     }
5341
5342     if ( zSign )
5343         zSig = -zSig;
5344     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5345 }
5346
5347 /*----------------------------------------------------------------------------
5348 | Returns the result of converting the extended double-precision floating-
5349 | point value `a' to the 32-bit two's complement integer format.  The
5350 | conversion is performed according to the IEC/IEEE Standard for Binary
5351 | Floating-Point Arithmetic---which means in particular that the conversion
5352 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5353 | largest positive integer is returned.  Otherwise, if the conversion
5354 | overflows, the largest integer with the same sign as `a' is returned.
5355 *----------------------------------------------------------------------------*/
5356
5357 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5358 {
5359     bool aSign;
5360     int32_t aExp, shiftCount;
5361     uint64_t aSig;
5362
5363     if (floatx80_invalid_encoding(a)) {
5364         float_raise(float_flag_invalid, status);
5365         return 1 << 31;
5366     }
5367     aSig = extractFloatx80Frac( a );
5368     aExp = extractFloatx80Exp( a );
5369     aSign = extractFloatx80Sign( a );
5370     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5371     shiftCount = 0x4037 - aExp;
5372     if ( shiftCount <= 0 ) shiftCount = 1;
5373     shift64RightJamming( aSig, shiftCount, &aSig );
5374     return roundAndPackInt32(aSign, aSig, status);
5375
5376 }
5377
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format.  The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic, except that the conversion is always rounded
5383 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5384 | Otherwise, if the conversion overflows, the largest integer with the same
5385 | sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5387
5388 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5389 {
5390     bool aSign;
5391     int32_t aExp, shiftCount;
5392     uint64_t aSig, savedASig;
5393     int32_t z;
5394
5395     if (floatx80_invalid_encoding(a)) {
5396         float_raise(float_flag_invalid, status);
5397         return 1 << 31;
5398     }
5399     aSig = extractFloatx80Frac( a );
5400     aExp = extractFloatx80Exp( a );
5401     aSign = extractFloatx80Sign( a );
5402     if ( 0x401E < aExp ) {
5403         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5404         goto invalid;
5405     }
5406     else if ( aExp < 0x3FFF ) {
5407         if (aExp || aSig) {
5408             status->float_exception_flags |= float_flag_inexact;
5409         }
5410         return 0;
5411     }
5412     shiftCount = 0x403E - aExp;
5413     savedASig = aSig;
5414     aSig >>= shiftCount;
5415     z = aSig;
5416     if ( aSign ) z = - z;
5417     if ( ( z < 0 ) ^ aSign ) {
5418  invalid:
5419         float_raise(float_flag_invalid, status);
5420         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5421     }
5422     if ( ( aSig<<shiftCount ) != savedASig ) {
5423         status->float_exception_flags |= float_flag_inexact;
5424     }
5425     return z;
5426
5427 }
5428
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of converting the extended double-precision floating-
5431 | point value `a' to the 64-bit two's complement integer format.  The
5432 | conversion is performed according to the IEC/IEEE Standard for Binary
5433 | Floating-Point Arithmetic---which means in particular that the conversion
5434 | is rounded according to the current rounding mode.  If `a' is a NaN,
5435 | the largest positive integer is returned.  Otherwise, if the conversion
5436 | overflows, the largest integer with the same sign as `a' is returned.
5437 *----------------------------------------------------------------------------*/
5438
5439 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5440 {
5441     bool aSign;
5442     int32_t aExp, shiftCount;
5443     uint64_t aSig, aSigExtra;
5444
5445     if (floatx80_invalid_encoding(a)) {
5446         float_raise(float_flag_invalid, status);
5447         return 1ULL << 63;
5448     }
5449     aSig = extractFloatx80Frac( a );
5450     aExp = extractFloatx80Exp( a );
5451     aSign = extractFloatx80Sign( a );
5452     shiftCount = 0x403E - aExp;
5453     if ( shiftCount <= 0 ) {
5454         if ( shiftCount ) {
5455             float_raise(float_flag_invalid, status);
5456             if (!aSign || floatx80_is_any_nan(a)) {
5457                 return INT64_MAX;
5458             }
5459             return INT64_MIN;
5460         }
5461         aSigExtra = 0;
5462     }
5463     else {
5464         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5465     }
5466     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5467
5468 }
5469
5470 /*----------------------------------------------------------------------------
5471 | Returns the result of converting the extended double-precision floating-
5472 | point value `a' to the 64-bit two's complement integer format.  The
5473 | conversion is performed according to the IEC/IEEE Standard for Binary
5474 | Floating-Point Arithmetic, except that the conversion is always rounded
5475 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5476 | Otherwise, if the conversion overflows, the largest integer with the same
5477 | sign as `a' is returned.
5478 *----------------------------------------------------------------------------*/
5479
5480 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5481 {
5482     bool aSign;
5483     int32_t aExp, shiftCount;
5484     uint64_t aSig;
5485     int64_t z;
5486
5487     if (floatx80_invalid_encoding(a)) {
5488         float_raise(float_flag_invalid, status);
5489         return 1ULL << 63;
5490     }
5491     aSig = extractFloatx80Frac( a );
5492     aExp = extractFloatx80Exp( a );
5493     aSign = extractFloatx80Sign( a );
5494     shiftCount = aExp - 0x403E;
5495     if ( 0 <= shiftCount ) {
5496         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5497         if ( ( a.high != 0xC03E ) || aSig ) {
5498             float_raise(float_flag_invalid, status);
5499             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5500                 return INT64_MAX;
5501             }
5502         }
5503         return INT64_MIN;
5504     }
5505     else if ( aExp < 0x3FFF ) {
5506         if (aExp | aSig) {
5507             status->float_exception_flags |= float_flag_inexact;
5508         }
5509         return 0;
5510     }
5511     z = aSig>>( - shiftCount );
5512     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5513         status->float_exception_flags |= float_flag_inexact;
5514     }
5515     if ( aSign ) z = - z;
5516     return z;
5517
5518 }
5519
5520 /*----------------------------------------------------------------------------
5521 | Returns the result of converting the extended double-precision floating-
5522 | point value `a' to the single-precision floating-point format.  The
5523 | conversion is performed according to the IEC/IEEE Standard for Binary
5524 | Floating-Point Arithmetic.
5525 *----------------------------------------------------------------------------*/
5526
5527 float32 floatx80_to_float32(floatx80 a, float_status *status)
5528 {
5529     bool aSign;
5530     int32_t aExp;
5531     uint64_t aSig;
5532
5533     if (floatx80_invalid_encoding(a)) {
5534         float_raise(float_flag_invalid, status);
5535         return float32_default_nan(status);
5536     }
5537     aSig = extractFloatx80Frac( a );
5538     aExp = extractFloatx80Exp( a );
5539     aSign = extractFloatx80Sign( a );
5540     if ( aExp == 0x7FFF ) {
5541         if ( (uint64_t) ( aSig<<1 ) ) {
5542             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5543                                              status);
5544             return float32_silence_nan(res, status);
5545         }
5546         return packFloat32( aSign, 0xFF, 0 );
5547     }
5548     shift64RightJamming( aSig, 33, &aSig );
5549     if ( aExp || aSig ) aExp -= 0x3F81;
5550     return roundAndPackFloat32(aSign, aExp, aSig, status);
5551
5552 }
5553
5554 /*----------------------------------------------------------------------------
5555 | Returns the result of converting the extended double-precision floating-
5556 | point value `a' to the double-precision floating-point format.  The
5557 | conversion is performed according to the IEC/IEEE Standard for Binary
5558 | Floating-Point Arithmetic.
5559 *----------------------------------------------------------------------------*/
5560
5561 float64 floatx80_to_float64(floatx80 a, float_status *status)
5562 {
5563     bool aSign;
5564     int32_t aExp;
5565     uint64_t aSig, zSig;
5566
5567     if (floatx80_invalid_encoding(a)) {
5568         float_raise(float_flag_invalid, status);
5569         return float64_default_nan(status);
5570     }
5571     aSig = extractFloatx80Frac( a );
5572     aExp = extractFloatx80Exp( a );
5573     aSign = extractFloatx80Sign( a );
5574     if ( aExp == 0x7FFF ) {
5575         if ( (uint64_t) ( aSig<<1 ) ) {
5576             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5577                                              status);
5578             return float64_silence_nan(res, status);
5579         }
5580         return packFloat64( aSign, 0x7FF, 0 );
5581     }
5582     shift64RightJamming( aSig, 1, &zSig );
5583     if ( aExp || aSig ) aExp -= 0x3C01;
5584     return roundAndPackFloat64(aSign, aExp, zSig, status);
5585
5586 }
5587
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the quadruple-precision floating-point format.  The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594
5595 float128 floatx80_to_float128(floatx80 a, float_status *status)
5596 {
5597     bool aSign;
5598     int aExp;
5599     uint64_t aSig, zSig0, zSig1;
5600
5601     if (floatx80_invalid_encoding(a)) {
5602         float_raise(float_flag_invalid, status);
5603         return float128_default_nan(status);
5604     }
5605     aSig = extractFloatx80Frac( a );
5606     aExp = extractFloatx80Exp( a );
5607     aSign = extractFloatx80Sign( a );
5608     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5609         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5610                                            status);
5611         return float128_silence_nan(res, status);
5612     }
5613     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5614     return packFloat128( aSign, aExp, zSig0, zSig1 );
5615
5616 }
5617
5618 /*----------------------------------------------------------------------------
5619 | Rounds the extended double-precision floating-point value `a'
5620 | to the precision provided by floatx80_rounding_precision and returns the
5621 | result as an extended double-precision floating-point value.
5622 | The operation is performed according to the IEC/IEEE Standard for Binary
5623 | Floating-Point Arithmetic.
5624 *----------------------------------------------------------------------------*/
5625
5626 floatx80 floatx80_round(floatx80 a, float_status *status)
5627 {
5628     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5629                                 extractFloatx80Sign(a),
5630                                 extractFloatx80Exp(a),
5631                                 extractFloatx80Frac(a), 0, status);
5632 }
5633
5634 /*----------------------------------------------------------------------------
5635 | Rounds the extended double-precision floating-point value `a' to an integer,
5636 | and returns the result as an extended quadruple-precision floating-point
5637 | value.  The operation is performed according to the IEC/IEEE Standard for
5638 | Binary Floating-Point Arithmetic.
5639 *----------------------------------------------------------------------------*/
5640
5641 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5642 {
5643     bool aSign;
5644     int32_t aExp;
5645     uint64_t lastBitMask, roundBitsMask;
5646     floatx80 z;
5647
5648     if (floatx80_invalid_encoding(a)) {
5649         float_raise(float_flag_invalid, status);
5650         return floatx80_default_nan(status);
5651     }
5652     aExp = extractFloatx80Exp( a );
5653     if ( 0x403E <= aExp ) {
5654         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5655             return propagateFloatx80NaN(a, a, status);
5656         }
5657         return a;
5658     }
5659     if ( aExp < 0x3FFF ) {
5660         if (    ( aExp == 0 )
5661              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5662             return a;
5663         }
5664         status->float_exception_flags |= float_flag_inexact;
5665         aSign = extractFloatx80Sign( a );
5666         switch (status->float_rounding_mode) {
5667          case float_round_nearest_even:
5668             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5669                ) {
5670                 return
5671                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5672             }
5673             break;
5674         case float_round_ties_away:
5675             if (aExp == 0x3FFE) {
5676                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5677             }
5678             break;
5679          case float_round_down:
5680             return
5681                   aSign ?
5682                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5683                 : packFloatx80( 0, 0, 0 );
5684          case float_round_up:
5685             return
5686                   aSign ? packFloatx80( 1, 0, 0 )
5687                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5688
5689         case float_round_to_zero:
5690             break;
5691         default:
5692             g_assert_not_reached();
5693         }
5694         return packFloatx80( aSign, 0, 0 );
5695     }
5696     lastBitMask = 1;
5697     lastBitMask <<= 0x403E - aExp;
5698     roundBitsMask = lastBitMask - 1;
5699     z = a;
5700     switch (status->float_rounding_mode) {
5701     case float_round_nearest_even:
5702         z.low += lastBitMask>>1;
5703         if ((z.low & roundBitsMask) == 0) {
5704             z.low &= ~lastBitMask;
5705         }
5706         break;
5707     case float_round_ties_away:
5708         z.low += lastBitMask >> 1;
5709         break;
5710     case float_round_to_zero:
5711         break;
5712     case float_round_up:
5713         if (!extractFloatx80Sign(z)) {
5714             z.low += roundBitsMask;
5715         }
5716         break;
5717     case float_round_down:
5718         if (extractFloatx80Sign(z)) {
5719             z.low += roundBitsMask;
5720         }
5721         break;
5722     default:
5723         abort();
5724     }
5725     z.low &= ~ roundBitsMask;
5726     if ( z.low == 0 ) {
5727         ++z.high;
5728         z.low = UINT64_C(0x8000000000000000);
5729     }
5730     if (z.low != a.low) {
5731         status->float_exception_flags |= float_flag_inexact;
5732     }
5733     return z;
5734
5735 }
5736
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right (with jamming) to
| line up with the other one; bits shifted out are kept in zSig1.  The final
| rounding uses status->floatx80_rounding_precision.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    /* Case 1: `a' has the larger exponent. */
    if ( 0 < expDiff ) {
        if ( aExp == 0x7FFF ) {
            /* `a' is a NaN (propagate) or an infinity (returned as is). */
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* A zero exponent field needs one shift fewer to align. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    /* Case 2: `b' has the larger exponent. */
    else if ( expDiff < 0 ) {
        if ( bExp == 0x7FFF ) {
            /* `b' is a NaN (propagate) or an infinity with sign zSign. */
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        /* A zero exponent field needs one shift fewer to align. */
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    /* Case 3: equal exponents, no alignment shift needed. */
    else {
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        /* Equal non-zero exponents: go straight to the one-bit renormalizing
         * shift. */
        zExp = aExp;
        goto shiftRight1;
    }
    /* Cases 1 and 2 meet here.  If the top bit of the sum is set there was
     * no carry out and the sum is used as is. */
    zSig0 = aSig + bSig;
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Shift the sum right by one, set the top bit and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
5816
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Subtracting two infinities raises the invalid flag and yields the default
| NaN.  An exact zero difference is returned as -0 when the rounding mode is
| float_round_down and as +0 otherwise.  When `b' has the larger magnitude
| the operands are swapped and `zSign' is inverted.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here down to the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Both operands are infinities. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    /* Both exponent fields are zero: use exponent 1 for the result. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: exact zero, negative only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* `b' is infinite: the result is an infinity of the opposite sign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* A zero exponent field needs one shift fewer to align. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* Compute b - a as a 128-bit difference and invert the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN (propagate) or an infinity (returned as is). */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    /* A zero exponent field needs one shift fewer to align. */
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* Compute a - b as a 128-bit difference; the result sign is zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5885
5886 /*----------------------------------------------------------------------------
5887 | Returns the result of adding the extended double-precision floating-point
5888 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5889 | Standard for Binary Floating-Point Arithmetic.
5890 *----------------------------------------------------------------------------*/
5891
5892 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5893 {
5894     bool aSign, bSign;
5895
5896     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5897         float_raise(float_flag_invalid, status);
5898         return floatx80_default_nan(status);
5899     }
5900     aSign = extractFloatx80Sign( a );
5901     bSign = extractFloatx80Sign( b );
5902     if ( aSign == bSign ) {
5903         return addFloatx80Sigs(a, b, aSign, status);
5904     }
5905     else {
5906         return subFloatx80Sigs(a, b, aSign, status);
5907     }
5908
5909 }
5910
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of subtracting the extended double-precision floating-
5913 | point values `a' and `b'.  The operation is performed according to the
5914 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5916
5917 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5918 {
5919     bool aSign, bSign;
5920
5921     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5922         float_raise(float_flag_invalid, status);
5923         return floatx80_default_nan(status);
5924     }
5925     aSign = extractFloatx80Sign( a );
5926     bSign = extractFloatx80Sign( b );
5927     if ( aSign == bSign ) {
5928         return subFloatx80Sigs(a, b, aSign, status);
5929     }
5930     else {
5931         return addFloatx80Sigs(a, b, aSign, status);
5932     }
5933
5934 }
5935
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of multiplying the extended double-precision floating-
5938 | point values `a' and `b'.  The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5941
5942 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5943 {
5944     bool aSign, bSign, zSign;
5945     int32_t aExp, bExp, zExp;
5946     uint64_t aSig, bSig, zSig0, zSig1;
5947
5948     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5949         float_raise(float_flag_invalid, status);
5950         return floatx80_default_nan(status);
5951     }
5952     aSig = extractFloatx80Frac( a );
5953     aExp = extractFloatx80Exp( a );
5954     aSign = extractFloatx80Sign( a );
5955     bSig = extractFloatx80Frac( b );
5956     bExp = extractFloatx80Exp( b );
5957     bSign = extractFloatx80Sign( b );
5958     zSign = aSign ^ bSign;
5959     if ( aExp == 0x7FFF ) {
5960         if (    (uint64_t) ( aSig<<1 )
5961              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5962             return propagateFloatx80NaN(a, b, status);
5963         }
5964         if ( ( bExp | bSig ) == 0 ) goto invalid;
5965         return packFloatx80(zSign, floatx80_infinity_high,
5966                                    floatx80_infinity_low);
5967     }
5968     if ( bExp == 0x7FFF ) {
5969         if ((uint64_t)(bSig << 1)) {
5970             return propagateFloatx80NaN(a, b, status);
5971         }
5972         if ( ( aExp | aSig ) == 0 ) {
5973  invalid:
5974             float_raise(float_flag_invalid, status);
5975             return floatx80_default_nan(status);
5976         }
5977         return packFloatx80(zSign, floatx80_infinity_high,
5978                                    floatx80_infinity_low);
5979     }
5980     if ( aExp == 0 ) {
5981         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5982         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5983     }
5984     if ( bExp == 0 ) {
5985         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5986         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5987     }
5988     zExp = aExp + bExp - 0x3FFE;
5989     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5990     if ( 0 < (int64_t) zSig0 ) {
5991         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5992         --zExp;
5993     }
5994     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5995                                 zSign, zExp, zSig0, zSig1, status);
5996 }
5997
5998 /*----------------------------------------------------------------------------
5999 | Returns the result of dividing the extended double-precision floating-point
6000 | value `a' by the corresponding value `b'.  The operation is performed
6001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6003
6004 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6005 {
6006     bool aSign, bSign, zSign;
6007     int32_t aExp, bExp, zExp;
6008     uint64_t aSig, bSig, zSig0, zSig1;
6009     uint64_t rem0, rem1, rem2, term0, term1, term2;
6010
6011     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6012         float_raise(float_flag_invalid, status);
6013         return floatx80_default_nan(status);
6014     }
6015     aSig = extractFloatx80Frac( a );
6016     aExp = extractFloatx80Exp( a );
6017     aSign = extractFloatx80Sign( a );
6018     bSig = extractFloatx80Frac( b );
6019     bExp = extractFloatx80Exp( b );
6020     bSign = extractFloatx80Sign( b );
6021     zSign = aSign ^ bSign;
6022     if ( aExp == 0x7FFF ) {
6023         if ((uint64_t)(aSig << 1)) {
6024             return propagateFloatx80NaN(a, b, status);
6025         }
6026         if ( bExp == 0x7FFF ) {
6027             if ((uint64_t)(bSig << 1)) {
6028                 return propagateFloatx80NaN(a, b, status);
6029             }
6030             goto invalid;
6031         }
6032         return packFloatx80(zSign, floatx80_infinity_high,
6033                                    floatx80_infinity_low);
6034     }
6035     if ( bExp == 0x7FFF ) {
6036         if ((uint64_t)(bSig << 1)) {
6037             return propagateFloatx80NaN(a, b, status);
6038         }
6039         return packFloatx80( zSign, 0, 0 );
6040     }
6041     if ( bExp == 0 ) {
6042         if ( bSig == 0 ) {
6043             if ( ( aExp | aSig ) == 0 ) {
6044  invalid:
6045                 float_raise(float_flag_invalid, status);
6046                 return floatx80_default_nan(status);
6047             }
6048             float_raise(float_flag_divbyzero, status);
6049             return packFloatx80(zSign, floatx80_infinity_high,
6050                                        floatx80_infinity_low);
6051         }
6052         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6053     }
6054     if ( aExp == 0 ) {
6055         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6056         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6057     }
6058     zExp = aExp - bExp + 0x3FFE;
6059     rem1 = 0;
6060     if ( bSig <= aSig ) {
6061         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6062         ++zExp;
6063     }
6064     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6065     mul64To128( bSig, zSig0, &term0, &term1 );
6066     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6067     while ( (int64_t) rem0 < 0 ) {
6068         --zSig0;
6069         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6070     }
6071     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6072     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6073         mul64To128( bSig, zSig1, &term1, &term2 );
6074         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6075         while ( (int64_t) rem1 < 0 ) {
6076             --zSig1;
6077             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6078         }
6079         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6080     }
6081     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6082                                 zSign, zExp, zSig0, zSig1, status);
6083 }
6084
6085 /*----------------------------------------------------------------------------
6086 | Returns the remainder of the extended double-precision floating-point value
6087 | `a' with respect to the corresponding value `b'.  The operation is performed
6088 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6089 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6090 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6091 | the absolute value of the integer quotient.
6092 *----------------------------------------------------------------------------*/
6093
6094 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6095                          float_status *status)
6096 {
6097     bool aSign, zSign;
6098     int32_t aExp, bExp, expDiff, aExpOrig;
6099     uint64_t aSig0, aSig1, bSig;
6100     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6101
6102     *quotient = 0;
6103     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6104         float_raise(float_flag_invalid, status);
6105         return floatx80_default_nan(status);
6106     }
6107     aSig0 = extractFloatx80Frac( a );
6108     aExpOrig = aExp = extractFloatx80Exp( a );
6109     aSign = extractFloatx80Sign( a );
6110     bSig = extractFloatx80Frac( b );
6111     bExp = extractFloatx80Exp( b );
6112     if ( aExp == 0x7FFF ) {
6113         if (    (uint64_t) ( aSig0<<1 )
6114              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6115             return propagateFloatx80NaN(a, b, status);
6116         }
6117         goto invalid;
6118     }
6119     if ( bExp == 0x7FFF ) {
6120         if ((uint64_t)(bSig << 1)) {
6121             return propagateFloatx80NaN(a, b, status);
6122         }
6123         if (aExp == 0 && aSig0 >> 63) {
6124             /*
6125              * Pseudo-denormal argument must be returned in normalized
6126              * form.
6127              */
6128             return packFloatx80(aSign, 1, aSig0);
6129         }
6130         return a;
6131     }
6132     if ( bExp == 0 ) {
6133         if ( bSig == 0 ) {
6134  invalid:
6135             float_raise(float_flag_invalid, status);
6136             return floatx80_default_nan(status);
6137         }
6138         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6139     }
6140     if ( aExp == 0 ) {
6141         if ( aSig0 == 0 ) return a;
6142         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6143     }
6144     zSign = aSign;
6145     expDiff = aExp - bExp;
6146     aSig1 = 0;
6147     if ( expDiff < 0 ) {
6148         if ( mod || expDiff < -1 ) {
6149             if (aExp == 1 && aExpOrig == 0) {
6150                 /*
6151                  * Pseudo-denormal argument must be returned in
6152                  * normalized form.
6153                  */
6154                 return packFloatx80(aSign, aExp, aSig0);
6155             }
6156             return a;
6157         }
6158         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6159         expDiff = 0;
6160     }
6161     *quotient = q = ( bSig <= aSig0 );
6162     if ( q ) aSig0 -= bSig;
6163     expDiff -= 64;
6164     while ( 0 < expDiff ) {
6165         q = estimateDiv128To64( aSig0, aSig1, bSig );
6166         q = ( 2 < q ) ? q - 2 : 0;
6167         mul64To128( bSig, q, &term0, &term1 );
6168         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6169         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6170         expDiff -= 62;
6171         *quotient <<= 62;
6172         *quotient += q;
6173     }
6174     expDiff += 64;
6175     if ( 0 < expDiff ) {
6176         q = estimateDiv128To64( aSig0, aSig1, bSig );
6177         q = ( 2 < q ) ? q - 2 : 0;
6178         q >>= 64 - expDiff;
6179         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6180         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6181         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6182         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6183             ++q;
6184             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6185         }
6186         if (expDiff < 64) {
6187             *quotient <<= expDiff;
6188         } else {
6189             *quotient = 0;
6190         }
6191         *quotient += q;
6192     }
6193     else {
6194         term1 = 0;
6195         term0 = bSig;
6196     }
6197     if (!mod) {
6198         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6199         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6200                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6201                         && ( q & 1 ) )
6202             ) {
6203             aSig0 = alternateASig0;
6204             aSig1 = alternateASig1;
6205             zSign = ! zSign;
6206             ++*quotient;
6207         }
6208     }
6209     return
6210         normalizeRoundAndPackFloatx80(
6211             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6212
6213 }
6214
6215 /*----------------------------------------------------------------------------
6216 | Returns the remainder of the extended double-precision floating-point value
6217 | `a' with respect to the corresponding value `b'.  The operation is performed
6218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6220
6221 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6222 {
6223     uint64_t quotient;
6224     return floatx80_modrem(a, b, false, &quotient, status);
6225 }
6226
6227 /*----------------------------------------------------------------------------
6228 | Returns the remainder of the extended double-precision floating-point value
6229 | `a' with respect to the corresponding value `b', with the quotient truncated
6230 | toward zero.
6231 *----------------------------------------------------------------------------*/
6232
6233 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6234 {
6235     uint64_t quotient;
6236     return floatx80_modrem(a, b, true, &quotient, status);
6237 }
6238
6239 /*----------------------------------------------------------------------------
6240 | Returns the square root of the extended double-precision floating-point
6241 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6242 | for Binary Floating-Point Arithmetic.
6243 *----------------------------------------------------------------------------*/
6244
6245 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6246 {
6247     bool aSign;
6248     int32_t aExp, zExp;
6249     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6250     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6251
6252     if (floatx80_invalid_encoding(a)) {
6253         float_raise(float_flag_invalid, status);
6254         return floatx80_default_nan(status);
6255     }
6256     aSig0 = extractFloatx80Frac( a );
6257     aExp = extractFloatx80Exp( a );
6258     aSign = extractFloatx80Sign( a );
6259     if ( aExp == 0x7FFF ) {
6260         if ((uint64_t)(aSig0 << 1)) {
6261             return propagateFloatx80NaN(a, a, status);
6262         }
6263         if ( ! aSign ) return a;
6264         goto invalid;
6265     }
6266     if ( aSign ) {
6267         if ( ( aExp | aSig0 ) == 0 ) return a;
6268  invalid:
6269         float_raise(float_flag_invalid, status);
6270         return floatx80_default_nan(status);
6271     }
6272     if ( aExp == 0 ) {
6273         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6274         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6275     }
6276     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6277     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6278     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6279     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6280     doubleZSig0 = zSig0<<1;
6281     mul64To128( zSig0, zSig0, &term0, &term1 );
6282     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6283     while ( (int64_t) rem0 < 0 ) {
6284         --zSig0;
6285         doubleZSig0 -= 2;
6286         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6287     }
6288     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6289     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6290         if ( zSig1 == 0 ) zSig1 = 1;
6291         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6292         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6293         mul64To128( zSig1, zSig1, &term2, &term3 );
6294         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6295         while ( (int64_t) rem1 < 0 ) {
6296             --zSig1;
6297             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6298             term3 |= 1;
6299             term2 |= doubleZSig0;
6300             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6301         }
6302         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6303     }
6304     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6305     zSig0 |= doubleZSig0;
6306     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6307                                 0, zExp, zSig0, zSig1, status);
6308 }
6309
6310 /*----------------------------------------------------------------------------
6311 | Returns the result of converting the quadruple-precision floating-point
6312 | value `a' to the 32-bit two's complement integer format.  The conversion
6313 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6314 | Arithmetic---which means in particular that the conversion is rounded
6315 | according to the current rounding mode.  If `a' is a NaN, the largest
6316 | positive integer is returned.  Otherwise, if the conversion overflows, the
6317 | largest integer with the same sign as `a' is returned.
6318 *----------------------------------------------------------------------------*/
6319
6320 int32_t float128_to_int32(float128 a, float_status *status)
6321 {
6322     bool aSign;
6323     int32_t aExp, shiftCount;
6324     uint64_t aSig0, aSig1;
6325
6326     aSig1 = extractFloat128Frac1( a );
6327     aSig0 = extractFloat128Frac0( a );
6328     aExp = extractFloat128Exp( a );
6329     aSign = extractFloat128Sign( a );
6330     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6331     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6332     aSig0 |= ( aSig1 != 0 );
6333     shiftCount = 0x4028 - aExp;
6334     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6335     return roundAndPackInt32(aSign, aSig0, status);
6336
6337 }
6338
6339 /*----------------------------------------------------------------------------
6340 | Returns the result of converting the quadruple-precision floating-point
6341 | value `a' to the 32-bit two's complement integer format.  The conversion
6342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6343 | Arithmetic, except that the conversion is always rounded toward zero.  If
6344 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6345 | conversion overflows, the largest integer with the same sign as `a' is
6346 | returned.
6347 *----------------------------------------------------------------------------*/
6348
6349 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6350 {
6351     bool aSign;
6352     int32_t aExp, shiftCount;
6353     uint64_t aSig0, aSig1, savedASig;
6354     int32_t z;
6355
6356     aSig1 = extractFloat128Frac1( a );
6357     aSig0 = extractFloat128Frac0( a );
6358     aExp = extractFloat128Exp( a );
6359     aSign = extractFloat128Sign( a );
6360     aSig0 |= ( aSig1 != 0 );
6361     if ( 0x401E < aExp ) {
6362         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6363         goto invalid;
6364     }
6365     else if ( aExp < 0x3FFF ) {
6366         if (aExp || aSig0) {
6367             status->float_exception_flags |= float_flag_inexact;
6368         }
6369         return 0;
6370     }
6371     aSig0 |= UINT64_C(0x0001000000000000);
6372     shiftCount = 0x402F - aExp;
6373     savedASig = aSig0;
6374     aSig0 >>= shiftCount;
6375     z = aSig0;
6376     if ( aSign ) z = - z;
6377     if ( ( z < 0 ) ^ aSign ) {
6378  invalid:
6379         float_raise(float_flag_invalid, status);
6380         return aSign ? INT32_MIN : INT32_MAX;
6381     }
6382     if ( ( aSig0<<shiftCount ) != savedASig ) {
6383         status->float_exception_flags |= float_flag_inexact;
6384     }
6385     return z;
6386
6387 }
6388
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of converting the quadruple-precision floating-point
6391 | value `a' to the 64-bit two's complement integer format.  The conversion
6392 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6393 | Arithmetic---which means in particular that the conversion is rounded
6394 | according to the current rounding mode.  If `a' is a NaN, the largest
6395 | positive integer is returned.  Otherwise, if the conversion overflows, the
6396 | largest integer with the same sign as `a' is returned.
6397 *----------------------------------------------------------------------------*/
6398
6399 int64_t float128_to_int64(float128 a, float_status *status)
6400 {
6401     bool aSign;
6402     int32_t aExp, shiftCount;
6403     uint64_t aSig0, aSig1;
6404
6405     aSig1 = extractFloat128Frac1( a );
6406     aSig0 = extractFloat128Frac0( a );
6407     aExp = extractFloat128Exp( a );
6408     aSign = extractFloat128Sign( a );
6409     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6410     shiftCount = 0x402F - aExp;
6411     if ( shiftCount <= 0 ) {
6412         if ( 0x403E < aExp ) {
6413             float_raise(float_flag_invalid, status);
6414             if (    ! aSign
6415                  || (    ( aExp == 0x7FFF )
6416                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6417                     )
6418                ) {
6419                 return INT64_MAX;
6420             }
6421             return INT64_MIN;
6422         }
6423         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6424     }
6425     else {
6426         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6427     }
6428     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6429
6430 }
6431
6432 /*----------------------------------------------------------------------------
6433 | Returns the result of converting the quadruple-precision floating-point
6434 | value `a' to the 64-bit two's complement integer format.  The conversion
6435 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6436 | Arithmetic, except that the conversion is always rounded toward zero.
6437 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6438 | the conversion overflows, the largest integer with the same sign as `a' is
6439 | returned.
6440 *----------------------------------------------------------------------------*/
6441
6442 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6443 {
6444     bool aSign;
6445     int32_t aExp, shiftCount;
6446     uint64_t aSig0, aSig1;
6447     int64_t z;
6448
6449     aSig1 = extractFloat128Frac1( a );
6450     aSig0 = extractFloat128Frac0( a );
6451     aExp = extractFloat128Exp( a );
6452     aSign = extractFloat128Sign( a );
6453     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6454     shiftCount = aExp - 0x402F;
6455     if ( 0 < shiftCount ) {
6456         if ( 0x403E <= aExp ) {
6457             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6458             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6459                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6460                 if (aSig1) {
6461                     status->float_exception_flags |= float_flag_inexact;
6462                 }
6463             }
6464             else {
6465                 float_raise(float_flag_invalid, status);
6466                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6467                     return INT64_MAX;
6468                 }
6469             }
6470             return INT64_MIN;
6471         }
6472         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6473         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6474             status->float_exception_flags |= float_flag_inexact;
6475         }
6476     }
6477     else {
6478         if ( aExp < 0x3FFF ) {
6479             if ( aExp | aSig0 | aSig1 ) {
6480                 status->float_exception_flags |= float_flag_inexact;
6481             }
6482             return 0;
6483         }
6484         z = aSig0>>( - shiftCount );
6485         if (    aSig1
6486              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6487             status->float_exception_flags |= float_flag_inexact;
6488         }
6489     }
6490     if ( aSign ) z = - z;
6491     return z;
6492
6493 }
6494
6495 /*----------------------------------------------------------------------------
6496 | Returns the result of converting the quadruple-precision floating-point value
6497 | `a' to the 64-bit unsigned integer format.  The conversion is
6498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6499 | Arithmetic---which means in particular that the conversion is rounded
6500 | according to the current rounding mode.  If `a' is a NaN, the largest
6501 | positive integer is returned.  If the conversion overflows, the
6502 | largest unsigned integer is returned.  If 'a' is negative, the value is
6503 | rounded and zero is returned; negative values that do not round to zero
6504 | will raise the inexact exception.
6505 *----------------------------------------------------------------------------*/
6506
6507 uint64_t float128_to_uint64(float128 a, float_status *status)
6508 {
6509     bool aSign;
6510     int aExp;
6511     int shiftCount;
6512     uint64_t aSig0, aSig1;
6513
6514     aSig0 = extractFloat128Frac0(a);
6515     aSig1 = extractFloat128Frac1(a);
6516     aExp = extractFloat128Exp(a);
6517     aSign = extractFloat128Sign(a);
6518     if (aSign && (aExp > 0x3FFE)) {
6519         float_raise(float_flag_invalid, status);
6520         if (float128_is_any_nan(a)) {
6521             return UINT64_MAX;
6522         } else {
6523             return 0;
6524         }
6525     }
6526     if (aExp) {
6527         aSig0 |= UINT64_C(0x0001000000000000);
6528     }
6529     shiftCount = 0x402F - aExp;
6530     if (shiftCount <= 0) {
6531         if (0x403E < aExp) {
6532             float_raise(float_flag_invalid, status);
6533             return UINT64_MAX;
6534         }
6535         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6536     } else {
6537         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6538     }
6539     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6540 }
6541
6542 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6543 {
6544     uint64_t v;
6545     signed char current_rounding_mode = status->float_rounding_mode;
6546
6547     set_float_rounding_mode(float_round_to_zero, status);
6548     v = float128_to_uint64(a, status);
6549     set_float_rounding_mode(current_rounding_mode, status);
6550
6551     return v;
6552 }
6553
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit unsigned integer format.  The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic except that the conversion is always rounded toward zero.
6559 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6560 | if the conversion overflows, the largest unsigned integer is returned.
6561 | If 'a' is negative, the value is rounded and zero is returned; negative
6562 | values that do not round to zero will raise the inexact exception.
6563 *----------------------------------------------------------------------------*/
6564
6565 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6566 {
6567     uint64_t v;
6568     uint32_t res;
6569     int old_exc_flags = get_float_exception_flags(status);
6570
6571     v = float128_to_uint64_round_to_zero(a, status);
6572     if (v > 0xffffffff) {
6573         res = 0xffffffff;
6574     } else {
6575         return v;
6576     }
6577     set_float_exception_flags(old_exc_flags, status);
6578     float_raise(float_flag_invalid, status);
6579     return res;
6580 }
6581
6582 /*----------------------------------------------------------------------------
6583 | Returns the result of converting the quadruple-precision floating-point value
6584 | `a' to the 32-bit unsigned integer format.  The conversion is
6585 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6586 | Arithmetic---which means in particular that the conversion is rounded
6587 | according to the current rounding mode.  If `a' is a NaN, the largest
6588 | positive integer is returned.  If the conversion overflows, the
6589 | largest unsigned integer is returned.  If 'a' is negative, the value is
6590 | rounded and zero is returned; negative values that do not round to zero
6591 | will raise the inexact exception.
6592 *----------------------------------------------------------------------------*/
6593
6594 uint32_t float128_to_uint32(float128 a, float_status *status)
6595 {
6596     uint64_t v;
6597     uint32_t res;
6598     int old_exc_flags = get_float_exception_flags(status);
6599
6600     v = float128_to_uint64(a, status);
6601     if (v > 0xffffffff) {
6602         res = 0xffffffff;
6603     } else {
6604         return v;
6605     }
6606     set_float_exception_flags(old_exc_flags, status);
6607     float_raise(float_flag_invalid, status);
6608     return res;
6609 }
6610
6611 /*----------------------------------------------------------------------------
6612 | Returns the result of converting the quadruple-precision floating-point
6613 | value `a' to the single-precision floating-point format.  The conversion
6614 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6615 | Arithmetic.
6616 *----------------------------------------------------------------------------*/
6617
6618 float32 float128_to_float32(float128 a, float_status *status)
6619 {
6620     bool aSign;
6621     int32_t aExp;
6622     uint64_t aSig0, aSig1;
6623     uint32_t zSig;
6624
6625     aSig1 = extractFloat128Frac1( a );
6626     aSig0 = extractFloat128Frac0( a );
6627     aExp = extractFloat128Exp( a );
6628     aSign = extractFloat128Sign( a );
6629     if ( aExp == 0x7FFF ) {
6630         if ( aSig0 | aSig1 ) {
6631             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6632         }
6633         return packFloat32( aSign, 0xFF, 0 );
6634     }
6635     aSig0 |= ( aSig1 != 0 );
6636     shift64RightJamming( aSig0, 18, &aSig0 );
6637     zSig = aSig0;
6638     if ( aExp || zSig ) {
6639         zSig |= 0x40000000;
6640         aExp -= 0x3F81;
6641     }
6642     return roundAndPackFloat32(aSign, aExp, zSig, status);
6643
6644 }
6645
6646 /*----------------------------------------------------------------------------
6647 | Returns the result of converting the quadruple-precision floating-point
6648 | value `a' to the double-precision floating-point format.  The conversion
6649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6650 | Arithmetic.
6651 *----------------------------------------------------------------------------*/
6652
6653 float64 float128_to_float64(float128 a, float_status *status)
6654 {
6655     bool aSign;
6656     int32_t aExp;
6657     uint64_t aSig0, aSig1;
6658
6659     aSig1 = extractFloat128Frac1( a );
6660     aSig0 = extractFloat128Frac0( a );
6661     aExp = extractFloat128Exp( a );
6662     aSign = extractFloat128Sign( a );
6663     if ( aExp == 0x7FFF ) {
6664         if ( aSig0 | aSig1 ) {
6665             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6666         }
6667         return packFloat64( aSign, 0x7FF, 0 );
6668     }
6669     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6670     aSig0 |= ( aSig1 != 0 );
6671     if ( aExp || aSig0 ) {
6672         aSig0 |= UINT64_C(0x4000000000000000);
6673         aExp -= 0x3C01;
6674     }
6675     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6676
6677 }
6678
6679 /*----------------------------------------------------------------------------
6680 | Returns the result of converting the quadruple-precision floating-point
6681 | value `a' to the extended double-precision floating-point format.  The
6682 | conversion is performed according to the IEC/IEEE Standard for Binary
6683 | Floating-Point Arithmetic.
6684 *----------------------------------------------------------------------------*/
6685
6686 floatx80 float128_to_floatx80(float128 a, float_status *status)
6687 {
6688     bool aSign;
6689     int32_t aExp;
6690     uint64_t aSig0, aSig1;
6691
6692     aSig1 = extractFloat128Frac1( a );
6693     aSig0 = extractFloat128Frac0( a );
6694     aExp = extractFloat128Exp( a );
6695     aSign = extractFloat128Sign( a );
6696     if ( aExp == 0x7FFF ) {
6697         if ( aSig0 | aSig1 ) {
6698             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6699                                                status);
6700             return floatx80_silence_nan(res, status);
6701         }
6702         return packFloatx80(aSign, floatx80_infinity_high,
6703                                    floatx80_infinity_low);
6704     }
6705     if ( aExp == 0 ) {
6706         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6707         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6708     }
6709     else {
6710         aSig0 |= UINT64_C(0x0001000000000000);
6711     }
6712     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6713     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6714
6715 }
6716
6717 /*----------------------------------------------------------------------------
6718 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6719 | returns the result as a quadruple-precision floating-point value.  The
6720 | operation is performed according to the IEC/IEEE Standard for Binary
6721 | Floating-Point Arithmetic.
6722 *----------------------------------------------------------------------------*/
6723
6724 float128 float128_round_to_int(float128 a, float_status *status)
6725 {
6726     bool aSign;
6727     int32_t aExp;
6728     uint64_t lastBitMask, roundBitsMask;
6729     float128 z;
6730
6731     aExp = extractFloat128Exp( a );
6732     if ( 0x402F <= aExp ) {
6733         if ( 0x406F <= aExp ) {
6734             if (    ( aExp == 0x7FFF )
6735                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6736                ) {
6737                 return propagateFloat128NaN(a, a, status);
6738             }
6739             return a;
6740         }
6741         lastBitMask = 1;
6742         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6743         roundBitsMask = lastBitMask - 1;
6744         z = a;
6745         switch (status->float_rounding_mode) {
6746         case float_round_nearest_even:
6747             if ( lastBitMask ) {
6748                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6749                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6750             }
6751             else {
6752                 if ( (int64_t) z.low < 0 ) {
6753                     ++z.high;
6754                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6755                 }
6756             }
6757             break;
6758         case float_round_ties_away:
6759             if (lastBitMask) {
6760                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6761             } else {
6762                 if ((int64_t) z.low < 0) {
6763                     ++z.high;
6764                 }
6765             }
6766             break;
6767         case float_round_to_zero:
6768             break;
6769         case float_round_up:
6770             if (!extractFloat128Sign(z)) {
6771                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6772             }
6773             break;
6774         case float_round_down:
6775             if (extractFloat128Sign(z)) {
6776                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6777             }
6778             break;
6779         case float_round_to_odd:
6780             /*
6781              * Note that if lastBitMask == 0, the last bit is the lsb
6782              * of high, and roundBitsMask == -1.
6783              */
6784             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6785                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6786             }
6787             break;
6788         default:
6789             abort();
6790         }
6791         z.low &= ~ roundBitsMask;
6792     }
6793     else {
6794         if ( aExp < 0x3FFF ) {
6795             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6796             status->float_exception_flags |= float_flag_inexact;
6797             aSign = extractFloat128Sign( a );
6798             switch (status->float_rounding_mode) {
6799             case float_round_nearest_even:
6800                 if (    ( aExp == 0x3FFE )
6801                      && (   extractFloat128Frac0( a )
6802                           | extractFloat128Frac1( a ) )
6803                    ) {
6804                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6805                 }
6806                 break;
6807             case float_round_ties_away:
6808                 if (aExp == 0x3FFE) {
6809                     return packFloat128(aSign, 0x3FFF, 0, 0);
6810                 }
6811                 break;
6812             case float_round_down:
6813                 return
6814                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6815                     : packFloat128( 0, 0, 0, 0 );
6816             case float_round_up:
6817                 return
6818                       aSign ? packFloat128( 1, 0, 0, 0 )
6819                     : packFloat128( 0, 0x3FFF, 0, 0 );
6820
6821             case float_round_to_odd:
6822                 return packFloat128(aSign, 0x3FFF, 0, 0);
6823
6824             case float_round_to_zero:
6825                 break;
6826             }
6827             return packFloat128( aSign, 0, 0, 0 );
6828         }
6829         lastBitMask = 1;
6830         lastBitMask <<= 0x402F - aExp;
6831         roundBitsMask = lastBitMask - 1;
6832         z.low = 0;
6833         z.high = a.high;
6834         switch (status->float_rounding_mode) {
6835         case float_round_nearest_even:
6836             z.high += lastBitMask>>1;
6837             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6838                 z.high &= ~ lastBitMask;
6839             }
6840             break;
6841         case float_round_ties_away:
6842             z.high += lastBitMask>>1;
6843             break;
6844         case float_round_to_zero:
6845             break;
6846         case float_round_up:
6847             if (!extractFloat128Sign(z)) {
6848                 z.high |= ( a.low != 0 );
6849                 z.high += roundBitsMask;
6850             }
6851             break;
6852         case float_round_down:
6853             if (extractFloat128Sign(z)) {
6854                 z.high |= (a.low != 0);
6855                 z.high += roundBitsMask;
6856             }
6857             break;
6858         case float_round_to_odd:
6859             if ((z.high & lastBitMask) == 0) {
6860                 z.high |= (a.low != 0);
6861                 z.high += roundBitsMask;
6862             }
6863             break;
6864         default:
6865             abort();
6866         }
6867         z.high &= ~ roundBitsMask;
6868     }
6869     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6870         status->float_exception_flags |= float_flag_inexact;
6871     }
6872     return z;
6873
6874 }
6875
6876 /*----------------------------------------------------------------------------
6877 | Returns the result of adding the absolute values of the quadruple-precision
6878 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6879 | before being returned.  `zSign' is ignored if the result is a NaN.
6880 | The addition is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6883
6884 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6885                                 float_status *status)
6886 {
6887     int32_t aExp, bExp, zExp;
6888     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6889     int32_t expDiff;
6890
6891     aSig1 = extractFloat128Frac1( a );
6892     aSig0 = extractFloat128Frac0( a );
6893     aExp = extractFloat128Exp( a );
6894     bSig1 = extractFloat128Frac1( b );
6895     bSig0 = extractFloat128Frac0( b );
6896     bExp = extractFloat128Exp( b );
6897     expDiff = aExp - bExp;
6898     if ( 0 < expDiff ) {
6899         if ( aExp == 0x7FFF ) {
6900             if (aSig0 | aSig1) {
6901                 return propagateFloat128NaN(a, b, status);
6902             }
6903             return a;
6904         }
6905         if ( bExp == 0 ) {
6906             --expDiff;
6907         }
6908         else {
6909             bSig0 |= UINT64_C(0x0001000000000000);
6910         }
6911         shift128ExtraRightJamming(
6912             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6913         zExp = aExp;
6914     }
6915     else if ( expDiff < 0 ) {
6916         if ( bExp == 0x7FFF ) {
6917             if (bSig0 | bSig1) {
6918                 return propagateFloat128NaN(a, b, status);
6919             }
6920             return packFloat128( zSign, 0x7FFF, 0, 0 );
6921         }
6922         if ( aExp == 0 ) {
6923             ++expDiff;
6924         }
6925         else {
6926             aSig0 |= UINT64_C(0x0001000000000000);
6927         }
6928         shift128ExtraRightJamming(
6929             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6930         zExp = bExp;
6931     }
6932     else {
6933         if ( aExp == 0x7FFF ) {
6934             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6935                 return propagateFloat128NaN(a, b, status);
6936             }
6937             return a;
6938         }
6939         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6940         if ( aExp == 0 ) {
6941             if (status->flush_to_zero) {
6942                 if (zSig0 | zSig1) {
6943                     float_raise(float_flag_output_denormal, status);
6944                 }
6945                 return packFloat128(zSign, 0, 0, 0);
6946             }
6947             return packFloat128( zSign, 0, zSig0, zSig1 );
6948         }
6949         zSig2 = 0;
6950         zSig0 |= UINT64_C(0x0002000000000000);
6951         zExp = aExp;
6952         goto shiftRight1;
6953     }
6954     aSig0 |= UINT64_C(0x0001000000000000);
6955     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6956     --zExp;
6957     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6958     ++zExp;
6959  shiftRight1:
6960     shift128ExtraRightJamming(
6961         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6962  roundAndPack:
6963     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6964
6965 }
6966
6967 /*----------------------------------------------------------------------------
6968 | Returns the result of subtracting the absolute values of the quadruple-
6969 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6970 | difference is negated before being returned.  `zSign' is ignored if the
6971 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6972 | Standard for Binary Floating-Point Arithmetic.
6973 *----------------------------------------------------------------------------*/
6974
6975 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
6976                                 float_status *status)
6977 {
6978     int32_t aExp, bExp, zExp;
6979     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6980     int32_t expDiff;
6981
6982     aSig1 = extractFloat128Frac1( a );
6983     aSig0 = extractFloat128Frac0( a );
6984     aExp = extractFloat128Exp( a );
6985     bSig1 = extractFloat128Frac1( b );
6986     bSig0 = extractFloat128Frac0( b );
6987     bExp = extractFloat128Exp( b );
6988     expDiff = aExp - bExp;
6989     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6990     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6991     if ( 0 < expDiff ) goto aExpBigger;
6992     if ( expDiff < 0 ) goto bExpBigger;
6993     if ( aExp == 0x7FFF ) {
6994         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6995             return propagateFloat128NaN(a, b, status);
6996         }
6997         float_raise(float_flag_invalid, status);
6998         return float128_default_nan(status);
6999     }
7000     if ( aExp == 0 ) {
7001         aExp = 1;
7002         bExp = 1;
7003     }
7004     if ( bSig0 < aSig0 ) goto aBigger;
7005     if ( aSig0 < bSig0 ) goto bBigger;
7006     if ( bSig1 < aSig1 ) goto aBigger;
7007     if ( aSig1 < bSig1 ) goto bBigger;
7008     return packFloat128(status->float_rounding_mode == float_round_down,
7009                         0, 0, 0);
7010  bExpBigger:
7011     if ( bExp == 0x7FFF ) {
7012         if (bSig0 | bSig1) {
7013             return propagateFloat128NaN(a, b, status);
7014         }
7015         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7016     }
7017     if ( aExp == 0 ) {
7018         ++expDiff;
7019     }
7020     else {
7021         aSig0 |= UINT64_C(0x4000000000000000);
7022     }
7023     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7024     bSig0 |= UINT64_C(0x4000000000000000);
7025  bBigger:
7026     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7027     zExp = bExp;
7028     zSign ^= 1;
7029     goto normalizeRoundAndPack;
7030  aExpBigger:
7031     if ( aExp == 0x7FFF ) {
7032         if (aSig0 | aSig1) {
7033             return propagateFloat128NaN(a, b, status);
7034         }
7035         return a;
7036     }
7037     if ( bExp == 0 ) {
7038         --expDiff;
7039     }
7040     else {
7041         bSig0 |= UINT64_C(0x4000000000000000);
7042     }
7043     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7044     aSig0 |= UINT64_C(0x4000000000000000);
7045  aBigger:
7046     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7047     zExp = aExp;
7048  normalizeRoundAndPack:
7049     --zExp;
7050     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7051                                          status);
7052
7053 }
7054
7055 /*----------------------------------------------------------------------------
7056 | Returns the result of adding the quadruple-precision floating-point values
7057 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7058 | for Binary Floating-Point Arithmetic.
7059 *----------------------------------------------------------------------------*/
7060
7061 float128 float128_add(float128 a, float128 b, float_status *status)
7062 {
7063     bool aSign, bSign;
7064
7065     aSign = extractFloat128Sign( a );
7066     bSign = extractFloat128Sign( b );
7067     if ( aSign == bSign ) {
7068         return addFloat128Sigs(a, b, aSign, status);
7069     }
7070     else {
7071         return subFloat128Sigs(a, b, aSign, status);
7072     }
7073
7074 }
7075
7076 /*----------------------------------------------------------------------------
7077 | Returns the result of subtracting the quadruple-precision floating-point
7078 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7079 | Standard for Binary Floating-Point Arithmetic.
7080 *----------------------------------------------------------------------------*/
7081
7082 float128 float128_sub(float128 a, float128 b, float_status *status)
7083 {
7084     bool aSign, bSign;
7085
7086     aSign = extractFloat128Sign( a );
7087     bSign = extractFloat128Sign( b );
7088     if ( aSign == bSign ) {
7089         return subFloat128Sigs(a, b, aSign, status);
7090     }
7091     else {
7092         return addFloat128Sigs(a, b, aSign, status);
7093     }
7094
7095 }
7096
7097 /*----------------------------------------------------------------------------
7098 | Returns the result of multiplying the quadruple-precision floating-point
7099 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7100 | Standard for Binary Floating-Point Arithmetic.
7101 *----------------------------------------------------------------------------*/
7102
7103 float128 float128_mul(float128 a, float128 b, float_status *status)
7104 {
7105     bool aSign, bSign, zSign;
7106     int32_t aExp, bExp, zExp;
7107     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7108
7109     aSig1 = extractFloat128Frac1( a );
7110     aSig0 = extractFloat128Frac0( a );
7111     aExp = extractFloat128Exp( a );
7112     aSign = extractFloat128Sign( a );
7113     bSig1 = extractFloat128Frac1( b );
7114     bSig0 = extractFloat128Frac0( b );
7115     bExp = extractFloat128Exp( b );
7116     bSign = extractFloat128Sign( b );
7117     zSign = aSign ^ bSign;
7118     if ( aExp == 0x7FFF ) {
7119         if (    ( aSig0 | aSig1 )
7120              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7121             return propagateFloat128NaN(a, b, status);
7122         }
7123         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7124         return packFloat128( zSign, 0x7FFF, 0, 0 );
7125     }
7126     if ( bExp == 0x7FFF ) {
7127         if (bSig0 | bSig1) {
7128             return propagateFloat128NaN(a, b, status);
7129         }
7130         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7131  invalid:
7132             float_raise(float_flag_invalid, status);
7133             return float128_default_nan(status);
7134         }
7135         return packFloat128( zSign, 0x7FFF, 0, 0 );
7136     }
7137     if ( aExp == 0 ) {
7138         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7139         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7140     }
7141     if ( bExp == 0 ) {
7142         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7143         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7144     }
7145     zExp = aExp + bExp - 0x4000;
7146     aSig0 |= UINT64_C(0x0001000000000000);
7147     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7148     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7149     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7150     zSig2 |= ( zSig3 != 0 );
7151     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7152         shift128ExtraRightJamming(
7153             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7154         ++zExp;
7155     }
7156     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7157
7158 }
7159
7160 /*----------------------------------------------------------------------------
7161 | Returns the result of dividing the quadruple-precision floating-point value
7162 | `a' by the corresponding value `b'.  The operation is performed according to
7163 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7164 *----------------------------------------------------------------------------*/
7165
7166 float128 float128_div(float128 a, float128 b, float_status *status)
7167 {
7168     bool aSign, bSign, zSign;
7169     int32_t aExp, bExp, zExp;
7170     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7171     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7172
7173     aSig1 = extractFloat128Frac1( a );
7174     aSig0 = extractFloat128Frac0( a );
7175     aExp = extractFloat128Exp( a );
7176     aSign = extractFloat128Sign( a );
7177     bSig1 = extractFloat128Frac1( b );
7178     bSig0 = extractFloat128Frac0( b );
7179     bExp = extractFloat128Exp( b );
7180     bSign = extractFloat128Sign( b );
7181     zSign = aSign ^ bSign;
7182     if ( aExp == 0x7FFF ) {
7183         if (aSig0 | aSig1) {
7184             return propagateFloat128NaN(a, b, status);
7185         }
7186         if ( bExp == 0x7FFF ) {
7187             if (bSig0 | bSig1) {
7188                 return propagateFloat128NaN(a, b, status);
7189             }
7190             goto invalid;
7191         }
7192         return packFloat128( zSign, 0x7FFF, 0, 0 );
7193     }
7194     if ( bExp == 0x7FFF ) {
7195         if (bSig0 | bSig1) {
7196             return propagateFloat128NaN(a, b, status);
7197         }
7198         return packFloat128( zSign, 0, 0, 0 );
7199     }
7200     if ( bExp == 0 ) {
7201         if ( ( bSig0 | bSig1 ) == 0 ) {
7202             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7203  invalid:
7204                 float_raise(float_flag_invalid, status);
7205                 return float128_default_nan(status);
7206             }
7207             float_raise(float_flag_divbyzero, status);
7208             return packFloat128( zSign, 0x7FFF, 0, 0 );
7209         }
7210         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7211     }
7212     if ( aExp == 0 ) {
7213         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7214         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7215     }
7216     zExp = aExp - bExp + 0x3FFD;
7217     shortShift128Left(
7218         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7219     shortShift128Left(
7220         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7221     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7222         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7223         ++zExp;
7224     }
7225     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7226     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7227     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7228     while ( (int64_t) rem0 < 0 ) {
7229         --zSig0;
7230         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7231     }
7232     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7233     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7234         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7235         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7236         while ( (int64_t) rem1 < 0 ) {
7237             --zSig1;
7238             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7239         }
7240         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7241     }
7242     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7243     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7244
7245 }
7246
7247 /*----------------------------------------------------------------------------
7248 | Returns the remainder of the quadruple-precision floating-point value `a'
7249 | with respect to the corresponding value `b'.  The operation is performed
7250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7251 *----------------------------------------------------------------------------*/
7252
7253 float128 float128_rem(float128 a, float128 b, float_status *status)
7254 {
7255     bool aSign, zSign;
7256     int32_t aExp, bExp, expDiff;
7257     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7258     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7259     int64_t sigMean0;
7260
7261     aSig1 = extractFloat128Frac1( a );
7262     aSig0 = extractFloat128Frac0( a );
7263     aExp = extractFloat128Exp( a );
7264     aSign = extractFloat128Sign( a );
7265     bSig1 = extractFloat128Frac1( b );
7266     bSig0 = extractFloat128Frac0( b );
7267     bExp = extractFloat128Exp( b );
7268     if ( aExp == 0x7FFF ) {
7269         if (    ( aSig0 | aSig1 )
7270              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7271             return propagateFloat128NaN(a, b, status);
7272         }
7273         goto invalid;
7274     }
7275     if ( bExp == 0x7FFF ) {
7276         if (bSig0 | bSig1) {
7277             return propagateFloat128NaN(a, b, status);
7278         }
7279         return a;
7280     }
7281     if ( bExp == 0 ) {
7282         if ( ( bSig0 | bSig1 ) == 0 ) {
7283  invalid:
7284             float_raise(float_flag_invalid, status);
7285             return float128_default_nan(status);
7286         }
7287         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7288     }
7289     if ( aExp == 0 ) {
7290         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7291         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7292     }
7293     expDiff = aExp - bExp;
7294     if ( expDiff < -1 ) return a;
7295     shortShift128Left(
7296         aSig0 | UINT64_C(0x0001000000000000),
7297         aSig1,
7298         15 - ( expDiff < 0 ),
7299         &aSig0,
7300         &aSig1
7301     );
7302     shortShift128Left(
7303         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7304     q = le128( bSig0, bSig1, aSig0, aSig1 );
7305     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7306     expDiff -= 64;
7307     while ( 0 < expDiff ) {
7308         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7309         q = ( 4 < q ) ? q - 4 : 0;
7310         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7311         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7312         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7313         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7314         expDiff -= 61;
7315     }
7316     if ( -64 < expDiff ) {
7317         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7318         q = ( 4 < q ) ? q - 4 : 0;
7319         q >>= - expDiff;
7320         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7321         expDiff += 52;
7322         if ( expDiff < 0 ) {
7323             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7324         }
7325         else {
7326             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7327         }
7328         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7329         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7330     }
7331     else {
7332         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7333         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7334     }
7335     do {
7336         alternateASig0 = aSig0;
7337         alternateASig1 = aSig1;
7338         ++q;
7339         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7340     } while ( 0 <= (int64_t) aSig0 );
7341     add128(
7342         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7343     if (    ( sigMean0 < 0 )
7344          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7345         aSig0 = alternateASig0;
7346         aSig1 = alternateASig1;
7347     }
7348     zSign = ( (int64_t) aSig0 < 0 );
7349     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7350     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7351                                          status);
7352 }
7353
7354 /*----------------------------------------------------------------------------
7355 | Returns the square root of the quadruple-precision floating-point value `a'.
7356 | The operation is performed according to the IEC/IEEE Standard for Binary
7357 | Floating-Point Arithmetic.
7358 *----------------------------------------------------------------------------*/
7359
7360 float128 float128_sqrt(float128 a, float_status *status)
7361 {
7362     bool aSign;
7363     int32_t aExp, zExp;
7364     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7365     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7366
7367     aSig1 = extractFloat128Frac1( a );
7368     aSig0 = extractFloat128Frac0( a );
7369     aExp = extractFloat128Exp( a );
7370     aSign = extractFloat128Sign( a );
7371     if ( aExp == 0x7FFF ) {
7372         if (aSig0 | aSig1) {
7373             return propagateFloat128NaN(a, a, status);
7374         }
7375         if ( ! aSign ) return a;
7376         goto invalid;
7377     }
7378     if ( aSign ) {
7379         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7380  invalid:
7381         float_raise(float_flag_invalid, status);
7382         return float128_default_nan(status);
7383     }
7384     if ( aExp == 0 ) {
7385         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7386         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7387     }
7388     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7389     aSig0 |= UINT64_C(0x0001000000000000);
7390     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7391     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7392     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7393     doubleZSig0 = zSig0<<1;
7394     mul64To128( zSig0, zSig0, &term0, &term1 );
7395     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7396     while ( (int64_t) rem0 < 0 ) {
7397         --zSig0;
7398         doubleZSig0 -= 2;
7399         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7400     }
7401     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7402     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7403         if ( zSig1 == 0 ) zSig1 = 1;
7404         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7405         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7406         mul64To128( zSig1, zSig1, &term2, &term3 );
7407         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7408         while ( (int64_t) rem1 < 0 ) {
7409             --zSig1;
7410             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7411             term3 |= 1;
7412             term2 |= doubleZSig0;
7413             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7414         }
7415         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7416     }
7417     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7418     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7419
7420 }
7421
7422 static inline FloatRelation
7423 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7424                           float_status *status)
7425 {
7426     bool aSign, bSign;
7427
7428     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7429         float_raise(float_flag_invalid, status);
7430         return float_relation_unordered;
7431     }
7432     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7433           ( extractFloatx80Frac( a )<<1 ) ) ||
7434         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7435           ( extractFloatx80Frac( b )<<1 ) )) {
7436         if (!is_quiet ||
7437             floatx80_is_signaling_nan(a, status) ||
7438             floatx80_is_signaling_nan(b, status)) {
7439             float_raise(float_flag_invalid, status);
7440         }
7441         return float_relation_unordered;
7442     }
7443     aSign = extractFloatx80Sign( a );
7444     bSign = extractFloatx80Sign( b );
7445     if ( aSign != bSign ) {
7446
7447         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7448              ( ( a.low | b.low ) == 0 ) ) {
7449             /* zero case */
7450             return float_relation_equal;
7451         } else {
7452             return 1 - (2 * aSign);
7453         }
7454     } else {
7455         /* Normalize pseudo-denormals before comparison.  */
7456         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7457             ++a.high;
7458         }
7459         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7460             ++b.high;
7461         }
7462         if (a.low == b.low && a.high == b.high) {
7463             return float_relation_equal;
7464         } else {
7465             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7466         }
7467     }
7468 }
7469
7470 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7471 {
7472     return floatx80_compare_internal(a, b, 0, status);
7473 }
7474
7475 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7476                                      float_status *status)
7477 {
7478     return floatx80_compare_internal(a, b, 1, status);
7479 }
7480
7481 static inline FloatRelation
7482 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7483                           float_status *status)
7484 {
7485     bool aSign, bSign;
7486
7487     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7488           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7489         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7490           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7491         if (!is_quiet ||
7492             float128_is_signaling_nan(a, status) ||
7493             float128_is_signaling_nan(b, status)) {
7494             float_raise(float_flag_invalid, status);
7495         }
7496         return float_relation_unordered;
7497     }
7498     aSign = extractFloat128Sign( a );
7499     bSign = extractFloat128Sign( b );
7500     if ( aSign != bSign ) {
7501         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7502             /* zero case */
7503             return float_relation_equal;
7504         } else {
7505             return 1 - (2 * aSign);
7506         }
7507     } else {
7508         if (a.low == b.low && a.high == b.high) {
7509             return float_relation_equal;
7510         } else {
7511             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7512         }
7513     }
7514 }
7515
7516 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7517 {
7518     return float128_compare_internal(a, b, 0, status);
7519 }
7520
7521 FloatRelation float128_compare_quiet(float128 a, float128 b,
7522                                      float_status *status)
7523 {
7524     return float128_compare_internal(a, b, 1, status);
7525 }
7526
7527 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7528 {
7529     bool aSign;
7530     int32_t aExp;
7531     uint64_t aSig;
7532
7533     if (floatx80_invalid_encoding(a)) {
7534         float_raise(float_flag_invalid, status);
7535         return floatx80_default_nan(status);
7536     }
7537     aSig = extractFloatx80Frac( a );
7538     aExp = extractFloatx80Exp( a );
7539     aSign = extractFloatx80Sign( a );
7540
7541     if ( aExp == 0x7FFF ) {
7542         if ( aSig<<1 ) {
7543             return propagateFloatx80NaN(a, a, status);
7544         }
7545         return a;
7546     }
7547
7548     if (aExp == 0) {
7549         if (aSig == 0) {
7550             return a;
7551         }
7552         aExp++;
7553     }
7554
7555     if (n > 0x10000) {
7556         n = 0x10000;
7557     } else if (n < -0x10000) {
7558         n = -0x10000;
7559     }
7560
7561     aExp += n;
7562     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7563                                          aSign, aExp, aSig, 0, status);
7564 }
7565
7566 float128 float128_scalbn(float128 a, int n, float_status *status)
7567 {
7568     bool aSign;
7569     int32_t aExp;
7570     uint64_t aSig0, aSig1;
7571
7572     aSig1 = extractFloat128Frac1( a );
7573     aSig0 = extractFloat128Frac0( a );
7574     aExp = extractFloat128Exp( a );
7575     aSign = extractFloat128Sign( a );
7576     if ( aExp == 0x7FFF ) {
7577         if ( aSig0 | aSig1 ) {
7578             return propagateFloat128NaN(a, a, status);
7579         }
7580         return a;
7581     }
7582     if (aExp != 0) {
7583         aSig0 |= UINT64_C(0x0001000000000000);
7584     } else if (aSig0 == 0 && aSig1 == 0) {
7585         return a;
7586     } else {
7587         aExp++;
7588     }
7589
7590     if (n > 0x10000) {
7591         n = 0x10000;
7592     } else if (n < -0x10000) {
7593         n = -0x10000;
7594     }
7595
7596     aExp += n - 1;
7597     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7598                                          , status);
7599
7600 }
7601
7602 static void __attribute__((constructor)) softfloat_init(void)
7603 {
7604     union_float64 ua, ub, uc, ur;
7605
7606     if (QEMU_NO_HARDFLOAT) {
7607         return;
7608     }
7609     /*
7610      * Test that the host's FMA is not obviously broken. For example,
7611      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7612      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7613      */
7614     ua.s = 0x0020000000000001ULL;
7615     ub.s = 0x3ca0000000000000ULL;
7616     uc.s = 0x0020000000000000ULL;
7617     ur.h = fma(ua.h, ub.h, uc.h);
7618     if (ur.s != 0x0020000000000001ULL) {
7619         force_soft_fma = true;
7620     }
7621 }