fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             float_raise(float_flag_input_denormal, s);                  \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 static inline float32
 343 float32_gen2(float32 xa, float32 xb, float_status *s,
 344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 345              f32_check_fn pre, f32_check_fn post)
 346 {
 347     union_float32 ua, ub, ur;
 348
 349     ua.s = xa;
 350     ub.s = xb;
 351
 352     if (unlikely(!can_use_fpu(s))) {
 353         goto soft;
 354     }
 355
 356     float32_input_flush2(&ua.s, &ub.s, s);
 357     if (unlikely(!pre(ua, ub))) {
 358         goto soft;
 359     }
 360
 361     ur.h = hard(ua.h, ub.h);
 362     if (unlikely(f32_is_inf(ur))) {
 363         float_raise(float_flag_overflow, s);
 364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
 365         goto soft;
 366     }
 367     return ur.s;
 368
 369  soft:
 370     return soft(ua.s, ub.s, s);
 371 }
 372
 373 static inline float64
 374 float64_gen2(float64 xa, float64 xb, float_status *s,
 375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 376              f64_check_fn pre, f64_check_fn post)
 377 {
 378     union_float64 ua, ub, ur;
 379
 380     ua.s = xa;
 381     ub.s = xb;
 382
 383     if (unlikely(!can_use_fpu(s))) {
 384         goto soft;
 385     }
 386
 387     float64_input_flush2(&ua.s, &ub.s, s);
 388     if (unlikely(!pre(ua, ub))) {
 389         goto soft;
 390     }
 391
 392     ur.h = hard(ua.h, ub.h);
 393     if (unlikely(f64_is_inf(ur))) {
 394         float_raise(float_flag_overflow, s);
 395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
 396         goto soft;
 397     }
 398     return ur.s;
 399
 400  soft:
 401     return soft(ua.s, ub.s, s);
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the fraction bits of the single-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline uint32_t extractFloat32Frac(float32 a)
 409 {
 410     return float32_val(a) & 0x007FFFFF;
 411 }
 412
 413 /*----------------------------------------------------------------------------
 414 | Returns the exponent bits of the single-precision floating-point value `a'.
 415 *----------------------------------------------------------------------------*/
 416
 417 static inline int extractFloat32Exp(float32 a)
 418 {
 419     return (float32_val(a) >> 23) & 0xFF;
 420 }
 421
 422 /*----------------------------------------------------------------------------
 423 | Returns the sign bit of the single-precision floating-point value `a'.
 424 *----------------------------------------------------------------------------*/
 425
 426 static inline bool extractFloat32Sign(float32 a)
 427 {
 428     return float32_val(a) >> 31;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the fraction bits of the double-precision floating-point value `a'.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline uint64_t extractFloat64Frac(float64 a)
 436 {
 437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 438 }
 439
 440 /*----------------------------------------------------------------------------
 441 | Returns the exponent bits of the double-precision floating-point value `a'.
 442 *----------------------------------------------------------------------------*/
 443
 444 static inline int extractFloat64Exp(float64 a)
 445 {
 446     return (float64_val(a) >> 52) & 0x7FF;
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Returns the sign bit of the double-precision floating-point value `a'.
 451 *----------------------------------------------------------------------------*/
 452
 453 static inline bool extractFloat64Sign(float64 a)
 454 {
 455     return float64_val(a) >> 63;
 456 }
 457
 458 /*
 459  * Classify a floating point number. Everything above float_class_qnan
 460  * is a NaN so cls >= float_class_qnan is any NaN.
 461  */
 462
 463 typedef enum __attribute__ ((__packed__)) {
 464     float_class_unclassified,
 465     float_class_zero,
 466     float_class_normal,
 467     float_class_inf,
 468     float_class_qnan,  /* all NaNs from here */
 469     float_class_snan,
 470 } FloatClass;
 471
 472 #define float_cmask(bit)  (1u << (bit))
 473
 474 enum {
 475     float_cmask_zero    = float_cmask(float_class_zero),
 476     float_cmask_normal  = float_cmask(float_class_normal),
 477     float_cmask_inf     = float_cmask(float_class_inf),
 478     float_cmask_qnan    = float_cmask(float_class_qnan),
 479     float_cmask_snan    = float_cmask(float_class_snan),
 480
 481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
 482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
 483 };
 484
 485
 486 /* Simple helpers for checking if, or what kind of, NaN we have */
 487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 488 {
 489     return unlikely(c >= float_class_qnan);
 490 }
 491
 492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 493 {
 494     return c == float_class_snan;
 495 }
 496
 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 498 {
 499     return c == float_class_qnan;
 500 }
 501
 502 /*
 503  * Structure holding all of the decomposed parts of a float.
 504  * The exponent is unbiased and the fraction is normalized.
 505  *
 506  * The fraction words are stored in big-endian word ordering,
 507  * so that truncation from a larger format to a smaller format
 508  * can be done simply by ignoring subsequent elements.
 509  */
 510
 511 typedef struct {
 512     FloatClass cls;
 513     bool sign;
 514     int32_t exp;
 515     union {
 516         /* Routines that know the structure may reference the singular name. */
 517         uint64_t frac;
 518         /*
 519          * Routines expanded with multiple structures reference "hi" and "lo"
 520          * depending on the operation.  In FloatParts64, "hi" and "lo" are
 521          * both the same word and aliased here.
 522          */
 523         uint64_t frac_hi;
 524         uint64_t frac_lo;
 525     };
 526 } FloatParts64;
 527
 528 typedef struct {
 529     FloatClass cls;
 530     bool sign;
 531     int32_t exp;
 532     uint64_t frac_hi;
 533     uint64_t frac_lo;
 534 } FloatParts128;
 535
 536 typedef struct {
 537     FloatClass cls;
 538     bool sign;
 539     int32_t exp;
 540     uint64_t frac_hi;
 541     uint64_t frac_hm;  /* high-middle */
 542     uint64_t frac_lm;  /* low-middle */
 543     uint64_t frac_lo;
 544 } FloatParts256;
 545
 546 /* These apply to the most significant word of each FloatPartsN. */
 547 #define DECOMPOSED_BINARY_POINT    63
 548 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 549
 550 /* Structure holding all of the relevant parameters for a format.
 551  *   exp_size: the size of the exponent field
 552  *   exp_bias: the offset applied to the exponent field
 553  *   exp_max: the maximum normalised exponent
 554  *   frac_size: the size of the fraction field
 555  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 556  * The following are computed based the size of fraction
 557  *   frac_lsb: least significant bit of fraction
 558  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 559  *   round_mask/roundeven_mask: masks used for rounding
 560  * The following optional modifiers are available:
 561  *   arm_althp: handle ARM Alternative Half Precision
 562  */
 563 typedef struct {
 564     int exp_size;
 565     int exp_bias;
 566     int exp_max;
 567     int frac_size;
 568     int frac_shift;
 569     uint64_t frac_lsb;
 570     uint64_t frac_lsbm1;
 571     uint64_t round_mask;
 572     uint64_t roundeven_mask;
 573     bool arm_althp;
 574 } FloatFmt;
 575
 576 /* Expand fields based on the size of exponent and fraction */
 577 #define FLOAT_PARAMS(E, F)                                           \
 578     .exp_size       = E,                                             \
 579     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 580     .exp_max        = (1 << E) - 1,                                  \
 581     .frac_size      = F,                                             \
 582     .frac_shift     = (-F - 1) & 63,                                 \
 583     .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
 584     .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
 585     .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
 586     .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
 587
 588 static const FloatFmt float16_params = {
 589     FLOAT_PARAMS(5, 10)
 590 };
 591
 592 static const FloatFmt float16_params_ahp = {
 593     FLOAT_PARAMS(5, 10),
 594     .arm_althp = true
 595 };
 596
 597 static const FloatFmt bfloat16_params = {
 598     FLOAT_PARAMS(8, 7)
 599 };
 600
 601 static const FloatFmt float32_params = {
 602     FLOAT_PARAMS(8, 23)
 603 };
 604
 605 static const FloatFmt float64_params = {
 606     FLOAT_PARAMS(11, 52)
 607 };
 608
 609 static const FloatFmt float128_params = {
 610     FLOAT_PARAMS(15, 112)
 611 };
 612
 613 /* Unpack a float to parts, but do not canonicalize.  */
 614 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
 615 {
 616     const int f_size = fmt->frac_size;
 617     const int e_size = fmt->exp_size;
 618
 619     *r = (FloatParts64) {
 620         .cls = float_class_unclassified,
 621         .sign = extract64(raw, f_size + e_size, 1),
 622         .exp = extract64(raw, f_size, e_size),
 623         .frac = extract64(raw, 0, f_size)
 624     };
 625 }
 626
 627 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
 628 {
 629     unpack_raw64(p, &float16_params, f);
 630 }
 631
 632 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
 633 {
 634     unpack_raw64(p, &bfloat16_params, f);
 635 }
 636
 637 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
 638 {
 639     unpack_raw64(p, &float32_params, f);
 640 }
 641
 642 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
 643 {
 644     unpack_raw64(p, &float64_params, f);
 645 }
 646
 647 static void float128_unpack_raw(FloatParts128 *p, float128 f)
 648 {
 649     const int f_size = float128_params.frac_size - 64;
 650     const int e_size = float128_params.exp_size;
 651
 652     *p = (FloatParts128) {
 653         .cls = float_class_unclassified,
 654         .sign = extract64(f.high, f_size + e_size, 1),
 655         .exp = extract64(f.high, f_size, e_size),
 656         .frac_hi = extract64(f.high, 0, f_size),
 657         .frac_lo = f.low,
 658     };
 659 }
 660
 661 /* Pack a float from parts, but do not canonicalize.  */
 662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
 663 {
 664     const int f_size = fmt->frac_size;
 665     const int e_size = fmt->exp_size;
 666     uint64_t ret;
 667
 668     ret = (uint64_t)p->sign << (f_size + e_size);
 669     ret = deposit64(ret, f_size, e_size, p->exp);
 670     ret = deposit64(ret, 0, f_size, p->frac);
 671     return ret;
 672 }
 673
 674 static inline float16 float16_pack_raw(const FloatParts64 *p)
 675 {
 676     return make_float16(pack_raw64(p, &float16_params));
 677 }
 678
 679 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
 680 {
 681     return pack_raw64(p, &bfloat16_params);
 682 }
 683
 684 static inline float32 float32_pack_raw(const FloatParts64 *p)
 685 {
 686     return make_float32(pack_raw64(p, &float32_params));
 687 }
 688
 689 static inline float64 float64_pack_raw(const FloatParts64 *p)
 690 {
 691     return make_float64(pack_raw64(p, &float64_params));
 692 }
 693
 694 static float128 float128_pack_raw(const FloatParts128 *p)
 695 {
 696     const int f_size = float128_params.frac_size - 64;
 697     const int e_size = float128_params.exp_size;
 698     uint64_t hi;
 699
 700     hi = (uint64_t)p->sign << (f_size + e_size);
 701     hi = deposit64(hi, f_size, e_size, p->exp);
 702     hi = deposit64(hi, 0, f_size, p->frac_hi);
 703     return make_float128(hi, p->frac_lo);
 704 }
 705
 706 /*----------------------------------------------------------------------------
 707 | Functions and definitions to determine:  (1) whether tininess for underflow
 708 | is detected before or after rounding by default, (2) what (if anything)
 709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 711 | are propagated from function inputs to output.  These details are target-
 712 | specific.
 713 *----------------------------------------------------------------------------*/
 714 #include "softfloat-specialize.c.inc"
 715
 716 #define PARTS_GENERIC_64_128(NAME, P) \
 717     QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
 718
 719 #define PARTS_GENERIC_64_128_256(NAME, P) \
 720     QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
 721                  (FloatParts128 *, parts128_##NAME), parts64_##NAME)
 722
 723 #define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
 724 #define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
 725
 726 static void parts64_return_nan(FloatParts64 *a, float_status *s);
 727 static void parts128_return_nan(FloatParts128 *a, float_status *s);
 728
 729 #define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)
 730
 731 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
 732                                       float_status *s);
 733 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
 734                                         float_status *s);
 735
 736 #define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
 737
 738 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
 739                                              FloatParts64 *c, float_status *s,
 740                                              int ab_mask, int abc_mask);
 741 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
 742                                                FloatParts128 *b,
 743                                                FloatParts128 *c,
 744                                                float_status *s,
 745                                                int ab_mask, int abc_mask);
 746
 747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
 748     PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
 749
 750 static void parts64_canonicalize(FloatParts64 *p, float_status *status,
 751                                  const FloatFmt *fmt);
 752 static void parts128_canonicalize(FloatParts128 *p, float_status *status,
 753                                   const FloatFmt *fmt);
 754
 755 #define parts_canonicalize(A, S, F) \
 756     PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
 757
 758 static void parts64_uncanon(FloatParts64 *p, float_status *status,
 759                             const FloatFmt *fmt);
 760 static void parts128_uncanon(FloatParts128 *p, float_status *status,
 761                              const FloatFmt *fmt);
 762
 763 #define parts_uncanon(A, S, F) \
 764     PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
 765
 766 static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
 767 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
 768 static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);
 769
 770 #define parts_add_normal(A, B) \
 771     PARTS_GENERIC_64_128_256(add_normal, A)(A, B)
 772
 773 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
 774 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
 775 static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);
 776
 777 #define parts_sub_normal(A, B) \
 778     PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)
 779
 780 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
 781                                     float_status *s, bool subtract);
 782 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
 783                                       float_status *s, bool subtract);
 784
 785 #define parts_addsub(A, B, S, Z) \
 786     PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
 787
 788 static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
 789                                  float_status *s);
 790 static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
 791                                    float_status *s);
 792
 793 #define parts_mul(A, B, S) \
 794     PARTS_GENERIC_64_128(mul, A)(A, B, S)
 795
 796 static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
 797                                     FloatParts64 *c, int flags,
 798                                     float_status *s);
 799 static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
 800                                       FloatParts128 *c, int flags,
 801                                       float_status *s);
 802
 803 #define parts_muladd(A, B, C, Z, S) \
 804     PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
 805
 806 static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
 807                                  float_status *s);
 808 static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
 809                                    float_status *s);
 810
 811 #define parts_div(A, B, S) \
 812     PARTS_GENERIC_64_128(div, A)(A, B, S)
 813
 814 static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
 815                                         int scale, int frac_size);
 816 static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
 817                                          int scale, int frac_size);
 818
 819 #define parts_round_to_int_normal(A, R, C, F) \
 820     PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)
 821
 822 static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
 823                                  int scale, float_status *s,
 824                                  const FloatFmt *fmt);
 825 static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
 826                                   int scale, float_status *s,
 827                                   const FloatFmt *fmt);
 828
 829 #define parts_round_to_int(A, R, C, S, F) \
 830     PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)
 831
 832 /*
 833  * Helper functions for softfloat-parts.c.inc, per-size operations.
 834  */
 835
 836 #define FRAC_GENERIC_64_128(NAME, P) \
 837     QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
 838
 839 #define FRAC_GENERIC_64_128_256(NAME, P) \
 840     QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
 841                  (FloatParts128 *, frac128_##NAME), frac64_##NAME)
 842
 843 static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
 844 {
 845     return uadd64_overflow(a->frac, b->frac, &r->frac);
 846 }
 847
 848 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
 849 {
 850     bool c = 0;
 851     r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
 852     r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
 853     return c;
 854 }
 855
 856 static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
 857 {
 858     bool c = 0;
 859     r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
 860     r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
 861     r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
 862     r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
 863     return c;
 864 }
 865
 866 #define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)
 867
 868 static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
 869 {
 870     return uadd64_overflow(a->frac, c, &r->frac);
 871 }
 872
 873 static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
 874 {
 875     c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
 876     return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
 877 }
 878
 879 #define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)
 880
 881 static void frac64_allones(FloatParts64 *a)
 882 {
 883     a->frac = -1;
 884 }
 885
 886 static void frac128_allones(FloatParts128 *a)
 887 {
 888     a->frac_hi = a->frac_lo = -1;
 889 }
 890
 891 #define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)
 892
 893 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
 894 {
 895     return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
 896 }
 897
 898 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
 899 {
 900     uint64_t ta = a->frac_hi, tb = b->frac_hi;
 901     if (ta == tb) {
 902         ta = a->frac_lo, tb = b->frac_lo;
 903         if (ta == tb) {
 904             return 0;
 905         }
 906     }
 907     return ta < tb ? -1 : 1;
 908 }
 909
 910 #define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)
 911
 912 static void frac64_clear(FloatParts64 *a)
 913 {
 914     a->frac = 0;
 915 }
 916
 917 static void frac128_clear(FloatParts128 *a)
 918 {
 919     a->frac_hi = a->frac_lo = 0;
 920 }
 921
 922 #define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)
 923
 924 static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
 925 {
 926     uint64_t n1, n0, r, q;
 927     bool ret;
 928
 929     /*
 930      * We want a 2*N / N-bit division to produce exactly an N-bit
 931      * result, so that we do not lose any precision and so that we
 932      * do not have to renormalize afterward.  If A.frac < B.frac,
 933      * then division would produce an (N-1)-bit result; shift A left
 934      * by one to produce the an N-bit result, and return true to
 935      * decrement the exponent to match.
 936      *
 937      * The udiv_qrnnd algorithm that we're using requires normalization,
 938      * i.e. the msb of the denominator must be set, which is already true.
 939      */
 940     ret = a->frac < b->frac;
 941     if (ret) {
 942         n0 = a->frac;
 943         n1 = 0;
 944     } else {
 945         n0 = a->frac >> 1;
 946         n1 = a->frac << 63;
 947     }
 948     q = udiv_qrnnd(&r, n0, n1, b->frac);
 949
 950     /* Set lsb if there is a remainder, to set inexact. */
 951     a->frac = q | (r != 0);
 952
 953     return ret;
 954 }
 955
 956 static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
 957 {
 958     uint64_t q0, q1, a0, a1, b0, b1;
 959     uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
 960     bool ret = false;
 961
 962     a0 = a->frac_hi, a1 = a->frac_lo;
 963     b0 = b->frac_hi, b1 = b->frac_lo;
 964
 965     ret = lt128(a0, a1, b0, b1);
 966     if (!ret) {
 967         a1 = shr_double(a0, a1, 1);
 968         a0 = a0 >> 1;
 969     }
 970
 971     /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
 972     q0 = estimateDiv128To64(a0, a1, b0);
 973
 974     /*
 975      * Estimate is high because B1 was not included (unless B1 == 0).
 976      * Reduce quotient and increase remainder until remainder is non-negative.
 977      * This loop will execute 0 to 2 times.
 978      */
 979     mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
 980     sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
 981     while (r0 != 0) {
 982         q0--;
 983         add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
 984     }
 985
 986     /* Repeat using the remainder, producing a second word of quotient. */
 987     q1 = estimateDiv128To64(r1, r2, b0);
 988     mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
 989     sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
 990     while (r1 != 0) {
 991         q1--;
 992         add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
 993     }
 994
 995     /* Any remainder indicates inexact; set sticky bit. */
 996     q1 |= (r2 | r3) != 0;
 997
 998     a->frac_hi = q0;
 999     a->frac_lo = q1;
1000     return ret;
1001 }
1002
/* Size-generic fraction divide; the variant is chosen from NUM. */
#define frac_div(NUM, DEN)  FRAC_GENERIC_64_128(div, NUM)(NUM, DEN)
1004
1005 static bool frac64_eqz(FloatParts64 *a)
1006 {
1007     return a->frac == 0;
1008 }
1009
1010 static bool frac128_eqz(FloatParts128 *a)
1011 {
1012     return (a->frac_hi | a->frac_lo) == 0;
1013 }
1014
/* Size-generic "fraction is zero" test; the variant is chosen from PARTS. */
#define frac_eqz(PARTS)  FRAC_GENERIC_64_128(eqz, PARTS)(PARTS)
1016
1017 static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
1018 {
1019     mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
1020 }
1021
1022 static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
1023 {
1024     mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
1025                 &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
1026 }
1027
/*
 * Size-generic widening multiply.  The variant is chosen from the first
 * source operand LHS, not from the (wider) destination.
 */
#define frac_mulw(DST, LHS, RHS) \
    FRAC_GENERIC_64_128(mulw, LHS)(DST, LHS, RHS)
1029
1030 static void frac64_neg(FloatParts64 *a)
1031 {
1032     a->frac = -a->frac;
1033 }
1034
1035 static void frac128_neg(FloatParts128 *a)
1036 {
1037     bool c = 0;
1038     a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
1039     a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
1040 }
1041
1042 static void frac256_neg(FloatParts256 *a)
1043 {
1044     bool c = 0;
1045     a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
1046     a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
1047     a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
1048     a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
1049 }
1050
/* Size-generic fraction negate; the variant is chosen from PARTS. */
#define frac_neg(PARTS)  FRAC_GENERIC_64_128_256(neg, PARTS)(PARTS)
1052
1053 static int frac64_normalize(FloatParts64 *a)
1054 {
1055     if (a->frac) {
1056         int shift = clz64(a->frac);
1057         a->frac <<= shift;
1058         return shift;
1059     }
1060     return 64;
1061 }
1062
1063 static int frac128_normalize(FloatParts128 *a)
1064 {
1065     if (a->frac_hi) {
1066         int shl = clz64(a->frac_hi);
1067         a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
1068         a->frac_lo <<= shl;
1069         return shl;
1070     } else if (a->frac_lo) {
1071         int shl = clz64(a->frac_lo);
1072         a->frac_hi = a->frac_lo << shl;
1073         a->frac_lo = 0;
1074         return shl + 64;
1075     }
1076     return 128;
1077 }
1078
1079 static int frac256_normalize(FloatParts256 *a)
1080 {
1081     uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
1082     uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
1083     int ret, shl;
1084
1085     if (likely(a0)) {
1086         shl = clz64(a0);
1087         if (shl == 0) {
1088             return 0;
1089         }
1090         ret = shl;
1091     } else {
1092         if (a1) {
1093             ret = 64;
1094             a0 = a1, a1 = a2, a2 = a3, a3 = 0;
1095         } else if (a2) {
1096             ret = 128;
1097             a0 = a2, a1 = a3, a2 = 0, a3 = 0;
1098         } else if (a3) {
1099             ret = 192;
1100             a0 = a3, a1 = 0, a2 = 0, a3 = 0;
1101         } else {
1102             ret = 256;
1103             a0 = 0, a1 = 0, a2 = 0, a3 = 0;
1104             goto done;
1105         }
1106         shl = clz64(a0);
1107         if (shl == 0) {
1108             goto done;
1109         }
1110         ret += shl;
1111     }
1112
1113     a0 = shl_double(a0, a1, shl);
1114     a1 = shl_double(a1, a2, shl);
1115     a2 = shl_double(a2, a3, shl);
1116     a3 <<= shl;
1117
1118  done:
1119     a->frac_hi = a0;
1120     a->frac_hm = a1;
1121     a->frac_lm = a2;
1122     a->frac_lo = a3;
1123     return ret;
1124 }
1125
/* Size-generic fraction normalize; the variant is chosen from PARTS. */
#define frac_normalize(PARTS) \
    FRAC_GENERIC_64_128_256(normalize, PARTS)(PARTS)
1127
1128 static void frac64_shl(FloatParts64 *a, int c)
1129 {
1130     a->frac <<= c;
1131 }
1132
1133 static void frac128_shl(FloatParts128 *a, int c)
1134 {
1135     uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
1136
1137     if (c & 64) {
1138         a0 = a1, a1 = 0;
1139     }
1140
1141     c &= 63;
1142     if (c) {
1143         a0 = shl_double(a0, a1, c);
1144         a1 = a1 << c;
1145     }
1146
1147     a->frac_hi = a0;
1148     a->frac_lo = a1;
1149 }
1150
/* Size-generic fraction shift-left; the variant is chosen from PARTS. */
#define frac_shl(PARTS, CNT)  FRAC_GENERIC_64_128(shl, PARTS)(PARTS, CNT)
1152
1153 static void frac64_shr(FloatParts64 *a, int c)
1154 {
1155     a->frac >>= c;
1156 }
1157
1158 static void frac128_shr(FloatParts128 *a, int c)
1159 {
1160     uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
1161
1162     if (c & 64) {
1163         a1 = a0, a0 = 0;
1164     }
1165
1166     c &= 63;
1167     if (c) {
1168         a1 = shr_double(a0, a1, c);
1169         a0 = a0 >> c;
1170     }
1171
1172     a->frac_hi = a0;
1173     a->frac_lo = a1;
1174 }
1175
/* Size-generic fraction shift-right; the variant is chosen from PARTS. */
#define frac_shr(PARTS, CNT)  FRAC_GENERIC_64_128(shr, PARTS)(PARTS, CNT)
1177
1178 static void frac64_shrjam(FloatParts64 *a, int c)
1179 {
1180     uint64_t a0 = a->frac;
1181
1182     if (likely(c != 0)) {
1183         if (likely(c < 64)) {
1184             a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
1185         } else {
1186             a0 = a0 != 0;
1187         }
1188         a->frac = a0;
1189     }
1190 }
1191
1192 static void frac128_shrjam(FloatParts128 *a, int c)
1193 {
1194     uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
1195     uint64_t sticky = 0;
1196
1197     if (unlikely(c == 0)) {
1198         return;
1199     } else if (likely(c < 64)) {
1200         /* nothing */
1201     } else if (likely(c < 128)) {
1202         sticky = a1;
1203         a1 = a0;
1204         a0 = 0;
1205         c &= 63;
1206         if (c == 0) {
1207             goto done;
1208         }
1209     } else {
1210         sticky = a0 | a1;
1211         a0 = a1 = 0;
1212         goto done;
1213     }
1214
1215     sticky |= shr_double(a1, 0, c);
1216     a1 = shr_double(a0, a1, c);
1217     a0 = a0 >> c;
1218
1219  done:
1220     a->frac_lo = a1 | (sticky != 0);
1221     a->frac_hi = a0;
1222 }
1223
1224 static void frac256_shrjam(FloatParts256 *a, int c)
1225 {
1226     uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
1227     uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
1228     uint64_t sticky = 0;
1229
1230     if (unlikely(c == 0)) {
1231         return;
1232     } else if (likely(c < 64)) {
1233         /* nothing */
1234     } else if (likely(c < 256)) {
1235         if (unlikely(c & 128)) {
1236             sticky |= a2 | a3;
1237             a3 = a1, a2 = a0, a1 = 0, a0 = 0;
1238         }
1239         if (unlikely(c & 64)) {
1240             sticky |= a3;
1241             a3 = a2, a2 = a1, a1 = a0, a0 = 0;
1242         }
1243         c &= 63;
1244         if (c == 0) {
1245             goto done;
1246         }
1247     } else {
1248         sticky = a0 | a1 | a2 | a3;
1249         a0 = a1 = a2 = a3 = 0;
1250         goto done;
1251     }
1252
1253     sticky |= shr_double(a3, 0, c);
1254     a3 = shr_double(a2, a3, c);
1255     a2 = shr_double(a1, a2, c);
1256     a1 = shr_double(a0, a1, c);
1257     a0 = a0 >> c;
1258
1259  done:
1260     a->frac_lo = a3 | (sticky != 0);
1261     a->frac_lm = a2;
1262     a->frac_hm = a1;
1263     a->frac_hi = a0;
1264 }
1265
/* Size-generic shift-right-and-jam; the variant is chosen from PARTS. */
#define frac_shrjam(PARTS, CNT) \
    FRAC_GENERIC_64_128_256(shrjam, PARTS)(PARTS, CNT)
1267
1268 static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
1269 {
1270     return usub64_overflow(a->frac, b->frac, &r->frac);
1271 }
1272
1273 static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
1274 {
1275     bool c = 0;
1276     r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
1277     r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
1278     return c;
1279 }
1280
1281 static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
1282 {
1283     bool c = 0;
1284     r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
1285     r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
1286     r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
1287     r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
1288     return c;
1289 }
1290
/* Size-generic fraction subtract; the variant is chosen from DST. */
#define frac_sub(DST, LHS, RHS) \
    FRAC_GENERIC_64_128_256(sub, DST)(DST, LHS, RHS)
1292
1293 static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
1294 {
1295     r->frac = a->frac_hi | (a->frac_lo != 0);
1296 }
1297
1298 static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
1299 {
1300     r->frac_hi = a->frac_hi;
1301     r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
1302 }
1303
/*
 * Size-generic truncate-and-jam.  The variant is chosen from the (narrower)
 * destination DST.
 */
#define frac_truncjam(DST, SRC) \
    FRAC_GENERIC_64_128(truncjam, DST)(DST, SRC)
1305
1306 static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
1307 {
1308     r->frac_hi = a->frac;
1309     r->frac_lo = 0;
1310 }
1311
1312 static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
1313 {
1314     r->frac_hi = a->frac_hi;
1315     r->frac_hm = a->frac_lo;
1316     r->frac_lm = 0;
1317     r->frac_lo = 0;
1318 }
1319
/*
 * Size-generic widen.  The variant is chosen from the (narrower) source
 * SRC, which is the second argument.
 */
#define frac_widen(DST, SRC)  FRAC_GENERIC_64_128(widen, SRC)(DST, SRC)
1321
1322 #define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
1323 #define FloatPartsN    glue(FloatParts,N)
1324 #define FloatPartsW    glue(FloatParts,W)
1325
1326 #define N 64
1327 #define W 128
1328
1329 #include "softfloat-parts-addsub.c.inc"
1330 #include "softfloat-parts.c.inc"
1331
1332 #undef  N
1333 #undef  W
1334 #define N 128
1335 #define W 256
1336
1337 #include "softfloat-parts-addsub.c.inc"
1338 #include "softfloat-parts.c.inc"
1339
1340 #undef  N
1341 #undef  W
1342 #define N            256
1343
1344 #include "softfloat-parts-addsub.c.inc"
1345
1346 #undef  N
1347 #undef  W
1348 #undef  partsN
1349 #undef  FloatPartsN
1350 #undef  FloatPartsW
1351
1352 /*
1353  * Pack/unpack routines with a specific FloatFmt.
1354  */
1355
1356 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
1357                                       float_status *s, const FloatFmt *params)
1358 {
1359     float16_unpack_raw(p, f);
1360     parts_canonicalize(p, s, params);
1361 }
1362
1363 static void float16_unpack_canonical(FloatParts64 *p, float16 f,
1364                                      float_status *s)
1365 {
1366     float16a_unpack_canonical(p, f, s, &float16_params);
1367 }
1368
1369 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
1370                                       float_status *s)
1371 {
1372     bfloat16_unpack_raw(p, f);
1373     parts_canonicalize(p, s, &bfloat16_params);
1374 }
1375
1376 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1377                                              float_status *s,
1378                                              const FloatFmt *params)
1379 {
1380     parts_uncanon(p, s, params);
1381     return float16_pack_raw(p);
1382 }
1383
1384 static float16 float16_round_pack_canonical(FloatParts64 *p,
1385                                             float_status *s)
1386 {
1387     return float16a_round_pack_canonical(p, s, &float16_params);
1388 }
1389
1390 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1391                                               float_status *s)
1392 {
1393     parts_uncanon(p, s, &bfloat16_params);
1394     return bfloat16_pack_raw(p);
1395 }
1396
1397 static void float32_unpack_canonical(FloatParts64 *p, float32 f,
1398                                      float_status *s)
1399 {
1400     float32_unpack_raw(p, f);
1401     parts_canonicalize(p, s, &float32_params);
1402 }
1403
1404 static float32 float32_round_pack_canonical(FloatParts64 *p,
1405                                             float_status *s)
1406 {
1407     parts_uncanon(p, s, &float32_params);
1408     return float32_pack_raw(p);
1409 }
1410
1411 static void float64_unpack_canonical(FloatParts64 *p, float64 f,
1412                                      float_status *s)
1413 {
1414     float64_unpack_raw(p, f);
1415     parts_canonicalize(p, s, &float64_params);
1416 }
1417
1418 static float64 float64_round_pack_canonical(FloatParts64 *p,
1419                                             float_status *s)
1420 {
1421     parts_uncanon(p, s, &float64_params);
1422     return float64_pack_raw(p);
1423 }
1424
1425 static void float128_unpack_canonical(FloatParts128 *p, float128 f,
1426                                       float_status *s)
1427 {
1428     float128_unpack_raw(p, f);
1429     parts_canonicalize(p, s, &float128_params);
1430 }
1431
1432 static float128 float128_round_pack_canonical(FloatParts128 *p,
1433                                               float_status *s)
1434 {
1435     parts_uncanon(p, s, &float128_params);
1436     return float128_pack_raw(p);
1437 }
1438
1439 /*
1440  * Addition and subtraction
1441  */
1442
1443 static float16 QEMU_FLATTEN
1444 float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
1445 {
1446     FloatParts64 pa, pb, *pr;
1447
1448     float16_unpack_canonical(&pa, a, status);
1449     float16_unpack_canonical(&pb, b, status);
1450     pr = parts_addsub(&pa, &pb, status, subtract);
1451
1452     return float16_round_pack_canonical(pr, status);
1453 }
1454
1455 float16 float16_add(float16 a, float16 b, float_status *status)
1456 {
1457     return float16_addsub(a, b, status, false);
1458 }
1459
1460 float16 float16_sub(float16 a, float16 b, float_status *status)
1461 {
1462     return float16_addsub(a, b, status, true);
1463 }
1464
1465 static float32 QEMU_SOFTFLOAT_ATTR
1466 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
1467 {
1468     FloatParts64 pa, pb, *pr;
1469
1470     float32_unpack_canonical(&pa, a, status);
1471     float32_unpack_canonical(&pb, b, status);
1472     pr = parts_addsub(&pa, &pb, status, subtract);
1473
1474     return float32_round_pack_canonical(pr, status);
1475 }
1476
1477 static float32 soft_f32_add(float32 a, float32 b, float_status *status)
1478 {
1479     return soft_f32_addsub(a, b, status, false);
1480 }
1481
1482 static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1483 {
1484     return soft_f32_addsub(a, b, status, true);
1485 }
1486
1487 static float64 QEMU_SOFTFLOAT_ATTR
1488 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
1489 {
1490     FloatParts64 pa, pb, *pr;
1491
1492     float64_unpack_canonical(&pa, a, status);
1493     float64_unpack_canonical(&pb, b, status);
1494     pr = parts_addsub(&pa, &pb, status, subtract);
1495
1496     return float64_round_pack_canonical(pr, status);
1497 }
1498
1499 static float64 soft_f64_add(float64 a, float64 b, float_status *status)
1500 {
1501     return soft_f64_addsub(a, b, status, false);
1502 }
1503
1504 static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1505 {
1506     return soft_f64_addsub(a, b, status, true);
1507 }
1508
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1513
/* Host-FPU single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1518
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1523
/* Host-FPU double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1528
1529 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1530 {
1531     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1532         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1533     }
1534     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1535 }
1536
1537 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1538 {
1539     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1540         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1541     } else {
1542         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1543     }
1544 }
1545
1546 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1547                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1548 {
1549     return float32_gen2(a, b, s, hard, soft,
1550                         f32_is_zon2, f32_addsubmul_post);
1551 }
1552
1553 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1554                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1555 {
1556     return float64_gen2(a, b, s, hard, soft,
1557                         f64_is_zon2, f64_addsubmul_post);
1558 }
1559
1560 float32 QEMU_FLATTEN
1561 float32_add(float32 a, float32 b, float_status *s)
1562 {
1563     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1564 }
1565
1566 float32 QEMU_FLATTEN
1567 float32_sub(float32 a, float32 b, float_status *s)
1568 {
1569     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1570 }
1571
1572 float64 QEMU_FLATTEN
1573 float64_add(float64 a, float64 b, float_status *s)
1574 {
1575     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1576 }
1577
1578 float64 QEMU_FLATTEN
1579 float64_sub(float64 a, float64 b, float_status *s)
1580 {
1581     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1582 }
1583
1584 static bfloat16 QEMU_FLATTEN
1585 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
1586 {
1587     FloatParts64 pa, pb, *pr;
1588
1589     bfloat16_unpack_canonical(&pa, a, status);
1590     bfloat16_unpack_canonical(&pb, b, status);
1591     pr = parts_addsub(&pa, &pb, status, subtract);
1592
1593     return bfloat16_round_pack_canonical(pr, status);
1594 }
1595
1596 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1597 {
1598     return bfloat16_addsub(a, b, status, false);
1599 }
1600
1601 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1602 {
1603     return bfloat16_addsub(a, b, status, true);
1604 }
1605
1606 static float128 QEMU_FLATTEN
1607 float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
1608 {
1609     FloatParts128 pa, pb, *pr;
1610
1611     float128_unpack_canonical(&pa, a, status);
1612     float128_unpack_canonical(&pb, b, status);
1613     pr = parts_addsub(&pa, &pb, status, subtract);
1614
1615     return float128_round_pack_canonical(pr, status);
1616 }
1617
1618 float128 float128_add(float128 a, float128 b, float_status *status)
1619 {
1620     return float128_addsub(a, b, status, false);
1621 }
1622
1623 float128 float128_sub(float128 a, float128 b, float_status *status)
1624 {
1625     return float128_addsub(a, b, status, true);
1626 }
1627
1628 /*
1629  * Multiplication
1630  */
1631
1632 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1633 {
1634     FloatParts64 pa, pb, *pr;
1635
1636     float16_unpack_canonical(&pa, a, status);
1637     float16_unpack_canonical(&pb, b, status);
1638     pr = parts_mul(&pa, &pb, status);
1639
1640     return float16_round_pack_canonical(pr, status);
1641 }
1642
1643 static float32 QEMU_SOFTFLOAT_ATTR
1644 soft_f32_mul(float32 a, float32 b, float_status *status)
1645 {
1646     FloatParts64 pa, pb, *pr;
1647
1648     float32_unpack_canonical(&pa, a, status);
1649     float32_unpack_canonical(&pb, b, status);
1650     pr = parts_mul(&pa, &pb, status);
1651
1652     return float32_round_pack_canonical(pr, status);
1653 }
1654
1655 static float64 QEMU_SOFTFLOAT_ATTR
1656 soft_f64_mul(float64 a, float64 b, float_status *status)
1657 {
1658     FloatParts64 pa, pb, *pr;
1659
1660     float64_unpack_canonical(&pa, a, status);
1661     float64_unpack_canonical(&pb, b, status);
1662     pr = parts_mul(&pa, &pb, status);
1663
1664     return float64_round_pack_canonical(pr, status);
1665 }
1666
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1671
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1676
1677 float32 QEMU_FLATTEN
1678 float32_mul(float32 a, float32 b, float_status *s)
1679 {
1680     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1681                         f32_is_zon2, f32_addsubmul_post);
1682 }
1683
1684 float64 QEMU_FLATTEN
1685 float64_mul(float64 a, float64 b, float_status *s)
1686 {
1687     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1688                         f64_is_zon2, f64_addsubmul_post);
1689 }
1690
1691 bfloat16 QEMU_FLATTEN
1692 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1693 {
1694     FloatParts64 pa, pb, *pr;
1695
1696     bfloat16_unpack_canonical(&pa, a, status);
1697     bfloat16_unpack_canonical(&pb, b, status);
1698     pr = parts_mul(&pa, &pb, status);
1699
1700     return bfloat16_round_pack_canonical(pr, status);
1701 }
1702
1703 float128 QEMU_FLATTEN
1704 float128_mul(float128 a, float128 b, float_status *status)
1705 {
1706     FloatParts128 pa, pb, *pr;
1707
1708     float128_unpack_canonical(&pa, a, status);
1709     float128_unpack_canonical(&pb, b, status);
1710     pr = parts_mul(&pa, &pb, status);
1711
1712     return float128_round_pack_canonical(pr, status);
1713 }
1714
1715 /*
1716  * Fused multiply-add
1717  */
1718
1719 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1720                                     int flags, float_status *status)
1721 {
1722     FloatParts64 pa, pb, pc, *pr;
1723
1724     float16_unpack_canonical(&pa, a, status);
1725     float16_unpack_canonical(&pb, b, status);
1726     float16_unpack_canonical(&pc, c, status);
1727     pr = parts_muladd(&pa, &pb, &pc, flags, status);
1728
1729     return float16_round_pack_canonical(pr, status);
1730 }
1731
1732 static float32 QEMU_SOFTFLOAT_ATTR
1733 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1734                 float_status *status)
1735 {
1736     FloatParts64 pa, pb, pc, *pr;
1737
1738     float32_unpack_canonical(&pa, a, status);
1739     float32_unpack_canonical(&pb, b, status);
1740     float32_unpack_canonical(&pc, c, status);
1741     pr = parts_muladd(&pa, &pb, &pc, flags, status);
1742
1743     return float32_round_pack_canonical(pr, status);
1744 }
1745
1746 static float64 QEMU_SOFTFLOAT_ATTR
1747 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1748                 float_status *status)
1749 {
1750     FloatParts64 pa, pb, pc, *pr;
1751
1752     float64_unpack_canonical(&pa, a, status);
1753     float64_unpack_canonical(&pb, b, status);
1754     float64_unpack_canonical(&pc, c, status);
1755     pr = parts_muladd(&pa, &pb, &pc, flags, status);
1756
1757     return float64_round_pack_canonical(pr, status);
1758 }
1759
/*
 * When true, float32_muladd() and float64_muladd() skip the host-FPU fast
 * path and jump to their 'soft' label instead.
 */
static bool force_soft_fma;
1761
/*
 * Fused multiply-add on float32 operands, with an optional fast path
 * through the host's fmaf().
 *
 * @xa, @xb, @xc: operands of the multiply-add
 * @flags: float_muladd_* modifier bits; negate_product, negate_c,
 *         negate_result and halve_result are examined here
 * @s: floating-point status, handed to every helper called below
 *
 * Every case the fast path does not handle is forwarded to
 * soft_f32_muladd() together with the unmodified @flags.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Halving the result is left to the soft implementation. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    /*
     * NOTE(review): presumably flushes denormal inputs and then requires
     * all three operands to be zero or normal -- confirm against the
     * helpers.  Whatever float32_input_flush3() leaves in ua/ub/uc is what
     * the soft path receives from here on.
     */
    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product by hand, honouring negate_product. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /*
         * The negations below modify ua/uc in place.  Keep copies so the
         * soft path, which is given @flags again, can be run on the
         * un-negated operands.
         */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /*
             * Result magnitude is at or below the smallest normal float:
             * redo the operation in the soft implementation (presumably
             * so that underflow is detected there).
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1832
1833 float64 QEMU_FLATTEN
1834 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1835 {
1836     union_float64 ua, ub, uc, ur;
1837
1838     ua.s = xa;
1839     ub.s = xb;
1840     uc.s = xc;
1841
1842     if (unlikely(!can_use_fpu(s))) {
1843         goto soft;
1844     }
1845     if (unlikely(flags & float_muladd_halve_result)) {
1846         goto soft;
1847     }
1848
1849     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1850     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1851         goto soft;
1852     }
1853
1854     if (unlikely(force_soft_fma)) {
1855         goto soft;
1856     }
1857
1858     /*
1859      * When (a || b) == 0, there's no need to check for under/over flow,
1860      * since we know the addend is (normal || 0) and the product is 0.
1861      */
1862     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1863         union_float64 up;
1864         bool prod_sign;
1865
1866         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1867         prod_sign ^= !!(flags & float_muladd_negate_product);
1868         up.s = float64_set_sign(float64_zero, prod_sign);
1869
1870         if (flags & float_muladd_negate_c) {
1871             uc.h = -uc.h;
1872         }
1873         ur.h = up.h + uc.h;
1874     } else {
1875         union_float64 ua_orig = ua;
1876         union_float64 uc_orig = uc;
1877
1878         if (flags & float_muladd_negate_product) {
1879             ua.h = -ua.h;
1880         }
1881         if (flags & float_muladd_negate_c) {
1882             uc.h = -uc.h;
1883         }
1884
1885         ur.h = fma(ua.h, ub.h, uc.h);
1886
1887         if (unlikely(f64_is_inf(ur))) {
1888             float_raise(float_flag_overflow, s);
1889         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1890             ua = ua_orig;
1891             uc = uc_orig;
1892             goto soft;
1893         }
1894     }
1895     if (flags & float_muladd_negate_result) {
1896         return float64_chs(ur.s);
1897     }
1898     return ur.s;
1899
1900  soft:
1901     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1902 }
1903
1904 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1905                                       int flags, float_status *status)
1906 {
1907     FloatParts64 pa, pb, pc, *pr;
1908
1909     bfloat16_unpack_canonical(&pa, a, status);
1910     bfloat16_unpack_canonical(&pb, b, status);
1911     bfloat16_unpack_canonical(&pc, c, status);
1912     pr = parts_muladd(&pa, &pb, &pc, flags, status);
1913
1914     return bfloat16_round_pack_canonical(pr, status);
1915 }
1916
1917 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
1918                                       int flags, float_status *status)
1919 {
1920     FloatParts128 pa, pb, pc, *pr;
1921
1922     float128_unpack_canonical(&pa, a, status);
1923     float128_unpack_canonical(&pb, b, status);
1924     float128_unpack_canonical(&pc, c, status);
1925     pr = parts_muladd(&pa, &pb, &pc, flags, status);
1926
1927     return float128_round_pack_canonical(pr, status);
1928 }
1929
1930 /*
1931  * Division
1932  */
1933
1934 float16 float16_div(float16 a, float16 b, float_status *status)
1935 {
1936     FloatParts64 pa, pb, *pr;
1937
1938     float16_unpack_canonical(&pa, a, status);
1939     float16_unpack_canonical(&pb, b, status);
1940     pr = parts_div(&pa, &pb, status);
1941
1942     return float16_round_pack_canonical(pr, status);
1943 }
1944
1945 static float32 QEMU_SOFTFLOAT_ATTR
1946 soft_f32_div(float32 a, float32 b, float_status *status)
1947 {
1948     FloatParts64 pa, pb, *pr;
1949
1950     float32_unpack_canonical(&pa, a, status);
1951     float32_unpack_canonical(&pb, b, status);
1952     pr = parts_div(&pa, &pb, status);
1953
1954     return float32_round_pack_canonical(pr, status);
1955 }
1956
1957 static float64 QEMU_SOFTFLOAT_ATTR
1958 soft_f64_div(float64 a, float64 b, float_status *status)
1959 {
1960     FloatParts64 pa, pb, *pr;
1961
1962     float64_unpack_canonical(&pa, a, status);
1963     float64_unpack_canonical(&pb, b, status);
1964     pr = parts_div(&pa, &pb, status);
1965
1966     return float64_round_pack_canonical(pr, status);
1967 }
1968
/* Host single-precision division, used as the float32_div() fast path. */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1973
/* Host double-precision division, used as the float64_div() fast path. */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1978
1979 static bool f32_div_pre(union_float32 a, union_float32 b)
1980 {
1981     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1982         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1983                fpclassify(b.h) == FP_NORMAL;
1984     }
1985     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1986 }
1987
1988 static bool f64_div_pre(union_float64 a, union_float64 b)
1989 {
1990     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1991         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1992                fpclassify(b.h) == FP_NORMAL;
1993     }
1994     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1995 }
1996
1997 static bool f32_div_post(union_float32 a, union_float32 b)
1998 {
1999     if (QEMU_HARDFLOAT_2F32_USE_FP) {
2000         return fpclassify(a.h) != FP_ZERO;
2001     }
2002     return !float32_is_zero(a.s);
2003 }
2004
2005 static bool f64_div_post(union_float64 a, union_float64 b)
2006 {
2007     if (QEMU_HARDFLOAT_2F64_USE_FP) {
2008         return fpclassify(a.h) != FP_ZERO;
2009     }
2010     return !float64_is_zero(a.s);
2011 }
2012
2013 float32 QEMU_FLATTEN
2014 float32_div(float32 a, float32 b, float_status *s)
2015 {
2016     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
2017                         f32_div_pre, f32_div_post);
2018 }
2019
2020 float64 QEMU_FLATTEN
2021 float64_div(float64 a, float64 b, float_status *s)
2022 {
2023     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
2024                         f64_div_pre, f64_div_post);
2025 }
2026
2027 bfloat16 QEMU_FLATTEN
2028 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2029 {
2030     FloatParts64 pa, pb, *pr;
2031
2032     bfloat16_unpack_canonical(&pa, a, status);
2033     bfloat16_unpack_canonical(&pb, b, status);
2034     pr = parts_div(&pa, &pb, status);
2035
2036     return bfloat16_round_pack_canonical(pr, status);
2037 }
2038
2039 float128 QEMU_FLATTEN
2040 float128_div(float128 a, float128 b, float_status *status)
2041 {
2042     FloatParts128 pa, pb, *pr;
2043
2044     float128_unpack_canonical(&pa, a, status);
2045     float128_unpack_canonical(&pb, b, status);
2046     pr = parts_div(&pa, &pb, status);
2047
2048     return float128_round_pack_canonical(pr, status);
2049 }
2050
2051 /*
2052  * Float to Float conversions
2053  *
2054  * Returns the result of converting one float format to another. The
2055  * conversion is performed according to the IEC/IEEE Standard for
2056  * Binary Floating-Point Arithmetic.
2057  *
2058  * Usually this only needs to take care of raising invalid exceptions
2059  * and handling the conversion on NaNs.
2060  */
2061
2062 static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
2063 {
2064     switch (a->cls) {
2065     case float_class_qnan:
2066     case float_class_snan:
2067         /*
2068          * There is no NaN in the destination format.  Raise Invalid
2069          * and return a zero with the sign of the input NaN.
2070          */
2071         float_raise(float_flag_invalid, s);
2072         a->cls = float_class_zero;
2073         break;
2074
2075     case float_class_inf:
2076         /*
2077          * There is no Inf in the destination format.  Raise Invalid
2078          * and return the maximum normal with the correct sign.
2079          */
2080         float_raise(float_flag_invalid, s);
2081         a->cls = float_class_normal;
2082         a->exp = float16_params_ahp.exp_max;
2083         a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
2084                                   float16_params_ahp.frac_size + 1);
2085         break;
2086
2087     case float_class_normal:
2088     case float_class_zero:
2089         break;
2090
2091     default:
2092         g_assert_not_reached();
2093     }
2094 }
2095
2096 static void parts64_float_to_float(FloatParts64 *a, float_status *s)
2097 {
2098     if (is_nan(a->cls)) {
2099         parts_return_nan(a, s);
2100     }
2101 }
2102
2103 static void parts128_float_to_float(FloatParts128 *a, float_status *s)
2104 {
2105     if (is_nan(a->cls)) {
2106         parts_return_nan(a, s);
2107     }
2108 }
2109
/*
 * Generic front end for parts64_float_to_float() and
 * parts128_float_to_float().  NOTE(review): PARTS_GENERIC_64_128 is
 * defined elsewhere; presumably it selects the variant from the type of
 * P -- confirm.
 */
#define parts_float_to_float(P, S) \
    PARTS_GENERIC_64_128(float_to_float, P)(P, S)
2112
2113 static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
2114                                         float_status *s)
2115 {
2116     a->cls = b->cls;
2117     a->sign = b->sign;
2118     a->exp = b->exp;
2119
2120     if (a->cls == float_class_normal) {
2121         frac_truncjam(a, b);
2122     } else if (is_nan(a->cls)) {
2123         /* Discard the low bits of the NaN. */
2124         a->frac = b->frac_hi;
2125         parts_return_nan(a, s);
2126     }
2127 }
2128
2129 static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
2130                                        float_status *s)
2131 {
2132     a->cls = b->cls;
2133     a->sign = b->sign;
2134     a->exp = b->exp;
2135     frac_widen(a, b);
2136
2137     if (is_nan(a->cls)) {
2138         parts_return_nan(a, s);
2139     }
2140 }
2141
2142 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2143 {
2144     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2145     FloatParts64 p;
2146
2147     float16a_unpack_canonical(&p, a, s, fmt16);
2148     parts_float_to_float(&p, s);
2149     return float32_round_pack_canonical(&p, s);
2150 }
2151
2152 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2153 {
2154     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2155     FloatParts64 p;
2156
2157     float16a_unpack_canonical(&p, a, s, fmt16);
2158     parts_float_to_float(&p, s);
2159     return float64_round_pack_canonical(&p, s);
2160 }
2161
2162 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2163 {
2164     FloatParts64 p;
2165     const FloatFmt *fmt;
2166
2167     float32_unpack_canonical(&p, a, s);
2168     if (ieee) {
2169         parts_float_to_float(&p, s);
2170         fmt = &float16_params;
2171     } else {
2172         parts_float_to_ahp(&p, s);
2173         fmt = &float16_params_ahp;
2174     }
2175     return float16a_round_pack_canonical(&p, s, fmt);
2176 }
2177
2178 static float64 QEMU_SOFTFLOAT_ATTR
2179 soft_float32_to_float64(float32 a, float_status *s)
2180 {
2181     FloatParts64 p;
2182
2183     float32_unpack_canonical(&p, a, s);
2184     parts_float_to_float(&p, s);
2185     return float64_round_pack_canonical(&p, s);
2186 }
2187
2188 float64 float32_to_float64(float32 a, float_status *s)
2189 {
2190     if (likely(float32_is_normal(a))) {
2191         /* Widening conversion can never produce inexact results.  */
2192         union_float32 uf;
2193         union_float64 ud;
2194         uf.s = a;
2195         ud.h = uf.h;
2196         return ud.s;
2197     } else if (float32_is_zero(a)) {
2198         return float64_set_sign(float64_zero, float32_is_neg(a));
2199     } else {
2200         return soft_float32_to_float64(a, s);
2201     }
2202 }
2203
2204 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2205 {
2206     FloatParts64 p;
2207     const FloatFmt *fmt;
2208
2209     float64_unpack_canonical(&p, a, s);
2210     if (ieee) {
2211         parts_float_to_float(&p, s);
2212         fmt = &float16_params;
2213     } else {
2214         parts_float_to_ahp(&p, s);
2215         fmt = &float16_params_ahp;
2216     }
2217     return float16a_round_pack_canonical(&p, s, fmt);
2218 }
2219
2220 float32 float64_to_float32(float64 a, float_status *s)
2221 {
2222     FloatParts64 p;
2223
2224     float64_unpack_canonical(&p, a, s);
2225     parts_float_to_float(&p, s);
2226     return float32_round_pack_canonical(&p, s);
2227 }
2228
2229 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2230 {
2231     FloatParts64 p;
2232
2233     bfloat16_unpack_canonical(&p, a, s);
2234     parts_float_to_float(&p, s);
2235     return float32_round_pack_canonical(&p, s);
2236 }
2237
2238 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2239 {
2240     FloatParts64 p;
2241
2242     bfloat16_unpack_canonical(&p, a, s);
2243     parts_float_to_float(&p, s);
2244     return float64_round_pack_canonical(&p, s);
2245 }
2246
2247 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2248 {
2249     FloatParts64 p;
2250
2251     float32_unpack_canonical(&p, a, s);
2252     parts_float_to_float(&p, s);
2253     return bfloat16_round_pack_canonical(&p, s);
2254 }
2255
2256 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2257 {
2258     FloatParts64 p;
2259
2260     float64_unpack_canonical(&p, a, s);
2261     parts_float_to_float(&p, s);
2262     return bfloat16_round_pack_canonical(&p, s);
2263 }
2264
2265 float32 float128_to_float32(float128 a, float_status *s)
2266 {
2267     FloatParts64 p64;
2268     FloatParts128 p128;
2269
2270     float128_unpack_canonical(&p128, a, s);
2271     parts_float_to_float_narrow(&p64, &p128, s);
2272     return float32_round_pack_canonical(&p64, s);
2273 }
2274
2275 float64 float128_to_float64(float128 a, float_status *s)
2276 {
2277     FloatParts64 p64;
2278     FloatParts128 p128;
2279
2280     float128_unpack_canonical(&p128, a, s);
2281     parts_float_to_float_narrow(&p64, &p128, s);
2282     return float64_round_pack_canonical(&p64, s);
2283 }
2284
2285 float128 float32_to_float128(float32 a, float_status *s)
2286 {
2287     FloatParts64 p64;
2288     FloatParts128 p128;
2289
2290     float32_unpack_canonical(&p64, a, s);
2291     parts_float_to_float_widen(&p128, &p64, s);
2292     return float128_round_pack_canonical(&p128, s);
2293 }
2294
2295 float128 float64_to_float128(float64 a, float_status *s)
2296 {
2297     FloatParts64 p64;
2298     FloatParts128 p128;
2299
2300     float64_unpack_canonical(&p64, a, s);
2301     parts_float_to_float_widen(&p128, &p64, s);
2302     return float128_round_pack_canonical(&p128, s);
2303 }
2304
2305 /*
2306  * Round to integral value
2307  */
2308
2309 float16 float16_round_to_int(float16 a, float_status *s)
2310 {
2311     FloatParts64 p;
2312
2313     float16_unpack_canonical(&p, a, s);
2314     parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params);
2315     return float16_round_pack_canonical(&p, s);
2316 }
2317
2318 float32 float32_round_to_int(float32 a, float_status *s)
2319 {
2320     FloatParts64 p;
2321
2322     float32_unpack_canonical(&p, a, s);
2323     parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params);
2324     return float32_round_pack_canonical(&p, s);
2325 }
2326
2327 float64 float64_round_to_int(float64 a, float_status *s)
2328 {
2329     FloatParts64 p;
2330
2331     float64_unpack_canonical(&p, a, s);
2332     parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params);
2333     return float64_round_pack_canonical(&p, s);
2334 }
2335
2336 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2337 {
2338     FloatParts64 p;
2339
2340     bfloat16_unpack_canonical(&p, a, s);
2341     parts_round_to_int(&p, s->float_rounding_mode, 0, s, &bfloat16_params);
2342     return bfloat16_round_pack_canonical(&p, s);
2343 }
2344
2345 float128 float128_round_to_int(float128 a, float_status *s)
2346 {
2347     FloatParts128 p;
2348
2349     float128_unpack_canonical(&p, a, s);
2350     parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params);
2351     return float128_round_pack_canonical(&p, s);
2352 }
2353
2354 /*
2355  * Returns the result of converting the floating-point value `a' to
2356  * the two's complement integer format. The conversion is performed
2357  * according to the IEC/IEEE Standard for Binary Floating-Point
2358  * Arithmetic---which means in particular that the conversion is
2359  * rounded according to the current rounding mode. If `a' is a NaN,
2360  * the largest positive integer is returned. Otherwise, if the
2361  * conversion overflows, the largest integer with the same sign as `a'
2362  * is returned.
2363 */
2364
/*
 * Shared back end for the float -> signed integer conversions.
 *
 * @p: unpacked input value (taken by value and modified locally)
 * @rmode, @scale: forwarded to parts_round_to_int_normal()
 * @min, @max: bounds of the destination integer type
 * @s: status that receives the exception flags
 *
 * Returns the converted value, saturated to [@min, @max].
 */
static int64_t round_to_int_and_pack(FloatParts64 p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN: Invalid, and the result is @max. */
        flags = float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        /* Infinity: Invalid, saturating toward the sign of the input. */
        flags = float_flag_invalid;
        r = p.sign ? min : max;
        break;

    case float_class_zero:
        /* No flag to raise, so the float_raise() below is skipped. */
        return 0;

    case float_class_normal:
        /* TODO: 62 = N - 2, frac_size for rounding */
        if (parts_round_to_int_normal(&p, rmode, scale, 62)) {
            flags = float_flag_inexact;
        }

        /*
         * Extract the magnitude.  When the exponent exceeds
         * DECOMPOSED_BINARY_POINT the shift count would be negative, so
         * UINT64_MAX stands in and makes the range checks below saturate.
         */
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            r = UINT64_MAX;
        }
        /*
         * Range-check the magnitude in unsigned arithmetic;
         * -(uint64_t)min is the magnitude of @min.  An out-of-range value
         * replaces any Inexact recorded above with Invalid.
         */
        if (p.sign) {
            if (r <= -(uint64_t)min) {
                r = -r;
            } else {
                flags = float_flag_invalid;
                r = min;
            }
        } else if (r > max) {
            flags = float_flag_invalid;
            r = max;
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}
2418
2419 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2420                               float_status *s)
2421 {
2422     FloatParts64 p;
2423
2424     float16_unpack_canonical(&p, a, s);
2425     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2426 }
2427
2428 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2429                                 float_status *s)
2430 {
2431     FloatParts64 p;
2432
2433     float16_unpack_canonical(&p, a, s);
2434     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2435 }
2436
2437 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2438                                 float_status *s)
2439 {
2440     FloatParts64 p;
2441
2442     float16_unpack_canonical(&p, a, s);
2443     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2444 }
2445
2446 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2447                                 float_status *s)
2448 {
2449     FloatParts64 p;
2450
2451     float16_unpack_canonical(&p, a, s);
2452     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2453 }
2454
2455 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2456                                 float_status *s)
2457 {
2458     FloatParts64 p;
2459
2460     float32_unpack_canonical(&p, a, s);
2461     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2462 }
2463
2464 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2465                                 float_status *s)
2466 {
2467     FloatParts64 p;
2468
2469     float32_unpack_canonical(&p, a, s);
2470     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2471 }
2472
2473 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2474                                 float_status *s)
2475 {
2476     FloatParts64 p;
2477
2478     float32_unpack_canonical(&p, a, s);
2479     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2480 }
2481
2482 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2483                                 float_status *s)
2484 {
2485     FloatParts64 p;
2486
2487     float64_unpack_canonical(&p, a, s);
2488     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2489 }
2490
2491 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2492                                 float_status *s)
2493 {
2494     FloatParts64 p;
2495
2496     float64_unpack_canonical(&p, a, s);
2497     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2498 }
2499
2500 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2501                                 float_status *s)
2502 {
2503     FloatParts64 p;
2504
2505     float64_unpack_canonical(&p, a, s);
2506     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2507 }
2508
2509 int8_t float16_to_int8(float16 a, float_status *s)
2510 {
2511     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2512 }
2513
2514 int16_t float16_to_int16(float16 a, float_status *s)
2515 {
2516     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2517 }
2518
2519 int32_t float16_to_int32(float16 a, float_status *s)
2520 {
2521     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2522 }
2523
2524 int64_t float16_to_int64(float16 a, float_status *s)
2525 {
2526     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2527 }
2528
2529 int16_t float32_to_int16(float32 a, float_status *s)
2530 {
2531     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2532 }
2533
2534 int32_t float32_to_int32(float32 a, float_status *s)
2535 {
2536     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2537 }
2538
2539 int64_t float32_to_int64(float32 a, float_status *s)
2540 {
2541     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2542 }
2543
2544 int16_t float64_to_int16(float64 a, float_status *s)
2545 {
2546     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2547 }
2548
2549 int32_t float64_to_int32(float64 a, float_status *s)
2550 {
2551     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2552 }
2553
2554 int64_t float64_to_int64(float64 a, float_status *s)
2555 {
2556     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2557 }
2558
2559 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2560 {
2561     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2562 }
2563
2564 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2565 {
2566     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2567 }
2568
2569 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2570 {
2571     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2572 }
2573
2574 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2575 {
2576     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2577 }
2578
2579 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2580 {
2581     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2582 }
2583
2584 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2585 {
2586     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2587 }
2588
2589 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2590 {
2591     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2592 }
2593
2594 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2595 {
2596     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2597 }
2598
2599 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2600 {
2601     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2602 }
2603
2604 /*
2605  * Returns the result of converting the floating-point value `a' to
2606  * the two's complement integer format.
2607  */
2608
2609 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2610                                  float_status *s)
2611 {
2612     FloatParts64 p;
2613
2614     bfloat16_unpack_canonical(&p, a, s);
2615     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2616 }
2617
2618 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2619                                  float_status *s)
2620 {
2621     FloatParts64 p;
2622
2623     bfloat16_unpack_canonical(&p, a, s);
2624     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2625 }
2626
2627 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2628                                  float_status *s)
2629 {
2630     FloatParts64 p;
2631
2632     bfloat16_unpack_canonical(&p, a, s);
2633     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2634 }
2635
2636 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2637 {
2638     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2639 }
2640
2641 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2642 {
2643     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2644 }
2645
2646 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2647 {
2648     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2649 }
2650
2651 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2652 {
2653     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2654 }
2655
2656 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2657 {
2658     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2659 }
2660
2661 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2662 {
2663     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2664 }
2665
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded and zero is returned: a value
 *  that rounds to zero raises the inexact exception flag, while a
 *  value that does not round to zero raises the invalid exception
 *  flag.
 */
2678
/*
 * Shared back end for the float -> unsigned integer conversions.
 *
 * @p: unpacked input value (taken by value and modified locally)
 * @rmode, @scale: forwarded to parts_round_to_int_normal()
 * @max: largest value of the destination integer type
 * @s: status that receives the exception flags
 *
 * Returns the converted value, saturated to [0, @max].
 */
static uint64_t round_to_uint_and_pack(FloatParts64 p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN: Invalid, and the result is @max. */
        flags = float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        /* Infinity: Invalid; 0 for negative, @max for positive. */
        flags = float_flag_invalid;
        r = p.sign ? 0 : max;
        break;

    case float_class_zero:
        /* No flag to raise, so the float_raise() below is skipped. */
        return 0;

    case float_class_normal:
        /* TODO: 62 = N - 2, frac_size for rounding */
        if (parts_round_to_int_normal(&p, rmode, scale, 62)) {
            flags = float_flag_inexact;
            /*
             * The value rounded to zero: the result is 0 with only
             * Inexact raised, whatever the sign of the input.
             */
            if (p.cls == float_class_zero) {
                r = 0;
                break;
            }
        }

        /*
         * Any remaining negative value, and any value above @max, is
         * Invalid; this replaces an Inexact recorded above.
         */
        if (p.sign) {
            flags = float_flag_invalid;
            r = 0;
        } else if (p.exp > DECOMPOSED_BINARY_POINT) {
            /* The shift below would need a negative count. */
            flags = float_flag_invalid;
            r = max;
        } else {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
            if (r > max) {
                flags = float_flag_invalid;
                r = max;
            }
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}
2733
2734 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2735                                 float_status *s)
2736 {
2737     FloatParts64 p;
2738
2739     float16_unpack_canonical(&p, a, s);
2740     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2741 }
2742
2743 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2744                                   float_status *s)
2745 {
2746     FloatParts64 p;
2747
2748     float16_unpack_canonical(&p, a, s);
2749     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2750 }
2751
2752 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2753                                   float_status *s)
2754 {
2755     FloatParts64 p;
2756
2757     float16_unpack_canonical(&p, a, s);
2758     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2759 }
2760
2761 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2762                                   float_status *s)
2763 {
2764     FloatParts64 p;
2765
2766     float16_unpack_canonical(&p, a, s);
2767     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2768 }
2769
2770 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2771                                   float_status *s)
2772 {
2773     FloatParts64 p;
2774
2775     float32_unpack_canonical(&p, a, s);
2776     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2777 }
2778
2779 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2780                                   float_status *s)
2781 {
2782     FloatParts64 p;
2783
2784     float32_unpack_canonical(&p, a, s);
2785     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2786 }
2787
2788 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2789                                   float_status *s)
2790 {
2791     FloatParts64 p;
2792
2793     float32_unpack_canonical(&p, a, s);
2794     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2795 }
2796
2797 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2798                                   float_status *s)
2799 {
2800     FloatParts64 p;
2801
2802     float64_unpack_canonical(&p, a, s);
2803     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2804 }
2805
2806 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2807                                   float_status *s)
2808 {
2809     FloatParts64 p;
2810
2811     float64_unpack_canonical(&p, a, s);
2812     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2813 }
2814
2815 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2816                                   float_status *s)
2817 {
2818     FloatParts64 p;
2819
2820     float64_unpack_canonical(&p, a, s);
2821     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2822 }
2823
2824 uint8_t float16_to_uint8(float16 a, float_status *s)
2825 {
2826     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2827 }
2828
2829 uint16_t float16_to_uint16(float16 a, float_status *s)
2830 {
2831     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2832 }
2833
2834 uint32_t float16_to_uint32(float16 a, float_status *s)
2835 {
2836     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2837 }
2838
2839 uint64_t float16_to_uint64(float16 a, float_status *s)
2840 {
2841     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2842 }
2843
2844 uint16_t float32_to_uint16(float32 a, float_status *s)
2845 {
2846     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2847 }
2848
2849 uint32_t float32_to_uint32(float32 a, float_status *s)
2850 {
2851     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2852 }
2853
2854 uint64_t float32_to_uint64(float32 a, float_status *s)
2855 {
2856     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2857 }
2858
2859 uint16_t float64_to_uint16(float64 a, float_status *s)
2860 {
2861     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2862 }
2863
2864 uint32_t float64_to_uint32(float64 a, float_status *s)
2865 {
2866     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2867 }
2868
2869 uint64_t float64_to_uint64(float64 a, float_status *s)
2870 {
2871     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2872 }
2873
2874 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2875 {
2876     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2877 }
2878
2879 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2880 {
2881     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2882 }
2883
2884 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2885 {
2886     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2887 }
2888
2889 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2890 {
2891     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2892 }
2893
2894 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2895 {
2896     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2897 }
2898
2899 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2900 {
2901     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2902 }
2903
2904 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2905 {
2906     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2907 }
2908
2909 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2910 {
2911     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2912 }
2913
2914 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2915 {
2916     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2917 }
2918
2919 /*
2920  *  Returns the result of converting the bfloat16 value `a' to
2921  *  the unsigned integer format.
2922  */
2923
2924 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2925                                    int scale, float_status *s)
2926 {
2927     FloatParts64 p;
2928
2929     bfloat16_unpack_canonical(&p, a, s);
2930     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2931 }
2932
2933 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2934                                    int scale, float_status *s)
2935 {
2936     FloatParts64 p;
2937
2938     bfloat16_unpack_canonical(&p, a, s);
2939     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2940 }
2941
2942 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2943                                    int scale, float_status *s)
2944 {
2945     FloatParts64 p;
2946
2947     bfloat16_unpack_canonical(&p, a, s);
2948     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2949 }
2950
2951 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2952 {
2953     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2954 }
2955
2956 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2957 {
2958     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2959 }
2960
2961 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2962 {
2963     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2964 }
2965
2966 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2967 {
2968     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2969 }
2970
2971 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2972 {
2973     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2974 }
2975
2976 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2977 {
2978     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2979 }
2980
2981 /*
2982  * Integer to float conversions
2983  *
2984  * Returns the result of converting the two's complement integer `a'
2985  * to the floating-point format. The conversion is performed according
2986  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987  */
2988
2989 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2990 {
2991     FloatParts64 r = { .sign = false };
2992
2993     if (a == 0) {
2994         r.cls = float_class_zero;
2995     } else {
2996         uint64_t f = a;
2997         int shift;
2998
2999         r.cls = float_class_normal;
3000         if (a < 0) {
3001             f = -f;
3002             r.sign = true;
3003         }
3004         shift = clz64(f);
3005         scale = MIN(MAX(scale, -0x10000), 0x10000);
3006
3007         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3008         r.frac = f << shift;
3009     }
3010
3011     return r;
3012 }
3013
3014 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
3015 {
3016     FloatParts64 pa = int_to_float(a, scale, status);
3017     return float16_round_pack_canonical(&pa, status);
3018 }
3019
3020 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
3021 {
3022     return int64_to_float16_scalbn(a, scale, status);
3023 }
3024
3025 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
3026 {
3027     return int64_to_float16_scalbn(a, scale, status);
3028 }
3029
3030 float16 int64_to_float16(int64_t a, float_status *status)
3031 {
3032     return int64_to_float16_scalbn(a, 0, status);
3033 }
3034
3035 float16 int32_to_float16(int32_t a, float_status *status)
3036 {
3037     return int64_to_float16_scalbn(a, 0, status);
3038 }
3039
3040 float16 int16_to_float16(int16_t a, float_status *status)
3041 {
3042     return int64_to_float16_scalbn(a, 0, status);
3043 }
3044
3045 float16 int8_to_float16(int8_t a, float_status *status)
3046 {
3047     return int64_to_float16_scalbn(a, 0, status);
3048 }
3049
3050 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3051 {
3052     FloatParts64 pa = int_to_float(a, scale, status);
3053     return float32_round_pack_canonical(&pa, status);
3054 }
3055
3056 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3057 {
3058     return int64_to_float32_scalbn(a, scale, status);
3059 }
3060
3061 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3062 {
3063     return int64_to_float32_scalbn(a, scale, status);
3064 }
3065
3066 float32 int64_to_float32(int64_t a, float_status *status)
3067 {
3068     return int64_to_float32_scalbn(a, 0, status);
3069 }
3070
3071 float32 int32_to_float32(int32_t a, float_status *status)
3072 {
3073     return int64_to_float32_scalbn(a, 0, status);
3074 }
3075
3076 float32 int16_to_float32(int16_t a, float_status *status)
3077 {
3078     return int64_to_float32_scalbn(a, 0, status);
3079 }
3080
3081 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3082 {
3083     FloatParts64 pa = int_to_float(a, scale, status);
3084     return float64_round_pack_canonical(&pa, status);
3085 }
3086
3087 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3088 {
3089     return int64_to_float64_scalbn(a, scale, status);
3090 }
3091
3092 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3093 {
3094     return int64_to_float64_scalbn(a, scale, status);
3095 }
3096
3097 float64 int64_to_float64(int64_t a, float_status *status)
3098 {
3099     return int64_to_float64_scalbn(a, 0, status);
3100 }
3101
3102 float64 int32_to_float64(int32_t a, float_status *status)
3103 {
3104     return int64_to_float64_scalbn(a, 0, status);
3105 }
3106
3107 float64 int16_to_float64(int16_t a, float_status *status)
3108 {
3109     return int64_to_float64_scalbn(a, 0, status);
3110 }
3111
3112 /*
3113  * Returns the result of converting the two's complement integer `a'
3114  * to the bfloat16 format.
3115  */
3116
3117 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3118 {
3119     FloatParts64 pa = int_to_float(a, scale, status);
3120     return bfloat16_round_pack_canonical(&pa, status);
3121 }
3122
3123 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3124 {
3125     return int64_to_bfloat16_scalbn(a, scale, status);
3126 }
3127
3128 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3129 {
3130     return int64_to_bfloat16_scalbn(a, scale, status);
3131 }
3132
3133 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3134 {
3135     return int64_to_bfloat16_scalbn(a, 0, status);
3136 }
3137
3138 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3139 {
3140     return int64_to_bfloat16_scalbn(a, 0, status);
3141 }
3142
3143 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3144 {
3145     return int64_to_bfloat16_scalbn(a, 0, status);
3146 }
3147
3148 /*
3149  * Unsigned Integer to float conversions
3150  *
3151  * Returns the result of converting the unsigned integer `a' to the
3152  * floating-point format. The conversion is performed according to the
3153  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3154  */
3155
3156 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3157 {
3158     FloatParts64 r = { .sign = false };
3159     int shift;
3160
3161     if (a == 0) {
3162         r.cls = float_class_zero;
3163     } else {
3164         scale = MIN(MAX(scale, -0x10000), 0x10000);
3165         shift = clz64(a);
3166         r.cls = float_class_normal;
3167         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3168         r.frac = a << shift;
3169     }
3170
3171     return r;
3172 }
3173
3174 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3175 {
3176     FloatParts64 pa = uint_to_float(a, scale, status);
3177     return float16_round_pack_canonical(&pa, status);
3178 }
3179
3180 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3181 {
3182     return uint64_to_float16_scalbn(a, scale, status);
3183 }
3184
3185 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3186 {
3187     return uint64_to_float16_scalbn(a, scale, status);
3188 }
3189
3190 float16 uint64_to_float16(uint64_t a, float_status *status)
3191 {
3192     return uint64_to_float16_scalbn(a, 0, status);
3193 }
3194
3195 float16 uint32_to_float16(uint32_t a, float_status *status)
3196 {
3197     return uint64_to_float16_scalbn(a, 0, status);
3198 }
3199
3200 float16 uint16_to_float16(uint16_t a, float_status *status)
3201 {
3202     return uint64_to_float16_scalbn(a, 0, status);
3203 }
3204
3205 float16 uint8_to_float16(uint8_t a, float_status *status)
3206 {
3207     return uint64_to_float16_scalbn(a, 0, status);
3208 }
3209
3210 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3211 {
3212     FloatParts64 pa = uint_to_float(a, scale, status);
3213     return float32_round_pack_canonical(&pa, status);
3214 }
3215
3216 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3217 {
3218     return uint64_to_float32_scalbn(a, scale, status);
3219 }
3220
3221 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3222 {
3223     return uint64_to_float32_scalbn(a, scale, status);
3224 }
3225
3226 float32 uint64_to_float32(uint64_t a, float_status *status)
3227 {
3228     return uint64_to_float32_scalbn(a, 0, status);
3229 }
3230
3231 float32 uint32_to_float32(uint32_t a, float_status *status)
3232 {
3233     return uint64_to_float32_scalbn(a, 0, status);
3234 }
3235
3236 float32 uint16_to_float32(uint16_t a, float_status *status)
3237 {
3238     return uint64_to_float32_scalbn(a, 0, status);
3239 }
3240
3241 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3242 {
3243     FloatParts64 pa = uint_to_float(a, scale, status);
3244     return float64_round_pack_canonical(&pa, status);
3245 }
3246
3247 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3248 {
3249     return uint64_to_float64_scalbn(a, scale, status);
3250 }
3251
3252 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3253 {
3254     return uint64_to_float64_scalbn(a, scale, status);
3255 }
3256
3257 float64 uint64_to_float64(uint64_t a, float_status *status)
3258 {
3259     return uint64_to_float64_scalbn(a, 0, status);
3260 }
3261
3262 float64 uint32_to_float64(uint32_t a, float_status *status)
3263 {
3264     return uint64_to_float64_scalbn(a, 0, status);
3265 }
3266
3267 float64 uint16_to_float64(uint16_t a, float_status *status)
3268 {
3269     return uint64_to_float64_scalbn(a, 0, status);
3270 }
3271
3272 /*
3273  * Returns the result of converting the unsigned integer `a' to the
3274  * bfloat16 format.
3275  */
3276
3277 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3278 {
3279     FloatParts64 pa = uint_to_float(a, scale, status);
3280     return bfloat16_round_pack_canonical(&pa, status);
3281 }
3282
3283 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3284 {
3285     return uint64_to_bfloat16_scalbn(a, scale, status);
3286 }
3287
3288 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3289 {
3290     return uint64_to_bfloat16_scalbn(a, scale, status);
3291 }
3292
3293 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3294 {
3295     return uint64_to_bfloat16_scalbn(a, 0, status);
3296 }
3297
3298 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3299 {
3300     return uint64_to_bfloat16_scalbn(a, 0, status);
3301 }
3302
3303 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3304 {
3305     return uint64_to_bfloat16_scalbn(a, 0, status);
3306 }
3307
3308 /* Float Min/Max */
3309 /* min() and max() functions. These can't be implemented as
3310  * 'compare and pick one input' because that would mishandle
3311  * NaNs and +0 vs -0.
3312  *
3313  * minnum() and maxnum() functions. These are similar to the min()
3314  * and max() functions but if one of the arguments is a QNaN and
3315  * the other is numerical then the numerical argument is returned.
3316  * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3318  * and maxNum() operations. min() and max() are the typical min/max
3319  * semantics provided by many CPUs which predate that specification.
3320  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
3323  */
3324 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3325                                 bool ieee, bool ismag, float_status *s)
3326 {
3327     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3328         if (ieee) {
3329             /* Takes two floating-point values `a' and `b', one of
3330              * which is a NaN, and returns the appropriate NaN
3331              * result. If either `a' or `b' is a signaling NaN,
3332              * the invalid exception is raised.
3333              */
3334             if (is_snan(a.cls) || is_snan(b.cls)) {
3335                 return *parts_pick_nan(&a, &b, s);
3336             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3337                 return b;
3338             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3339                 return a;
3340             }
3341         }
3342         return *parts_pick_nan(&a, &b, s);
3343     } else {
3344         int a_exp, b_exp;
3345
3346         switch (a.cls) {
3347         case float_class_normal:
3348             a_exp = a.exp;
3349             break;
3350         case float_class_inf:
3351             a_exp = INT_MAX;
3352             break;
3353         case float_class_zero:
3354             a_exp = INT_MIN;
3355             break;
3356         default:
3357             g_assert_not_reached();
3358             break;
3359         }
3360         switch (b.cls) {
3361         case float_class_normal:
3362             b_exp = b.exp;
3363             break;
3364         case float_class_inf:
3365             b_exp = INT_MAX;
3366             break;
3367         case float_class_zero:
3368             b_exp = INT_MIN;
3369             break;
3370         default:
3371             g_assert_not_reached();
3372             break;
3373         }
3374
3375         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3376             bool a_less = a_exp < b_exp;
3377             if (a_exp == b_exp) {
3378                 a_less = a.frac < b.frac;
3379             }
3380             return a_less ^ ismin ? b : a;
3381         }
3382
3383         if (a.sign == b.sign) {
3384             bool a_less = a_exp < b_exp;
3385             if (a_exp == b_exp) {
3386                 a_less = a.frac < b.frac;
3387             }
3388             return a.sign ^ a_less ^ ismin ? b : a;
3389         } else {
3390             return a.sign ^ ismin ? b : a;
3391         }
3392     }
3393 }
3394
3395 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3396 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3397                                      float_status *s)                   \
3398 {                                                                       \
3399     FloatParts64 pa, pb, pr;                                            \
3400     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3401     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3402     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3403     return float ## sz ## _round_pack_canonical(&pr, s);                \
3404 }
3405
3406 MINMAX(16, min, true, false, false)
3407 MINMAX(16, minnum, true, true, false)
3408 MINMAX(16, minnummag, true, true, true)
3409 MINMAX(16, max, false, false, false)
3410 MINMAX(16, maxnum, false, true, false)
3411 MINMAX(16, maxnummag, false, true, true)
3412
3413 MINMAX(32, min, true, false, false)
3414 MINMAX(32, minnum, true, true, false)
3415 MINMAX(32, minnummag, true, true, true)
3416 MINMAX(32, max, false, false, false)
3417 MINMAX(32, maxnum, false, true, false)
3418 MINMAX(32, maxnummag, false, true, true)
3419
3420 MINMAX(64, min, true, false, false)
3421 MINMAX(64, minnum, true, true, false)
3422 MINMAX(64, minnummag, true, true, true)
3423 MINMAX(64, max, false, false, false)
3424 MINMAX(64, maxnum, false, true, false)
3425 MINMAX(64, maxnummag, false, true, true)
3426
3427 #undef MINMAX
3428
3429 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3430 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3431 {                                                                       \
3432     FloatParts64 pa, pb, pr;                                            \
3433     bfloat16_unpack_canonical(&pa, a, s);                               \
3434     bfloat16_unpack_canonical(&pb, b, s);                               \
3435     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3436     return bfloat16_round_pack_canonical(&pr, s);                       \
3437 }
3438
3439 BF16_MINMAX(min, true, false, false)
3440 BF16_MINMAX(minnum, true, true, false)
3441 BF16_MINMAX(minnummag, true, true, true)
3442 BF16_MINMAX(max, false, false, false)
3443 BF16_MINMAX(maxnum, false, true, false)
3444 BF16_MINMAX(maxnummag, false, true, true)
3445
3446 #undef BF16_MINMAX
3447
3448 /* Floating point compare */
3449 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3450                                     float_status *s)
3451 {
3452     if (is_nan(a.cls) || is_nan(b.cls)) {
3453         if (!is_quiet ||
3454             a.cls == float_class_snan ||
3455             b.cls == float_class_snan) {
3456             float_raise(float_flag_invalid, s);
3457         }
3458         return float_relation_unordered;
3459     }
3460
3461     if (a.cls == float_class_zero) {
3462         if (b.cls == float_class_zero) {
3463             return float_relation_equal;
3464         }
3465         return b.sign ? float_relation_greater : float_relation_less;
3466     } else if (b.cls == float_class_zero) {
3467         return a.sign ? float_relation_less : float_relation_greater;
3468     }
3469
3470     /* The only really important thing about infinity is its sign. If
3471      * both are infinities the sign marks the smallest of the two.
3472      */
3473     if (a.cls == float_class_inf) {
3474         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3475             return float_relation_equal;
3476         }
3477         return a.sign ? float_relation_less : float_relation_greater;
3478     } else if (b.cls == float_class_inf) {
3479         return b.sign ? float_relation_greater : float_relation_less;
3480     }
3481
3482     if (a.sign != b.sign) {
3483         return a.sign ? float_relation_less : float_relation_greater;
3484     }
3485
3486     if (a.exp == b.exp) {
3487         if (a.frac == b.frac) {
3488             return float_relation_equal;
3489         }
3490         if (a.sign) {
3491             return a.frac > b.frac ?
3492                 float_relation_less : float_relation_greater;
3493         } else {
3494             return a.frac > b.frac ?
3495                 float_relation_greater : float_relation_less;
3496         }
3497     } else {
3498         if (a.sign) {
3499             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3500         } else {
3501             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3502         }
3503     }
3504 }
3505
3506 #define COMPARE(name, attr, sz)                                         \
3507 static int attr                                                         \
3508 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3509 {                                                                       \
3510     FloatParts64 pa, pb;                                                \
3511     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3512     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3513     return compare_floats(pa, pb, is_quiet, s);                         \
3514 }
3515
3516 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3517 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3518 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3519
3520 #undef COMPARE
3521
3522 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3523 {
3524     return soft_f16_compare(a, b, false, s);
3525 }
3526
3527 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3528 {
3529     return soft_f16_compare(a, b, true, s);
3530 }
3531
3532 static FloatRelation QEMU_FLATTEN
3533 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3534 {
3535     union_float32 ua, ub;
3536
3537     ua.s = xa;
3538     ub.s = xb;
3539
3540     if (QEMU_NO_HARDFLOAT) {
3541         goto soft;
3542     }
3543
3544     float32_input_flush2(&ua.s, &ub.s, s);
3545     if (isgreaterequal(ua.h, ub.h)) {
3546         if (isgreater(ua.h, ub.h)) {
3547             return float_relation_greater;
3548         }
3549         return float_relation_equal;
3550     }
3551     if (likely(isless(ua.h, ub.h))) {
3552         return float_relation_less;
3553     }
3554     /* The only condition remaining is unordered.
3555      * Fall through to set flags.
3556      */
3557  soft:
3558     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3559 }
3560
3561 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3562 {
3563     return f32_compare(a, b, false, s);
3564 }
3565
3566 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3567 {
3568     return f32_compare(a, b, true, s);
3569 }
3570
3571 static FloatRelation QEMU_FLATTEN
3572 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3573 {
3574     union_float64 ua, ub;
3575
3576     ua.s = xa;
3577     ub.s = xb;
3578
3579     if (QEMU_NO_HARDFLOAT) {
3580         goto soft;
3581     }
3582
3583     float64_input_flush2(&ua.s, &ub.s, s);
3584     if (isgreaterequal(ua.h, ub.h)) {
3585         if (isgreater(ua.h, ub.h)) {
3586             return float_relation_greater;
3587         }
3588         return float_relation_equal;
3589     }
3590     if (likely(isless(ua.h, ub.h))) {
3591         return float_relation_less;
3592     }
3593     /* The only condition remaining is unordered.
3594      * Fall through to set flags.
3595      */
3596  soft:
3597     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3598 }
3599
3600 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3601 {
3602     return f64_compare(a, b, false, s);
3603 }
3604
3605 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3606 {
3607     return f64_compare(a, b, true, s);
3608 }
3609
3610 static FloatRelation QEMU_FLATTEN
3611 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3612 {
3613     FloatParts64 pa, pb;
3614
3615     bfloat16_unpack_canonical(&pa, a, s);
3616     bfloat16_unpack_canonical(&pb, b, s);
3617     return compare_floats(pa, pb, is_quiet, s);
3618 }
3619
3620 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3621 {
3622     return soft_bf16_compare(a, b, false, s);
3623 }
3624
3625 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3626 {
3627     return soft_bf16_compare(a, b, true, s);
3628 }
3629
3630 /* Multiply A by 2 raised to the power N.  */
3631 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3632 {
3633     if (unlikely(is_nan(a.cls))) {
3634         parts_return_nan(&a, s);
3635     }
3636     if (a.cls == float_class_normal) {
3637         /* The largest float type (even though not supported by FloatParts64)
3638          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3639          * still allows rounding to infinity, without allowing overflow
3640          * within the int32_t that backs FloatParts64.exp.
3641          */
3642         n = MIN(MAX(n, -0x10000), 0x10000);
3643         a.exp += n;
3644     }
3645     return a;
3646 }
3647
3648 float16 float16_scalbn(float16 a, int n, float_status *status)
3649 {
3650     FloatParts64 pa, pr;
3651
3652     float16_unpack_canonical(&pa, a, status);
3653     pr = scalbn_decomposed(pa, n, status);
3654     return float16_round_pack_canonical(&pr, status);
3655 }
3656
3657 float32 float32_scalbn(float32 a, int n, float_status *status)
3658 {
3659     FloatParts64 pa, pr;
3660
3661     float32_unpack_canonical(&pa, a, status);
3662     pr = scalbn_decomposed(pa, n, status);
3663     return float32_round_pack_canonical(&pr, status);
3664 }
3665
3666 float64 float64_scalbn(float64 a, int n, float_status *status)
3667 {
3668     FloatParts64 pa, pr;
3669
3670     float64_unpack_canonical(&pa, a, status);
3671     pr = scalbn_decomposed(pa, n, status);
3672     return float64_round_pack_canonical(&pr, status);
3673 }
3674
3675 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3676 {
3677     FloatParts64 pa, pr;
3678
3679     bfloat16_unpack_canonical(&pa, a, status);
3680     pr = scalbn_decomposed(pa, n, status);
3681     return bfloat16_round_pack_canonical(&pr, status);
3682 }
3683
3684 /*
3685  * Square Root
3686  *
3687  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3689  * square root by iterating down from the implicit bit to enough extra
3690  * bits to ensure we get a correctly rounded result.
3691  *
 * This does mean, however, that the calculation is slower than before,
 * especially for 64-bit floats.
3694  */
3695
3696 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3697 {
3698     uint64_t a_frac, r_frac, s_frac;
3699     int bit, last_bit;
3700
3701     if (is_nan(a.cls)) {
3702         parts_return_nan(&a, s);
3703         return a;
3704     }
3705     if (a.cls == float_class_zero) {
3706         return a;  /* sqrt(+-0) = +-0 */
3707     }
3708     if (a.sign) {
3709         float_raise(float_flag_invalid, s);
3710         parts_default_nan(&a, s);
3711         return a;
3712     }
3713     if (a.cls == float_class_inf) {
3714         return a;  /* sqrt(+inf) = +inf */
3715     }
3716
3717     assert(a.cls == float_class_normal);
3718
3719     /* We need two overflow bits at the top. Adding room for that is a
3720      * right shift. If the exponent is odd, we can discard the low bit
3721      * by multiplying the fraction by 2; that's a left shift. Combine
3722      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3723      */
3724     a_frac = a.frac >> (2 - (a.exp & 1));
3725     a.exp >>= 1;
3726
3727     /* Bit-by-bit computation of sqrt.  */
3728     r_frac = 0;
3729     s_frac = 0;
3730
3731     /* Iterate from implicit bit down to the 3 extra bits to compute a
3732      * properly rounded result. Remember we've inserted two more bits
3733      * at the top, so these positions are two less.
3734      */
3735     bit = DECOMPOSED_BINARY_POINT - 2;
3736     last_bit = MAX(p->frac_shift - 4, 0);
3737     do {
3738         uint64_t q = 1ULL << bit;
3739         uint64_t t_frac = s_frac + q;
3740         if (t_frac <= a_frac) {
3741             s_frac = t_frac + q;
3742             a_frac -= t_frac;
3743             r_frac += q;
3744         }
3745         a_frac <<= 1;
3746     } while (--bit >= last_bit);
3747
3748     /* Undo the right shift done above. If there is any remaining
3749      * fraction, the result is inexact. Set the sticky bit.
3750      */
3751     a.frac = (r_frac << 2) + (a_frac != 0);
3752
3753     return a;
3754 }
3755
3756 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3757 {
3758     FloatParts64 pa, pr;
3759
3760     float16_unpack_canonical(&pa, a, status);
3761     pr = sqrt_float(pa, status, &float16_params);
3762     return float16_round_pack_canonical(&pr, status);
3763 }
3764
3765 static float32 QEMU_SOFTFLOAT_ATTR
3766 soft_f32_sqrt(float32 a, float_status *status)
3767 {
3768     FloatParts64 pa, pr;
3769
3770     float32_unpack_canonical(&pa, a, status);
3771     pr = sqrt_float(pa, status, &float32_params);
3772     return float32_round_pack_canonical(&pr, status);
3773 }
3774
3775 static float64 QEMU_SOFTFLOAT_ATTR
3776 soft_f64_sqrt(float64 a, float_status *status)
3777 {
3778     FloatParts64 pa, pr;
3779
3780     float64_unpack_canonical(&pa, a, status);
3781     pr = sqrt_float(pa, status, &float64_params);
3782     return float64_round_pack_canonical(&pr, status);
3783 }
3784
3785 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3786 {
3787     union_float32 ua, ur;
3788
3789     ua.s = xa;
3790     if (unlikely(!can_use_fpu(s))) {
3791         goto soft;
3792     }
3793
3794     float32_input_flush1(&ua.s, s);
3795     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3796         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3797                        fpclassify(ua.h) == FP_ZERO) ||
3798                      signbit(ua.h))) {
3799             goto soft;
3800         }
3801     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3802                         float32_is_neg(ua.s))) {
3803         goto soft;
3804     }
3805     ur.h = sqrtf(ua.h);
3806     return ur.s;
3807
3808  soft:
3809     return soft_f32_sqrt(ua.s, s);
3810 }
3811
3812 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3813 {
3814     union_float64 ua, ur;
3815
3816     ua.s = xa;
3817     if (unlikely(!can_use_fpu(s))) {
3818         goto soft;
3819     }
3820
3821     float64_input_flush1(&ua.s, s);
3822     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3823         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3824                        fpclassify(ua.h) == FP_ZERO) ||
3825                      signbit(ua.h))) {
3826             goto soft;
3827         }
3828     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3829                         float64_is_neg(ua.s))) {
3830         goto soft;
3831     }
3832     ur.h = sqrt(ua.h);
3833     return ur.s;
3834
3835  soft:
3836     return soft_f64_sqrt(ua.s, s);
3837 }
3838
3839 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3840 {
3841     FloatParts64 pa, pr;
3842
3843     bfloat16_unpack_canonical(&pa, a, status);
3844     pr = sqrt_float(pa, status, &bfloat16_params);
3845     return bfloat16_round_pack_canonical(&pr, status);
3846 }
3847
3848 /*----------------------------------------------------------------------------
3849 | The pattern for a default generated NaN.
3850 *----------------------------------------------------------------------------*/
3851
3852 float16 float16_default_nan(float_status *status)
3853 {
3854     FloatParts64 p;
3855
3856     parts_default_nan(&p, status);
3857     p.frac >>= float16_params.frac_shift;
3858     return float16_pack_raw(&p);
3859 }
3860
3861 float32 float32_default_nan(float_status *status)
3862 {
3863     FloatParts64 p;
3864
3865     parts_default_nan(&p, status);
3866     p.frac >>= float32_params.frac_shift;
3867     return float32_pack_raw(&p);
3868 }
3869
3870 float64 float64_default_nan(float_status *status)
3871 {
3872     FloatParts64 p;
3873
3874     parts_default_nan(&p, status);
3875     p.frac >>= float64_params.frac_shift;
3876     return float64_pack_raw(&p);
3877 }
3878
3879 float128 float128_default_nan(float_status *status)
3880 {
3881     FloatParts128 p;
3882
3883     parts_default_nan(&p, status);
3884     frac_shr(&p, float128_params.frac_shift);
3885     return float128_pack_raw(&p);
3886 }
3887
3888 bfloat16 bfloat16_default_nan(float_status *status)
3889 {
3890     FloatParts64 p;
3891
3892     parts_default_nan(&p, status);
3893     p.frac >>= bfloat16_params.frac_shift;
3894     return bfloat16_pack_raw(&p);
3895 }
3896
3897 /*----------------------------------------------------------------------------
3898 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3899 *----------------------------------------------------------------------------*/
3900
3901 float16 float16_silence_nan(float16 a, float_status *status)
3902 {
3903     FloatParts64 p;
3904
3905     float16_unpack_raw(&p, a);
3906     p.frac <<= float16_params.frac_shift;
3907     parts_silence_nan(&p, status);
3908     p.frac >>= float16_params.frac_shift;
3909     return float16_pack_raw(&p);
3910 }
3911
3912 float32 float32_silence_nan(float32 a, float_status *status)
3913 {
3914     FloatParts64 p;
3915
3916     float32_unpack_raw(&p, a);
3917     p.frac <<= float32_params.frac_shift;
3918     parts_silence_nan(&p, status);
3919     p.frac >>= float32_params.frac_shift;
3920     return float32_pack_raw(&p);
3921 }
3922
3923 float64 float64_silence_nan(float64 a, float_status *status)
3924 {
3925     FloatParts64 p;
3926
3927     float64_unpack_raw(&p, a);
3928     p.frac <<= float64_params.frac_shift;
3929     parts_silence_nan(&p, status);
3930     p.frac >>= float64_params.frac_shift;
3931     return float64_pack_raw(&p);
3932 }
3933
3934 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3935 {
3936     FloatParts64 p;
3937
3938     bfloat16_unpack_raw(&p, a);
3939     p.frac <<= bfloat16_params.frac_shift;
3940     parts_silence_nan(&p, status);
3941     p.frac >>= bfloat16_params.frac_shift;
3942     return bfloat16_pack_raw(&p);
3943 }
3944
3945 float128 float128_silence_nan(float128 a, float_status *status)
3946 {
3947     FloatParts128 p;
3948
3949     float128_unpack_raw(&p, a);
3950     frac_shl(&p, float128_params.frac_shift);
3951     parts_silence_nan(&p, status);
3952     frac_shr(&p, float128_params.frac_shift);
3953     return float128_pack_raw(&p);
3954 }
3955
3956 /*----------------------------------------------------------------------------
3957 | If `a' is denormal and we are in flush-to-zero mode then set the
3958 | input-denormal exception and return zero. Otherwise just return the value.
3959 *----------------------------------------------------------------------------*/
3960
3961 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3962 {
3963     if (p.exp == 0 && p.frac != 0) {
3964         float_raise(float_flag_input_denormal, status);
3965         return true;
3966     }
3967
3968     return false;
3969 }
3970
3971 float16 float16_squash_input_denormal(float16 a, float_status *status)
3972 {
3973     if (status->flush_inputs_to_zero) {
3974         FloatParts64 p;
3975
3976         float16_unpack_raw(&p, a);
3977         if (parts_squash_denormal(p, status)) {
3978             return float16_set_sign(float16_zero, p.sign);
3979         }
3980     }
3981     return a;
3982 }
3983
3984 float32 float32_squash_input_denormal(float32 a, float_status *status)
3985 {
3986     if (status->flush_inputs_to_zero) {
3987         FloatParts64 p;
3988
3989         float32_unpack_raw(&p, a);
3990         if (parts_squash_denormal(p, status)) {
3991             return float32_set_sign(float32_zero, p.sign);
3992         }
3993     }
3994     return a;
3995 }
3996
3997 float64 float64_squash_input_denormal(float64 a, float_status *status)
3998 {
3999     if (status->flush_inputs_to_zero) {
4000         FloatParts64 p;
4001
4002         float64_unpack_raw(&p, a);
4003         if (parts_squash_denormal(p, status)) {
4004             return float64_set_sign(float64_zero, p.sign);
4005         }
4006     }
4007     return a;
4008 }
4009
4010 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
4011 {
4012     if (status->flush_inputs_to_zero) {
4013         FloatParts64 p;
4014
4015         bfloat16_unpack_raw(&p, a);
4016         if (parts_squash_denormal(p, status)) {
4017             return bfloat16_set_sign(bfloat16_zero, p.sign);
4018         }
4019     }
4020     return a;
4021 }
4022
4023 /*----------------------------------------------------------------------------
4024 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
4025 | and 7, and returns the properly rounded 32-bit integer corresponding to the
4026 | input.  If `zSign' is 1, the input is negated before being converted to an
4027 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
4028 | is simply rounded to an integer, with the inexact exception raised if the
4029 | input cannot be represented exactly as an integer.  However, if the fixed-
4030 | point input is too large, the invalid exception is raised and the largest
4031 | positive or negative integer is returned.
4032 *----------------------------------------------------------------------------*/
4033
4034 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
4035                                  float_status *status)
4036 {
4037     int8_t roundingMode;
4038     bool roundNearestEven;
4039     int8_t roundIncrement, roundBits;
4040     int32_t z;
4041
4042     roundingMode = status->float_rounding_mode;
4043     roundNearestEven = ( roundingMode == float_round_nearest_even );
4044     switch (roundingMode) {
4045     case float_round_nearest_even:
4046     case float_round_ties_away:
4047         roundIncrement = 0x40;
4048         break;
4049     case float_round_to_zero:
4050         roundIncrement = 0;
4051         break;
4052     case float_round_up:
4053         roundIncrement = zSign ? 0 : 0x7f;
4054         break;
4055     case float_round_down:
4056         roundIncrement = zSign ? 0x7f : 0;
4057         break;
4058     case float_round_to_odd:
4059         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4060         break;
4061     default:
4062         abort();
4063     }
4064     roundBits = absZ & 0x7F;
4065     absZ = ( absZ + roundIncrement )>>7;
4066     if (!(roundBits ^ 0x40) && roundNearestEven) {
4067         absZ &= ~1;
4068     }
4069     z = absZ;
4070     if ( zSign ) z = - z;
4071     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4072         float_raise(float_flag_invalid, status);
4073         return zSign ? INT32_MIN : INT32_MAX;
4074     }
4075     if (roundBits) {
4076         float_raise(float_flag_inexact, status);
4077     }
4078     return z;
4079
4080 }
4081
4082 /*----------------------------------------------------------------------------
4083 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4084 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4085 | and returns the properly rounded 64-bit integer corresponding to the input.
4086 | If `zSign' is 1, the input is negated before being converted to an integer.
4087 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4088 | the inexact exception raised if the input cannot be represented exactly as
4089 | an integer.  However, if the fixed-point input is too large, the invalid
4090 | exception is raised and the largest positive or negative integer is
4091 | returned.
4092 *----------------------------------------------------------------------------*/
4093
4094 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4095                                float_status *status)
4096 {
4097     int8_t roundingMode;
4098     bool roundNearestEven, increment;
4099     int64_t z;
4100
4101     roundingMode = status->float_rounding_mode;
4102     roundNearestEven = ( roundingMode == float_round_nearest_even );
4103     switch (roundingMode) {
4104     case float_round_nearest_even:
4105     case float_round_ties_away:
4106         increment = ((int64_t) absZ1 < 0);
4107         break;
4108     case float_round_to_zero:
4109         increment = 0;
4110         break;
4111     case float_round_up:
4112         increment = !zSign && absZ1;
4113         break;
4114     case float_round_down:
4115         increment = zSign && absZ1;
4116         break;
4117     case float_round_to_odd:
4118         increment = !(absZ0 & 1) && absZ1;
4119         break;
4120     default:
4121         abort();
4122     }
4123     if ( increment ) {
4124         ++absZ0;
4125         if ( absZ0 == 0 ) goto overflow;
4126         if (!(absZ1 << 1) && roundNearestEven) {
4127             absZ0 &= ~1;
4128         }
4129     }
4130     z = absZ0;
4131     if ( zSign ) z = - z;
4132     if ( z && ( ( z < 0 ) ^ zSign ) ) {
4133  overflow:
4134         float_raise(float_flag_invalid, status);
4135         return zSign ? INT64_MIN : INT64_MAX;
4136     }
4137     if (absZ1) {
4138         float_raise(float_flag_inexact, status);
4139     }
4140     return z;
4141
4142 }
4143
4144 /*----------------------------------------------------------------------------
4145 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4146 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4147 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4148 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4149 | with the inexact exception raised if the input cannot be represented exactly
4150 | as an integer.  However, if the fixed-point input is too large, the invalid
4151 | exception is raised and the largest unsigned integer is returned.
4152 *----------------------------------------------------------------------------*/
4153
4154 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
4155                                 uint64_t absZ1, float_status *status)
4156 {
4157     int8_t roundingMode;
4158     bool roundNearestEven, increment;
4159
4160     roundingMode = status->float_rounding_mode;
4161     roundNearestEven = (roundingMode == float_round_nearest_even);
4162     switch (roundingMode) {
4163     case float_round_nearest_even:
4164     case float_round_ties_away:
4165         increment = ((int64_t)absZ1 < 0);
4166         break;
4167     case float_round_to_zero:
4168         increment = 0;
4169         break;
4170     case float_round_up:
4171         increment = !zSign && absZ1;
4172         break;
4173     case float_round_down:
4174         increment = zSign && absZ1;
4175         break;
4176     case float_round_to_odd:
4177         increment = !(absZ0 & 1) && absZ1;
4178         break;
4179     default:
4180         abort();
4181     }
4182     if (increment) {
4183         ++absZ0;
4184         if (absZ0 == 0) {
4185             float_raise(float_flag_invalid, status);
4186             return UINT64_MAX;
4187         }
4188         if (!(absZ1 << 1) && roundNearestEven) {
4189             absZ0 &= ~1;
4190         }
4191     }
4192
4193     if (zSign && absZ0) {
4194         float_raise(float_flag_invalid, status);
4195         return 0;
4196     }
4197
4198     if (absZ1) {
4199         float_raise(float_flag_inexact, status);
4200     }
4201     return absZ0;
4202 }
4203
4204 /*----------------------------------------------------------------------------
4205 | Normalizes the subnormal single-precision floating-point value represented
4206 | by the denormalized significand `aSig'.  The normalized exponent and
4207 | significand are stored at the locations pointed to by `zExpPtr' and
4208 | `zSigPtr', respectively.
4209 *----------------------------------------------------------------------------*/
4210
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /*
     * Left shift that leaves 8 zero bits above the leading one of aSig;
     * the exponent is lowered by the same amount, starting from 1.
     */
    const int8_t shift = clz32(aSig) - 8;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4221
4222 /*----------------------------------------------------------------------------
4223 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4224 | and significand `zSig', and returns the proper single-precision floating-
4225 | point value corresponding to the abstract input.  Ordinarily, the abstract
4226 | value is simply rounded and packed into the single-precision format, with
4227 | the inexact exception raised if the abstract input cannot be represented
4228 | exactly.  However, if the abstract value is too large, the overflow and
4229 | inexact exceptions are raised and an infinity or maximal finite value is
4230 | returned.  If the abstract value is too small, the input value is rounded to
4231 | a subnormal number, and the underflow and inexact exceptions are raised if
4232 | the abstract input cannot be represented exactly as a subnormal single-
4233 | precision floating-point number.
4234 |     The input significand `zSig' has its binary point between bits 30
4235 | and 29, which is 7 bits to the left of the usual location.  This shifted
4236 | significand must be normalized or smaller.  If `zSig' is not normalized,
4237 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4238 | and it must not require rounding.  In the usual case that `zSig' is
4239 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4240 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4241 | Binary Floating-Point Arithmetic.
4242 *----------------------------------------------------------------------------*/
4243
4244 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4245                                    float_status *status)
4246 {
4247     int8_t roundingMode;
4248     bool roundNearestEven;
4249     int8_t roundIncrement, roundBits;
4250     bool isTiny;
4251
4252     roundingMode = status->float_rounding_mode;
4253     roundNearestEven = ( roundingMode == float_round_nearest_even );
4254     switch (roundingMode) {
4255     case float_round_nearest_even:
4256     case float_round_ties_away:
4257         roundIncrement = 0x40;
4258         break;
4259     case float_round_to_zero:
4260         roundIncrement = 0;
4261         break;
4262     case float_round_up:
4263         roundIncrement = zSign ? 0 : 0x7f;
4264         break;
4265     case float_round_down:
4266         roundIncrement = zSign ? 0x7f : 0;
4267         break;
4268     case float_round_to_odd:
4269         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4270         break;
4271     default:
4272         abort();
4273         break;
4274     }
4275     roundBits = zSig & 0x7F;
4276     if ( 0xFD <= (uint16_t) zExp ) {
4277         if (    ( 0xFD < zExp )
4278              || (    ( zExp == 0xFD )
4279                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4280            ) {
4281             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4282                                    roundIncrement != 0;
4283             float_raise(float_flag_overflow | float_flag_inexact, status);
4284             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4285         }
4286         if ( zExp < 0 ) {
4287             if (status->flush_to_zero) {
4288                 float_raise(float_flag_output_denormal, status);
4289                 return packFloat32(zSign, 0, 0);
4290             }
4291             isTiny = status->tininess_before_rounding
4292                   || (zExp < -1)
4293                   || (zSig + roundIncrement < 0x80000000);
4294             shift32RightJamming( zSig, - zExp, &zSig );
4295             zExp = 0;
4296             roundBits = zSig & 0x7F;
4297             if (isTiny && roundBits) {
4298                 float_raise(float_flag_underflow, status);
4299             }
4300             if (roundingMode == float_round_to_odd) {
4301                 /*
4302                  * For round-to-odd case, the roundIncrement depends on
4303                  * zSig which just changed.
4304                  */
4305                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4306             }
4307         }
4308     }
4309     if (roundBits) {
4310         float_raise(float_flag_inexact, status);
4311     }
4312     zSig = ( zSig + roundIncrement )>>7;
4313     if (!(roundBits ^ 0x40) && roundNearestEven) {
4314         zSig &= ~1;
4315     }
4316     if ( zSig == 0 ) zExp = 0;
4317     return packFloat32( zSign, zExp, zSig );
4318
4319 }
4320
4321 /*----------------------------------------------------------------------------
4322 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4323 | and significand `zSig', and returns the proper single-precision floating-
4324 | point value corresponding to the abstract input.  This routine is just like
4325 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4326 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4327 | floating-point exponent.
4328 *----------------------------------------------------------------------------*/
4329
4330 static float32
4331  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4332                               float_status *status)
4333 {
4334     int8_t shiftCount;
4335
4336     shiftCount = clz32(zSig) - 1;
4337     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4338                                status);
4339
4340 }
4341
4342 /*----------------------------------------------------------------------------
4343 | Normalizes the subnormal double-precision floating-point value represented
4344 | by the denormalized significand `aSig'.  The normalized exponent and
4345 | significand are stored at the locations pointed to by `zExpPtr' and
4346 | `zSigPtr', respectively.
4347 *----------------------------------------------------------------------------*/
4348
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /*
     * Left shift that leaves 11 zero bits above the leading one of aSig;
     * the exponent is lowered by the same amount, starting from 1.
     */
    const int8_t shift = clz64(aSig) - 11;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4359
4360 /*----------------------------------------------------------------------------
4361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4362 | double-precision floating-point value, returning the result.  After being
4363 | shifted into the proper positions, the three fields are simply added
4364 | together to form the result.  This means that any integer portion of `zSig'
4365 | will be added into the exponent.  Since a properly normalized significand
4366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4367 | than the desired result exponent whenever `zSig' is a complete, normalized
4368 | significand.
4369 *----------------------------------------------------------------------------*/
4370
4371 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4372 {
4373
4374     return make_float64(
4375         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4376
4377 }
4378
4379 /*----------------------------------------------------------------------------
4380 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4381 | and significand `zSig', and returns the proper double-precision floating-
4382 | point value corresponding to the abstract input.  Ordinarily, the abstract
4383 | value is simply rounded and packed into the double-precision format, with
4384 | the inexact exception raised if the abstract input cannot be represented
4385 | exactly.  However, if the abstract value is too large, the overflow and
4386 | inexact exceptions are raised and an infinity or maximal finite value is
4387 | returned.  If the abstract value is too small, the input value is rounded to
4388 | a subnormal number, and the underflow and inexact exceptions are raised if
4389 | the abstract input cannot be represented exactly as a subnormal double-
4390 | precision floating-point number.
4391 |     The input significand `zSig' has its binary point between bits 62
4392 | and 61, which is 10 bits to the left of the usual location.  This shifted
4393 | significand must be normalized or smaller.  If `zSig' is not normalized,
4394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4395 | and it must not require rounding.  In the usual case that `zSig' is
4396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4397 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4398 | Binary Floating-Point Arithmetic.
4399 *----------------------------------------------------------------------------*/
4400
4401 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4402                                    float_status *status)
4403 {
4404     int8_t roundingMode;
4405     bool roundNearestEven;
4406     int roundIncrement, roundBits;
4407     bool isTiny;
4408
4409     roundingMode = status->float_rounding_mode;
4410     roundNearestEven = ( roundingMode == float_round_nearest_even );
4411     switch (roundingMode) {
4412     case float_round_nearest_even:
4413     case float_round_ties_away:
4414         roundIncrement = 0x200;
4415         break;
4416     case float_round_to_zero:
4417         roundIncrement = 0;
4418         break;
4419     case float_round_up:
4420         roundIncrement = zSign ? 0 : 0x3ff;
4421         break;
4422     case float_round_down:
4423         roundIncrement = zSign ? 0x3ff : 0;
4424         break;
4425     case float_round_to_odd:
4426         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4427         break;
4428     default:
4429         abort();
4430     }
4431     roundBits = zSig & 0x3FF;
4432     if ( 0x7FD <= (uint16_t) zExp ) {
4433         if (    ( 0x7FD < zExp )
4434              || (    ( zExp == 0x7FD )
4435                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4436            ) {
4437             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4438                                    roundIncrement != 0;
4439             float_raise(float_flag_overflow | float_flag_inexact, status);
4440             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4441         }
4442         if ( zExp < 0 ) {
4443             if (status->flush_to_zero) {
4444                 float_raise(float_flag_output_denormal, status);
4445                 return packFloat64(zSign, 0, 0);
4446             }
4447             isTiny = status->tininess_before_rounding
4448                   || (zExp < -1)
4449                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4450             shift64RightJamming( zSig, - zExp, &zSig );
4451             zExp = 0;
4452             roundBits = zSig & 0x3FF;
4453             if (isTiny && roundBits) {
4454                 float_raise(float_flag_underflow, status);
4455             }
4456             if (roundingMode == float_round_to_odd) {
4457                 /*
4458                  * For round-to-odd case, the roundIncrement depends on
4459                  * zSig which just changed.
4460                  */
4461                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4462             }
4463         }
4464     }
4465     if (roundBits) {
4466         float_raise(float_flag_inexact, status);
4467     }
4468     zSig = ( zSig + roundIncrement )>>10;
4469     if (!(roundBits ^ 0x200) && roundNearestEven) {
4470         zSig &= ~1;
4471     }
4472     if ( zSig == 0 ) zExp = 0;
4473     return packFloat64( zSign, zExp, zSig );
4474
4475 }
4476
4477 /*----------------------------------------------------------------------------
4478 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4479 | and significand `zSig', and returns the proper double-precision floating-
4480 | point value corresponding to the abstract input.  This routine is just like
4481 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4482 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4483 | floating-point exponent.
4484 *----------------------------------------------------------------------------*/
4485
4486 static float64
4487  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4488                               float_status *status)
4489 {
4490     int8_t shiftCount;
4491
4492     shiftCount = clz64(zSig) - 1;
4493     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4494                                status);
4495
4496 }
4497
4498 /*----------------------------------------------------------------------------
4499 | Normalizes the subnormal extended double-precision floating-point value
4500 | represented by the denormalized significand `aSig'.  The normalized exponent
4501 | and significand are stored at the locations pointed to by `zExpPtr' and
4502 | `zSigPtr', respectively.
4503 *----------------------------------------------------------------------------*/
4504
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /*
     * Left shift that moves the leading one of aSig to bit 63; the
     * exponent is lowered by the same amount, starting from 1.
     */
    const int8_t shift = clz64(aSig);

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
4514
4515 /*----------------------------------------------------------------------------
4516 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4517 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4518 | and returns the proper extended double-precision floating-point value
4519 | corresponding to the abstract input.  Ordinarily, the abstract value is
4520 | rounded and packed into the extended double-precision format, with the
4521 | inexact exception raised if the abstract input cannot be represented
4522 | exactly.  However, if the abstract value is too large, the overflow and
4523 | inexact exceptions are raised and an infinity or maximal finite value is
4524 | returned.  If the abstract value is too small, the input value is rounded to
4525 | a subnormal number, and the underflow and inexact exceptions are raised if
4526 | the abstract input cannot be represented exactly as a subnormal extended
4527 | double-precision floating-point number.
4528 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4529 | number of bits as single or double precision, respectively.  Otherwise, the
4530 | result is rounded to the full precision of the extended double-precision
4531 | format.
4532 |     The input significand must be normalized or smaller.  If the input
4533 | significand is not normalized, `zExp' must be 0; in that case, the result
4534 | returned is a subnormal number, and it must not require rounding.  The
4535 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4536 | Floating-Point Arithmetic.
4537 *----------------------------------------------------------------------------*/
4538
4539 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4540                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4541                               float_status *status)
4542 {
4543     int8_t roundingMode;
4544     bool roundNearestEven, increment, isTiny;
4545     int64_t roundIncrement, roundMask, roundBits;
4546
4547     roundingMode = status->float_rounding_mode;
4548     roundNearestEven = ( roundingMode == float_round_nearest_even );
4549     if ( roundingPrecision == 80 ) goto precision80;
4550     if ( roundingPrecision == 64 ) {
4551         roundIncrement = UINT64_C(0x0000000000000400);
4552         roundMask = UINT64_C(0x00000000000007FF);
4553     }
4554     else if ( roundingPrecision == 32 ) {
4555         roundIncrement = UINT64_C(0x0000008000000000);
4556         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4557     }
4558     else {
4559         goto precision80;
4560     }
4561     zSig0 |= ( zSig1 != 0 );
4562     switch (roundingMode) {
4563     case float_round_nearest_even:
4564     case float_round_ties_away:
4565         break;
4566     case float_round_to_zero:
4567         roundIncrement = 0;
4568         break;
4569     case float_round_up:
4570         roundIncrement = zSign ? 0 : roundMask;
4571         break;
4572     case float_round_down:
4573         roundIncrement = zSign ? roundMask : 0;
4574         break;
4575     default:
4576         abort();
4577     }
4578     roundBits = zSig0 & roundMask;
4579     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4580         if (    ( 0x7FFE < zExp )
4581              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4582            ) {
4583             goto overflow;
4584         }
4585         if ( zExp <= 0 ) {
4586             if (status->flush_to_zero) {
4587                 float_raise(float_flag_output_denormal, status);
4588                 return packFloatx80(zSign, 0, 0);
4589             }
4590             isTiny = status->tininess_before_rounding
4591                   || (zExp < 0 )
4592                   || (zSig0 <= zSig0 + roundIncrement);
4593             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4594             zExp = 0;
4595             roundBits = zSig0 & roundMask;
4596             if (isTiny && roundBits) {
4597                 float_raise(float_flag_underflow, status);
4598             }
4599             if (roundBits) {
4600                 float_raise(float_flag_inexact, status);
4601             }
4602             zSig0 += roundIncrement;
4603             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4604             roundIncrement = roundMask + 1;
4605             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4606                 roundMask |= roundIncrement;
4607             }
4608             zSig0 &= ~ roundMask;
4609             return packFloatx80( zSign, zExp, zSig0 );
4610         }
4611     }
4612     if (roundBits) {
4613         float_raise(float_flag_inexact, status);
4614     }
4615     zSig0 += roundIncrement;
4616     if ( zSig0 < roundIncrement ) {
4617         ++zExp;
4618         zSig0 = UINT64_C(0x8000000000000000);
4619     }
4620     roundIncrement = roundMask + 1;
4621     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4622         roundMask |= roundIncrement;
4623     }
4624     zSig0 &= ~ roundMask;
4625     if ( zSig0 == 0 ) zExp = 0;
4626     return packFloatx80( zSign, zExp, zSig0 );
4627  precision80:
4628     switch (roundingMode) {
4629     case float_round_nearest_even:
4630     case float_round_ties_away:
4631         increment = ((int64_t)zSig1 < 0);
4632         break;
4633     case float_round_to_zero:
4634         increment = 0;
4635         break;
4636     case float_round_up:
4637         increment = !zSign && zSig1;
4638         break;
4639     case float_round_down:
4640         increment = zSign && zSig1;
4641         break;
4642     default:
4643         abort();
4644     }
4645     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4646         if (    ( 0x7FFE < zExp )
4647              || (    ( zExp == 0x7FFE )
4648                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4649                   && increment
4650                 )
4651            ) {
4652             roundMask = 0;
4653  overflow:
4654             float_raise(float_flag_overflow | float_flag_inexact, status);
4655             if (    ( roundingMode == float_round_to_zero )
4656                  || ( zSign && ( roundingMode == float_round_up ) )
4657                  || ( ! zSign && ( roundingMode == float_round_down ) )
4658                ) {
4659                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4660             }
4661             return packFloatx80(zSign,
4662                                 floatx80_infinity_high,
4663                                 floatx80_infinity_low);
4664         }
4665         if ( zExp <= 0 ) {
4666             isTiny = status->tininess_before_rounding
4667                   || (zExp < 0)
4668                   || !increment
4669                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4670             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4671             zExp = 0;
4672             if (isTiny && zSig1) {
4673                 float_raise(float_flag_underflow, status);
4674             }
4675             if (zSig1) {
4676                 float_raise(float_flag_inexact, status);
4677             }
4678             switch (roundingMode) {
4679             case float_round_nearest_even:
4680             case float_round_ties_away:
4681                 increment = ((int64_t)zSig1 < 0);
4682                 break;
4683             case float_round_to_zero:
4684                 increment = 0;
4685                 break;
4686             case float_round_up:
4687                 increment = !zSign && zSig1;
4688                 break;
4689             case float_round_down:
4690                 increment = zSign && zSig1;
4691                 break;
4692             default:
4693                 abort();
4694             }
4695             if ( increment ) {
4696                 ++zSig0;
4697                 if (!(zSig1 << 1) && roundNearestEven) {
4698                     zSig0 &= ~1;
4699                 }
4700                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4701             }
4702             return packFloatx80( zSign, zExp, zSig0 );
4703         }
4704     }
4705     if (zSig1) {
4706         float_raise(float_flag_inexact, status);
4707     }
4708     if ( increment ) {
4709         ++zSig0;
4710         if ( zSig0 == 0 ) {
4711             ++zExp;
4712             zSig0 = UINT64_C(0x8000000000000000);
4713         }
4714         else {
4715             if (!(zSig1 << 1) && roundNearestEven) {
4716                 zSig0 &= ~1;
4717             }
4718         }
4719     }
4720     else {
4721         if ( zSig0 == 0 ) zExp = 0;
4722     }
4723     return packFloatx80( zSign, zExp, zSig0 );
4724
4725 }
4726
4727 /*----------------------------------------------------------------------------
4728 | Takes an abstract floating-point value having sign `zSign', exponent
4729 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4730 | and returns the proper extended double-precision floating-point value
4731 | corresponding to the abstract input.  This routine is just like
4732 | `roundAndPackFloatx80' except that the input significand does not have to be
4733 | normalized.
4734 *----------------------------------------------------------------------------*/
4735
4736 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4737                                        bool zSign, int32_t zExp,
4738                                        uint64_t zSig0, uint64_t zSig1,
4739                                        float_status *status)
4740 {
4741     int8_t shiftCount;
4742
4743     if ( zSig0 == 0 ) {
4744         zSig0 = zSig1;
4745         zSig1 = 0;
4746         zExp -= 64;
4747     }
4748     shiftCount = clz64(zSig0);
4749     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4750     zExp -= shiftCount;
4751     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4752                                 zSig0, zSig1, status);
4753
4754 }
4755
4756 /*----------------------------------------------------------------------------
4757 | Returns the least-significant 64 fraction bits of the quadruple-precision
4758 | floating-point value `a'.
4759 *----------------------------------------------------------------------------*/
4760
4761 static inline uint64_t extractFloat128Frac1( float128 a )
4762 {
4763
4764     return a.low;
4765
4766 }
4767
4768 /*----------------------------------------------------------------------------
4769 | Returns the most-significant 48 fraction bits of the quadruple-precision
4770 | floating-point value `a'.
4771 *----------------------------------------------------------------------------*/
4772
4773 static inline uint64_t extractFloat128Frac0( float128 a )
4774 {
4775
4776     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4777
4778 }
4779
4780 /*----------------------------------------------------------------------------
4781 | Returns the exponent bits of the quadruple-precision floating-point value
4782 | `a'.
4783 *----------------------------------------------------------------------------*/
4784
4785 static inline int32_t extractFloat128Exp( float128 a )
4786 {
4787
4788     return ( a.high>>48 ) & 0x7FFF;
4789
4790 }
4791
4792 /*----------------------------------------------------------------------------
4793 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4794 *----------------------------------------------------------------------------*/
4795
4796 static inline bool extractFloat128Sign(float128 a)
4797 {
4798     return a.high >> 63;
4799 }
4800
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' (high word) and `aSig1' (low word).  The
| significand is shifted so that its leading one lands on bit 48 of the high
| word.  The high word of the result (49 significant bits) is stored at the
| location pointed to by `zSig0Ptr', the low 64 bits at the location pointed
| to by `zSig1Ptr', and the matching exponent at the location pointed to by
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                           int32_t *zExpPtr,
                           uint64_t *zSig0Ptr, uint64_t *zSig1Ptr)
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: a plain 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /*
     * High word is empty, so the low word supplies every significant bit.
     * `shift' is measured relative to bit 48 and may be negative, in which
     * case the low word straddles both result words.
     */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4841
4842 /*----------------------------------------------------------------------------
4843 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4844 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4845 | floating-point value, returning the result.  After being shifted into the
4846 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4847 | added together to form the most significant 32 bits of the result.  This
4848 | means that any integer portion of `zSig0' will be added into the exponent.
4849 | Since a properly normalized significand will have an integer portion equal
4850 | to 1, the `zExp' input should be 1 less than the desired result exponent
4851 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4852 | significand.
4853 *----------------------------------------------------------------------------*/
4854
4855 static inline float128
4856 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4857 {
4858     float128 z;
4859
4860     z.low = zSig1;
4861     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4862     return z;
4863 }
4864
4865 /*----------------------------------------------------------------------------
4866 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4867 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4868 | and `zSig2', and returns the proper quadruple-precision floating-point value
4869 | corresponding to the abstract input.  Ordinarily, the abstract value is
4870 | simply rounded and packed into the quadruple-precision format, with the
4871 | inexact exception raised if the abstract input cannot be represented
4872 | exactly.  However, if the abstract value is too large, the overflow and
4873 | inexact exceptions are raised and an infinity or maximal finite value is
4874 | returned.  If the abstract value is too small, the input value is rounded to
4875 | a subnormal number, and the underflow and inexact exceptions are raised if
4876 | the abstract input cannot be represented exactly as a subnormal quadruple-
4877 | precision floating-point number.
4878 |     The input significand must be normalized or smaller.  If the input
4879 | significand is not normalized, `zExp' must be 0; in that case, the result
4880 | returned is a subnormal number, and it must not require rounding.  In the
4881 | usual case that the input significand is normalized, `zExp' must be 1 less
4882 | than the ``true'' floating-point exponent.  The handling of underflow and
4883 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4885
4886 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4887                                      uint64_t zSig0, uint64_t zSig1,
4888                                      uint64_t zSig2, float_status *status)
4889 {
4890     int8_t roundingMode;
4891     bool roundNearestEven, increment, isTiny;
4892
4893     roundingMode = status->float_rounding_mode;
4894     roundNearestEven = ( roundingMode == float_round_nearest_even );
4895     switch (roundingMode) {
4896     case float_round_nearest_even:
4897     case float_round_ties_away:
4898         increment = ((int64_t)zSig2 < 0);
4899         break;
4900     case float_round_to_zero:
4901         increment = 0;
4902         break;
4903     case float_round_up:
4904         increment = !zSign && zSig2;
4905         break;
4906     case float_round_down:
4907         increment = zSign && zSig2;
4908         break;
4909     case float_round_to_odd:
4910         increment = !(zSig1 & 0x1) && zSig2;
4911         break;
4912     default:
4913         abort();
4914     }
4915     if ( 0x7FFD <= (uint32_t) zExp ) {
4916         if (    ( 0x7FFD < zExp )
4917              || (    ( zExp == 0x7FFD )
4918                   && eq128(
4919                          UINT64_C(0x0001FFFFFFFFFFFF),
4920                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4921                          zSig0,
4922                          zSig1
4923                      )
4924                   && increment
4925                 )
4926            ) {
4927             float_raise(float_flag_overflow | float_flag_inexact, status);
4928             if (    ( roundingMode == float_round_to_zero )
4929                  || ( zSign && ( roundingMode == float_round_up ) )
4930                  || ( ! zSign && ( roundingMode == float_round_down ) )
4931                  || (roundingMode == float_round_to_odd)
4932                ) {
4933                 return
4934                     packFloat128(
4935                         zSign,
4936                         0x7FFE,
4937                         UINT64_C(0x0000FFFFFFFFFFFF),
4938                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4939                     );
4940             }
4941             return packFloat128( zSign, 0x7FFF, 0, 0 );
4942         }
4943         if ( zExp < 0 ) {
4944             if (status->flush_to_zero) {
4945                 float_raise(float_flag_output_denormal, status);
4946                 return packFloat128(zSign, 0, 0, 0);
4947             }
4948             isTiny = status->tininess_before_rounding
4949                   || (zExp < -1)
4950                   || !increment
4951                   || lt128(zSig0, zSig1,
4952                            UINT64_C(0x0001FFFFFFFFFFFF),
4953                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4954             shift128ExtraRightJamming(
4955                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4956             zExp = 0;
4957             if (isTiny && zSig2) {
4958                 float_raise(float_flag_underflow, status);
4959             }
4960             switch (roundingMode) {
4961             case float_round_nearest_even:
4962             case float_round_ties_away:
4963                 increment = ((int64_t)zSig2 < 0);
4964                 break;
4965             case float_round_to_zero:
4966                 increment = 0;
4967                 break;
4968             case float_round_up:
4969                 increment = !zSign && zSig2;
4970                 break;
4971             case float_round_down:
4972                 increment = zSign && zSig2;
4973                 break;
4974             case float_round_to_odd:
4975                 increment = !(zSig1 & 0x1) && zSig2;
4976                 break;
4977             default:
4978                 abort();
4979             }
4980         }
4981     }
4982     if (zSig2) {
4983         float_raise(float_flag_inexact, status);
4984     }
4985     if ( increment ) {
4986         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4987         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4988             zSig1 &= ~1;
4989         }
4990     }
4991     else {
4992         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4993     }
4994     return packFloat128( zSign, zExp, zSig0, zSig1 );
4995
4996 }
4997
4998 /*----------------------------------------------------------------------------
4999 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
5000 | and significand formed by the concatenation of `zSig0' and `zSig1', and
5001 | returns the proper quadruple-precision floating-point value corresponding
5002 | to the abstract input.  This routine is just like `roundAndPackFloat128'
5003 | except that the input significand has fewer bits and does not have to be
5004 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
5005 | point exponent.
5006 *----------------------------------------------------------------------------*/
5007
5008 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
5009                                               uint64_t zSig0, uint64_t zSig1,
5010                                               float_status *status)
5011 {
5012     int8_t shiftCount;
5013     uint64_t zSig2;
5014
5015     if ( zSig0 == 0 ) {
5016         zSig0 = zSig1;
5017         zSig1 = 0;
5018         zExp -= 64;
5019     }
5020     shiftCount = clz64(zSig0) - 15;
5021     if ( 0 <= shiftCount ) {
5022         zSig2 = 0;
5023         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5024     }
5025     else {
5026         shift128ExtraRightJamming(
5027             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
5028     }
5029     zExp -= shiftCount;
5030     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
5031
5032 }
5033
5034
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the 32-bit two's complement integer `a'
5037 | to the extended double-precision floating-point format.  The conversion
5038 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5039 | Arithmetic.
5040 *----------------------------------------------------------------------------*/
5041
5042 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5043 {
5044     bool zSign;
5045     uint32_t absA;
5046     int8_t shiftCount;
5047     uint64_t zSig;
5048
5049     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5050     zSign = ( a < 0 );
5051     absA = zSign ? - a : a;
5052     shiftCount = clz32(absA) + 32;
5053     zSig = absA;
5054     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5055
5056 }
5057
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of converting the 32-bit two's complement integer `a' to
5060 | the quadruple-precision floating-point format.  The conversion is performed
5061 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063
5064 float128 int32_to_float128(int32_t a, float_status *status)
5065 {
5066     bool zSign;
5067     uint32_t absA;
5068     int8_t shiftCount;
5069     uint64_t zSig0;
5070
5071     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5072     zSign = ( a < 0 );
5073     absA = zSign ? - a : a;
5074     shiftCount = clz32(absA) + 17;
5075     zSig0 = absA;
5076     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5077
5078 }
5079
5080 /*----------------------------------------------------------------------------
5081 | Returns the result of converting the 64-bit two's complement integer `a'
5082 | to the extended double-precision floating-point format.  The conversion
5083 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5084 | Arithmetic.
5085 *----------------------------------------------------------------------------*/
5086
5087 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5088 {
5089     bool zSign;
5090     uint64_t absA;
5091     int8_t shiftCount;
5092
5093     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5094     zSign = ( a < 0 );
5095     absA = zSign ? - a : a;
5096     shiftCount = clz64(absA);
5097     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5098
5099 }
5100
5101 /*----------------------------------------------------------------------------
5102 | Returns the result of converting the 64-bit two's complement integer `a' to
5103 | the quadruple-precision floating-point format.  The conversion is performed
5104 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5105 *----------------------------------------------------------------------------*/
5106
5107 float128 int64_to_float128(int64_t a, float_status *status)
5108 {
5109     bool zSign;
5110     uint64_t absA;
5111     int8_t shiftCount;
5112     int32_t zExp;
5113     uint64_t zSig0, zSig1;
5114
5115     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5116     zSign = ( a < 0 );
5117     absA = zSign ? - a : a;
5118     shiftCount = clz64(absA) + 49;
5119     zExp = 0x406E - shiftCount;
5120     if ( 64 <= shiftCount ) {
5121         zSig1 = 0;
5122         zSig0 = absA;
5123         shiftCount -= 64;
5124     }
5125     else {
5126         zSig1 = absA;
5127         zSig0 = 0;
5128     }
5129     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5130     return packFloat128( zSign, zExp, zSig0, zSig1 );
5131
5132 }
5133
5134 /*----------------------------------------------------------------------------
5135 | Returns the result of converting the 64-bit unsigned integer `a'
5136 | to the quadruple-precision floating-point format.  The conversion is performed
5137 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138 *----------------------------------------------------------------------------*/
5139
5140 float128 uint64_to_float128(uint64_t a, float_status *status)
5141 {
5142     if (a == 0) {
5143         return float128_zero;
5144     }
5145     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5146 }
5147
5148 /*----------------------------------------------------------------------------
5149 | Returns the result of converting the single-precision floating-point value
5150 | `a' to the extended double-precision floating-point format.  The conversion
5151 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5152 | Arithmetic.
5153 *----------------------------------------------------------------------------*/
5154
5155 floatx80 float32_to_floatx80(float32 a, float_status *status)
5156 {
5157     bool aSign;
5158     int aExp;
5159     uint32_t aSig;
5160
5161     a = float32_squash_input_denormal(a, status);
5162     aSig = extractFloat32Frac( a );
5163     aExp = extractFloat32Exp( a );
5164     aSign = extractFloat32Sign( a );
5165     if ( aExp == 0xFF ) {
5166         if (aSig) {
5167             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5168                                                status);
5169             return floatx80_silence_nan(res, status);
5170         }
5171         return packFloatx80(aSign,
5172                             floatx80_infinity_high,
5173                             floatx80_infinity_low);
5174     }
5175     if ( aExp == 0 ) {
5176         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5177         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5178     }
5179     aSig |= 0x00800000;
5180     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5181
5182 }
5183
5184 /*----------------------------------------------------------------------------
5185 | Returns the remainder of the single-precision floating-point value `a'
5186 | with respect to the corresponding value `b'.  The operation is performed
5187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5188 *----------------------------------------------------------------------------*/
5189
5190 float32 float32_rem(float32 a, float32 b, float_status *status)
5191 {
5192     bool aSign, zSign;
5193     int aExp, bExp, expDiff;
5194     uint32_t aSig, bSig;
5195     uint32_t q;
5196     uint64_t aSig64, bSig64, q64;
5197     uint32_t alternateASig;
5198     int32_t sigMean;
5199     a = float32_squash_input_denormal(a, status);
5200     b = float32_squash_input_denormal(b, status);
5201
5202     aSig = extractFloat32Frac( a );
5203     aExp = extractFloat32Exp( a );
5204     aSign = extractFloat32Sign( a );
5205     bSig = extractFloat32Frac( b );
5206     bExp = extractFloat32Exp( b );
5207     if ( aExp == 0xFF ) {
5208         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5209             return propagateFloat32NaN(a, b, status);
5210         }
5211         float_raise(float_flag_invalid, status);
5212         return float32_default_nan(status);
5213     }
5214     if ( bExp == 0xFF ) {
5215         if (bSig) {
5216             return propagateFloat32NaN(a, b, status);
5217         }
5218         return a;
5219     }
5220     if ( bExp == 0 ) {
5221         if ( bSig == 0 ) {
5222             float_raise(float_flag_invalid, status);
5223             return float32_default_nan(status);
5224         }
5225         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5226     }
5227     if ( aExp == 0 ) {
5228         if ( aSig == 0 ) return a;
5229         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5230     }
5231     expDiff = aExp - bExp;
5232     aSig |= 0x00800000;
5233     bSig |= 0x00800000;
5234     if ( expDiff < 32 ) {
5235         aSig <<= 8;
5236         bSig <<= 8;
5237         if ( expDiff < 0 ) {
5238             if ( expDiff < -1 ) return a;
5239             aSig >>= 1;
5240         }
5241         q = ( bSig <= aSig );
5242         if ( q ) aSig -= bSig;
5243         if ( 0 < expDiff ) {
5244             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5245             q >>= 32 - expDiff;
5246             bSig >>= 2;
5247             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5248         }
5249         else {
5250             aSig >>= 2;
5251             bSig >>= 2;
5252         }
5253     }
5254     else {
5255         if ( bSig <= aSig ) aSig -= bSig;
5256         aSig64 = ( (uint64_t) aSig )<<40;
5257         bSig64 = ( (uint64_t) bSig )<<40;
5258         expDiff -= 64;
5259         while ( 0 < expDiff ) {
5260             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5261             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5262             aSig64 = - ( ( bSig * q64 )<<38 );
5263             expDiff -= 62;
5264         }
5265         expDiff += 64;
5266         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5267         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5268         q = q64>>( 64 - expDiff );
5269         bSig <<= 6;
5270         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5271     }
5272     do {
5273         alternateASig = aSig;
5274         ++q;
5275         aSig -= bSig;
5276     } while ( 0 <= (int32_t) aSig );
5277     sigMean = aSig + alternateASig;
5278     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5279         aSig = alternateASig;
5280     }
5281     zSign = ( (int32_t) aSig < 0 );
5282     if ( zSign ) aSig = - aSig;
5283     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5284 }
5285
5286
5287
5288 /*----------------------------------------------------------------------------
5289 | Returns the binary exponential of the single-precision floating-point value
5290 | `a'. The operation is performed according to the IEC/IEEE Standard for
5291 | Binary Floating-Point Arithmetic.
5292 |
5293 | Uses the following identities:
5294 |
5295 | 1. -------------------------------------------------------------------------
5296 |      x    x*ln(2)
5297 |     2  = e
5298 |
5299 | 2. -------------------------------------------------------------------------
5300 |                      2     3     4     5           n
5301 |      x        x     x     x     x     x           x
5302 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5303 |               1!    2!    3!    4!    5!          n!
5304 *----------------------------------------------------------------------------*/
5305
5306 static const float64 float32_exp2_coefficients[15] =
5307 {
5308     const_float64( 0x3ff0000000000000ll ), /*  1 */
5309     const_float64( 0x3fe0000000000000ll ), /*  2 */
5310     const_float64( 0x3fc5555555555555ll ), /*  3 */
5311     const_float64( 0x3fa5555555555555ll ), /*  4 */
5312     const_float64( 0x3f81111111111111ll ), /*  5 */
5313     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5314     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5315     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5316     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5317     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5318     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5319     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5320     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5321     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5322     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5323 };
5324
5325 float32 float32_exp2(float32 a, float_status *status)
5326 {
5327     bool aSign;
5328     int aExp;
5329     uint32_t aSig;
5330     float64 r, x, xn;
5331     int i;
5332     a = float32_squash_input_denormal(a, status);
5333
5334     aSig = extractFloat32Frac( a );
5335     aExp = extractFloat32Exp( a );
5336     aSign = extractFloat32Sign( a );
5337
5338     if ( aExp == 0xFF) {
5339         if (aSig) {
5340             return propagateFloat32NaN(a, float32_zero, status);
5341         }
5342         return (aSign) ? float32_zero : a;
5343     }
5344     if (aExp == 0) {
5345         if (aSig == 0) return float32_one;
5346     }
5347
5348     float_raise(float_flag_inexact, status);
5349
5350     /* ******************************* */
5351     /* using float64 for approximation */
5352     /* ******************************* */
5353     x = float32_to_float64(a, status);
5354     x = float64_mul(x, float64_ln2, status);
5355
5356     xn = x;
5357     r = float64_one;
5358     for (i = 0 ; i < 15 ; i++) {
5359         float64 f;
5360
5361         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5362         r = float64_add(r, f, status);
5363
5364         xn = float64_mul(xn, x, status);
5365     }
5366
5367     return float64_to_float32(r, status);
5368 }
5369
5370 /*----------------------------------------------------------------------------
5371 | Returns the binary log of the single-precision floating-point value `a'.
5372 | The operation is performed according to the IEC/IEEE Standard for Binary
5373 | Floating-Point Arithmetic.
5374 *----------------------------------------------------------------------------*/
5375 float32 float32_log2(float32 a, float_status *status)
5376 {
5377     bool aSign, zSign;
5378     int aExp;
5379     uint32_t aSig, zSig, i;
5380
5381     a = float32_squash_input_denormal(a, status);
5382     aSig = extractFloat32Frac( a );
5383     aExp = extractFloat32Exp( a );
5384     aSign = extractFloat32Sign( a );
5385
5386     if ( aExp == 0 ) {
5387         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5388         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5389     }
5390     if ( aSign ) {
5391         float_raise(float_flag_invalid, status);
5392         return float32_default_nan(status);
5393     }
5394     if ( aExp == 0xFF ) {
5395         if (aSig) {
5396             return propagateFloat32NaN(a, float32_zero, status);
5397         }
5398         return a;
5399     }
5400
5401     aExp -= 0x7F;
5402     aSig |= 0x00800000;
5403     zSign = aExp < 0;
5404     zSig = aExp << 23;
5405
5406     for (i = 1 << 22; i > 0; i >>= 1) {
5407         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5408         if ( aSig & 0x01000000 ) {
5409             aSig >>= 1;
5410             zSig |= i;
5411         }
5412     }
5413
5414     if ( zSign )
5415         zSig = -zSig;
5416
5417     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5418 }
5419
5420 /*----------------------------------------------------------------------------
5421 | Returns the result of converting the double-precision floating-point value
5422 | `a' to the extended double-precision floating-point format.  The conversion
5423 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5424 | Arithmetic.
5425 *----------------------------------------------------------------------------*/
5426
5427 floatx80 float64_to_floatx80(float64 a, float_status *status)
5428 {
5429     bool aSign;
5430     int aExp;
5431     uint64_t aSig;
5432
5433     a = float64_squash_input_denormal(a, status);
5434     aSig = extractFloat64Frac( a );
5435     aExp = extractFloat64Exp( a );
5436     aSign = extractFloat64Sign( a );
5437     if ( aExp == 0x7FF ) {
5438         if (aSig) {
5439             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5440                                                status);
5441             return floatx80_silence_nan(res, status);
5442         }
5443         return packFloatx80(aSign,
5444                             floatx80_infinity_high,
5445                             floatx80_infinity_low);
5446     }
5447     if ( aExp == 0 ) {
5448         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5449         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5450     }
5451     return
5452         packFloatx80(
5453             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5454
5455 }
5456
5457 /*----------------------------------------------------------------------------
5458 | Returns the remainder of the double-precision floating-point value `a'
5459 | with respect to the corresponding value `b'.  The operation is performed
5460 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462
5463 float64 float64_rem(float64 a, float64 b, float_status *status)
5464 {
5465     bool aSign, zSign;
5466     int aExp, bExp, expDiff;
5467     uint64_t aSig, bSig;
5468     uint64_t q, alternateASig;
5469     int64_t sigMean;
5470
5471     a = float64_squash_input_denormal(a, status);
5472     b = float64_squash_input_denormal(b, status);
5473     aSig = extractFloat64Frac( a );
5474     aExp = extractFloat64Exp( a );
5475     aSign = extractFloat64Sign( a );
5476     bSig = extractFloat64Frac( b );
5477     bExp = extractFloat64Exp( b );
5478     if ( aExp == 0x7FF ) {
5479         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5480             return propagateFloat64NaN(a, b, status);
5481         }
5482         float_raise(float_flag_invalid, status);
5483         return float64_default_nan(status);
5484     }
5485     if ( bExp == 0x7FF ) {
5486         if (bSig) {
5487             return propagateFloat64NaN(a, b, status);
5488         }
5489         return a;
5490     }
5491     if ( bExp == 0 ) {
5492         if ( bSig == 0 ) {
5493             float_raise(float_flag_invalid, status);
5494             return float64_default_nan(status);
5495         }
5496         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5497     }
5498     if ( aExp == 0 ) {
5499         if ( aSig == 0 ) return a;
5500         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5501     }
5502     expDiff = aExp - bExp;
5503     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5504     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5505     if ( expDiff < 0 ) {
5506         if ( expDiff < -1 ) return a;
5507         aSig >>= 1;
5508     }
5509     q = ( bSig <= aSig );
5510     if ( q ) aSig -= bSig;
5511     expDiff -= 64;
5512     while ( 0 < expDiff ) {
5513         q = estimateDiv128To64( aSig, 0, bSig );
5514         q = ( 2 < q ) ? q - 2 : 0;
5515         aSig = - ( ( bSig>>2 ) * q );
5516         expDiff -= 62;
5517     }
5518     expDiff += 64;
5519     if ( 0 < expDiff ) {
5520         q = estimateDiv128To64( aSig, 0, bSig );
5521         q = ( 2 < q ) ? q - 2 : 0;
5522         q >>= 64 - expDiff;
5523         bSig >>= 2;
5524         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5525     }
5526     else {
5527         aSig >>= 2;
5528         bSig >>= 2;
5529     }
5530     do {
5531         alternateASig = aSig;
5532         ++q;
5533         aSig -= bSig;
5534     } while ( 0 <= (int64_t) aSig );
5535     sigMean = aSig + alternateASig;
5536     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5537         aSig = alternateASig;
5538     }
5539     zSign = ( (int64_t) aSig < 0 );
5540     if ( zSign ) aSig = - aSig;
5541     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5542
5543 }
5544
5545 /*----------------------------------------------------------------------------
5546 | Returns the binary log of the double-precision floating-point value `a'.
5547 | The operation is performed according to the IEC/IEEE Standard for Binary
5548 | Floating-Point Arithmetic.
5549 *----------------------------------------------------------------------------*/
5550 float64 float64_log2(float64 a, float_status *status)
5551 {
5552     bool aSign, zSign;
5553     int aExp;
5554     uint64_t aSig, aSig0, aSig1, zSig, i;
5555     a = float64_squash_input_denormal(a, status);
5556
5557     aSig = extractFloat64Frac( a );
5558     aExp = extractFloat64Exp( a );
5559     aSign = extractFloat64Sign( a );
5560
5561     if ( aExp == 0 ) {
5562         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5563         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5564     }
5565     if ( aSign ) {
5566         float_raise(float_flag_invalid, status);
5567         return float64_default_nan(status);
5568     }
5569     if ( aExp == 0x7FF ) {
5570         if (aSig) {
5571             return propagateFloat64NaN(a, float64_zero, status);
5572         }
5573         return a;
5574     }
5575
5576     aExp -= 0x3FF;
5577     aSig |= UINT64_C(0x0010000000000000);
5578     zSign = aExp < 0;
5579     zSig = (uint64_t)aExp << 52;
5580     for (i = 1LL << 51; i > 0; i >>= 1) {
5581         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5582         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5583         if ( aSig & UINT64_C(0x0020000000000000) ) {
5584             aSig >>= 1;
5585             zSig |= i;
5586         }
5587     }
5588
5589     if ( zSign )
5590         zSig = -zSig;
5591     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5592 }
5593
5594 /*----------------------------------------------------------------------------
5595 | Returns the result of converting the extended double-precision floating-
5596 | point value `a' to the 32-bit two's complement integer format.  The
5597 | conversion is performed according to the IEC/IEEE Standard for Binary
5598 | Floating-Point Arithmetic---which means in particular that the conversion
5599 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5600 | largest positive integer is returned.  Otherwise, if the conversion
5601 | overflows, the largest integer with the same sign as `a' is returned.
5602 *----------------------------------------------------------------------------*/
5603
5604 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5605 {
5606     bool aSign;
5607     int32_t aExp, shiftCount;
5608     uint64_t aSig;
5609
5610     if (floatx80_invalid_encoding(a)) {
5611         float_raise(float_flag_invalid, status);
5612         return 1 << 31;
5613     }
5614     aSig = extractFloatx80Frac( a );
5615     aExp = extractFloatx80Exp( a );
5616     aSign = extractFloatx80Sign( a );
5617     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5618     shiftCount = 0x4037 - aExp;
5619     if ( shiftCount <= 0 ) shiftCount = 1;
5620     shift64RightJamming( aSig, shiftCount, &aSig );
5621     return roundAndPackInt32(aSign, aSig, status);
5622
5623 }
5624
5625 /*----------------------------------------------------------------------------
5626 | Returns the result of converting the extended double-precision floating-
5627 | point value `a' to the 32-bit two's complement integer format.  The
5628 | conversion is performed according to the IEC/IEEE Standard for Binary
5629 | Floating-Point Arithmetic, except that the conversion is always rounded
5630 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5631 | Otherwise, if the conversion overflows, the largest integer with the same
5632 | sign as `a' is returned.
5633 *----------------------------------------------------------------------------*/
5634
5635 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5636 {
5637     bool aSign;
5638     int32_t aExp, shiftCount;
5639     uint64_t aSig, savedASig;
5640     int32_t z;
5641
5642     if (floatx80_invalid_encoding(a)) {
5643         float_raise(float_flag_invalid, status);
5644         return 1 << 31;
5645     }
5646     aSig = extractFloatx80Frac( a );
5647     aExp = extractFloatx80Exp( a );
5648     aSign = extractFloatx80Sign( a );
5649     if ( 0x401E < aExp ) {
5650         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5651         goto invalid;
5652     }
5653     else if ( aExp < 0x3FFF ) {
5654         if (aExp || aSig) {
5655             float_raise(float_flag_inexact, status);
5656         }
5657         return 0;
5658     }
5659     shiftCount = 0x403E - aExp;
5660     savedASig = aSig;
5661     aSig >>= shiftCount;
5662     z = aSig;
5663     if ( aSign ) z = - z;
5664     if ( ( z < 0 ) ^ aSign ) {
5665  invalid:
5666         float_raise(float_flag_invalid, status);
5667         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5668     }
5669     if ( ( aSig<<shiftCount ) != savedASig ) {
5670         float_raise(float_flag_inexact, status);
5671     }
5672     return z;
5673
5674 }
5675
5676 /*----------------------------------------------------------------------------
5677 | Returns the result of converting the extended double-precision floating-
5678 | point value `a' to the 64-bit two's complement integer format.  The
5679 | conversion is performed according to the IEC/IEEE Standard for Binary
5680 | Floating-Point Arithmetic---which means in particular that the conversion
5681 | is rounded according to the current rounding mode.  If `a' is a NaN,
5682 | the largest positive integer is returned.  Otherwise, if the conversion
5683 | overflows, the largest integer with the same sign as `a' is returned.
5684 *----------------------------------------------------------------------------*/
5685
5686 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5687 {
5688     bool aSign;
5689     int32_t aExp, shiftCount;
5690     uint64_t aSig, aSigExtra;
5691
5692     if (floatx80_invalid_encoding(a)) {
5693         float_raise(float_flag_invalid, status);
5694         return 1ULL << 63;
5695     }
5696     aSig = extractFloatx80Frac( a );
5697     aExp = extractFloatx80Exp( a );
5698     aSign = extractFloatx80Sign( a );
5699     shiftCount = 0x403E - aExp;
5700     if ( shiftCount <= 0 ) {
5701         if ( shiftCount ) {
5702             float_raise(float_flag_invalid, status);
5703             if (!aSign || floatx80_is_any_nan(a)) {
5704                 return INT64_MAX;
5705             }
5706             return INT64_MIN;
5707         }
5708         aSigExtra = 0;
5709     }
5710     else {
5711         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5712     }
5713     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5714
5715 }
5716
5717 /*----------------------------------------------------------------------------
5718 | Returns the result of converting the extended double-precision floating-
5719 | point value `a' to the 64-bit two's complement integer format.  The
5720 | conversion is performed according to the IEC/IEEE Standard for Binary
5721 | Floating-Point Arithmetic, except that the conversion is always rounded
5722 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5723 | Otherwise, if the conversion overflows, the largest integer with the same
5724 | sign as `a' is returned.
5725 *----------------------------------------------------------------------------*/
5726
5727 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5728 {
5729     bool aSign;
5730     int32_t aExp, shiftCount;
5731     uint64_t aSig;
5732     int64_t z;
5733
5734     if (floatx80_invalid_encoding(a)) {
5735         float_raise(float_flag_invalid, status);
5736         return 1ULL << 63;
5737     }
5738     aSig = extractFloatx80Frac( a );
5739     aExp = extractFloatx80Exp( a );
5740     aSign = extractFloatx80Sign( a );
5741     shiftCount = aExp - 0x403E;
5742     if ( 0 <= shiftCount ) {
5743         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5744         if ( ( a.high != 0xC03E ) || aSig ) {
5745             float_raise(float_flag_invalid, status);
5746             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5747                 return INT64_MAX;
5748             }
5749         }
5750         return INT64_MIN;
5751     }
5752     else if ( aExp < 0x3FFF ) {
5753         if (aExp | aSig) {
5754             float_raise(float_flag_inexact, status);
5755         }
5756         return 0;
5757     }
5758     z = aSig>>( - shiftCount );
5759     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5760         float_raise(float_flag_inexact, status);
5761     }
5762     if ( aSign ) z = - z;
5763     return z;
5764
5765 }
5766
5767 /*----------------------------------------------------------------------------
5768 | Returns the result of converting the extended double-precision floating-
5769 | point value `a' to the single-precision floating-point format.  The
5770 | conversion is performed according to the IEC/IEEE Standard for Binary
5771 | Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/
5773
5774 float32 floatx80_to_float32(floatx80 a, float_status *status)
5775 {
5776     bool aSign;
5777     int32_t aExp;
5778     uint64_t aSig;
5779
5780     if (floatx80_invalid_encoding(a)) {
5781         float_raise(float_flag_invalid, status);
5782         return float32_default_nan(status);
5783     }
5784     aSig = extractFloatx80Frac( a );
5785     aExp = extractFloatx80Exp( a );
5786     aSign = extractFloatx80Sign( a );
5787     if ( aExp == 0x7FFF ) {
5788         if ( (uint64_t) ( aSig<<1 ) ) {
5789             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5790                                              status);
5791             return float32_silence_nan(res, status);
5792         }
5793         return packFloat32( aSign, 0xFF, 0 );
5794     }
5795     shift64RightJamming( aSig, 33, &aSig );
5796     if ( aExp || aSig ) aExp -= 0x3F81;
5797     return roundAndPackFloat32(aSign, aExp, aSig, status);
5798
5799 }
5800
5801 /*----------------------------------------------------------------------------
5802 | Returns the result of converting the extended double-precision floating-
5803 | point value `a' to the double-precision floating-point format.  The
5804 | conversion is performed according to the IEC/IEEE Standard for Binary
5805 | Floating-Point Arithmetic.
5806 *----------------------------------------------------------------------------*/
5807
5808 float64 floatx80_to_float64(floatx80 a, float_status *status)
5809 {
5810     bool aSign;
5811     int32_t aExp;
5812     uint64_t aSig, zSig;
5813
5814     if (floatx80_invalid_encoding(a)) {
5815         float_raise(float_flag_invalid, status);
5816         return float64_default_nan(status);
5817     }
5818     aSig = extractFloatx80Frac( a );
5819     aExp = extractFloatx80Exp( a );
5820     aSign = extractFloatx80Sign( a );
5821     if ( aExp == 0x7FFF ) {
5822         if ( (uint64_t) ( aSig<<1 ) ) {
5823             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5824                                              status);
5825             return float64_silence_nan(res, status);
5826         }
5827         return packFloat64( aSign, 0x7FF, 0 );
5828     }
5829     shift64RightJamming( aSig, 1, &zSig );
5830     if ( aExp || aSig ) aExp -= 0x3C01;
5831     return roundAndPackFloat64(aSign, aExp, zSig, status);
5832
5833 }
5834
5835 /*----------------------------------------------------------------------------
5836 | Returns the result of converting the extended double-precision floating-
5837 | point value `a' to the quadruple-precision floating-point format.  The
5838 | conversion is performed according to the IEC/IEEE Standard for Binary
5839 | Floating-Point Arithmetic.
5840 *----------------------------------------------------------------------------*/
5841
5842 float128 floatx80_to_float128(floatx80 a, float_status *status)
5843 {
5844     bool aSign;
5845     int aExp;
5846     uint64_t aSig, zSig0, zSig1;
5847
5848     if (floatx80_invalid_encoding(a)) {
5849         float_raise(float_flag_invalid, status);
5850         return float128_default_nan(status);
5851     }
5852     aSig = extractFloatx80Frac( a );
5853     aExp = extractFloatx80Exp( a );
5854     aSign = extractFloatx80Sign( a );
5855     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5856         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5857                                            status);
5858         return float128_silence_nan(res, status);
5859     }
5860     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5861     return packFloat128( aSign, aExp, zSig0, zSig1 );
5862
5863 }
5864
5865 /*----------------------------------------------------------------------------
5866 | Rounds the extended double-precision floating-point value `a'
5867 | to the precision provided by floatx80_rounding_precision and returns the
5868 | result as an extended double-precision floating-point value.
5869 | The operation is performed according to the IEC/IEEE Standard for Binary
5870 | Floating-Point Arithmetic.
5871 *----------------------------------------------------------------------------*/
5872
5873 floatx80 floatx80_round(floatx80 a, float_status *status)
5874 {
5875     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5876                                 extractFloatx80Sign(a),
5877                                 extractFloatx80Exp(a),
5878                                 extractFloatx80Frac(a), 0, status);
5879 }
5880
5881 /*----------------------------------------------------------------------------
5882 | Rounds the extended double-precision floating-point value `a' to an integer,
5883 | and returns the result as an extended quadruple-precision floating-point
5884 | value.  The operation is performed according to the IEC/IEEE Standard for
5885 | Binary Floating-Point Arithmetic.
5886 *----------------------------------------------------------------------------*/
5887
5888 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5889 {
5890     bool aSign;
5891     int32_t aExp;
5892     uint64_t lastBitMask, roundBitsMask;
5893     floatx80 z;
5894
5895     if (floatx80_invalid_encoding(a)) {
5896         float_raise(float_flag_invalid, status);
5897         return floatx80_default_nan(status);
5898     }
5899     aExp = extractFloatx80Exp( a );
5900     if ( 0x403E <= aExp ) {
5901         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5902             return propagateFloatx80NaN(a, a, status);
5903         }
5904         return a;
5905     }
5906     if ( aExp < 0x3FFF ) {
5907         if (    ( aExp == 0 )
5908              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5909             return a;
5910         }
5911         float_raise(float_flag_inexact, status);
5912         aSign = extractFloatx80Sign( a );
5913         switch (status->float_rounding_mode) {
5914          case float_round_nearest_even:
5915             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5916                ) {
5917                 return
5918                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5919             }
5920             break;
5921         case float_round_ties_away:
5922             if (aExp == 0x3FFE) {
5923                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5924             }
5925             break;
5926          case float_round_down:
5927             return
5928                   aSign ?
5929                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5930                 : packFloatx80( 0, 0, 0 );
5931          case float_round_up:
5932             return
5933                   aSign ? packFloatx80( 1, 0, 0 )
5934                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5935
5936         case float_round_to_zero:
5937             break;
5938         default:
5939             g_assert_not_reached();
5940         }
5941         return packFloatx80( aSign, 0, 0 );
5942     }
5943     lastBitMask = 1;
5944     lastBitMask <<= 0x403E - aExp;
5945     roundBitsMask = lastBitMask - 1;
5946     z = a;
5947     switch (status->float_rounding_mode) {
5948     case float_round_nearest_even:
5949         z.low += lastBitMask>>1;
5950         if ((z.low & roundBitsMask) == 0) {
5951             z.low &= ~lastBitMask;
5952         }
5953         break;
5954     case float_round_ties_away:
5955         z.low += lastBitMask >> 1;
5956         break;
5957     case float_round_to_zero:
5958         break;
5959     case float_round_up:
5960         if (!extractFloatx80Sign(z)) {
5961             z.low += roundBitsMask;
5962         }
5963         break;
5964     case float_round_down:
5965         if (extractFloatx80Sign(z)) {
5966             z.low += roundBitsMask;
5967         }
5968         break;
5969     default:
5970         abort();
5971     }
5972     z.low &= ~ roundBitsMask;
5973     if ( z.low == 0 ) {
5974         ++z.high;
5975         z.low = UINT64_C(0x8000000000000000);
5976     }
5977     if (z.low != a.low) {
5978         float_raise(float_flag_inexact, status);
5979     }
5980     return z;
5981
5982 }
5983
5984 /*----------------------------------------------------------------------------
5985 | Returns the result of adding the absolute values of the extended double-
5986 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5987 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5988 | The addition is performed according to the IEC/IEEE Standard for Binary
5989 | Floating-Point Arithmetic.
5990 *----------------------------------------------------------------------------*/
5991
5992 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5993                                 float_status *status)
5994 {
5995     int32_t aExp, bExp, zExp;
5996     uint64_t aSig, bSig, zSig0, zSig1;
5997     int32_t expDiff;
5998
5999     aSig = extractFloatx80Frac( a );
6000     aExp = extractFloatx80Exp( a );
6001     bSig = extractFloatx80Frac( b );
6002     bExp = extractFloatx80Exp( b );
6003     expDiff = aExp - bExp;
6004     if ( 0 < expDiff ) {
6005         if ( aExp == 0x7FFF ) {
6006             if ((uint64_t)(aSig << 1)) {
6007                 return propagateFloatx80NaN(a, b, status);
6008             }
6009             return a;
6010         }
6011         if ( bExp == 0 ) --expDiff;
6012         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6013         zExp = aExp;
6014     }
6015     else if ( expDiff < 0 ) {
6016         if ( bExp == 0x7FFF ) {
6017             if ((uint64_t)(bSig << 1)) {
6018                 return propagateFloatx80NaN(a, b, status);
6019             }
6020             return packFloatx80(zSign,
6021                                 floatx80_infinity_high,
6022                                 floatx80_infinity_low);
6023         }
6024         if ( aExp == 0 ) ++expDiff;
6025         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6026         zExp = bExp;
6027     }
6028     else {
6029         if ( aExp == 0x7FFF ) {
6030             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6031                 return propagateFloatx80NaN(a, b, status);
6032             }
6033             return a;
6034         }
6035         zSig1 = 0;
6036         zSig0 = aSig + bSig;
6037         if ( aExp == 0 ) {
6038             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
6039                 /* At least one of the values is a pseudo-denormal,
6040                  * and there is a carry out of the result.  */
6041                 zExp = 1;
6042                 goto shiftRight1;
6043             }
6044             if (zSig0 == 0) {
6045                 return packFloatx80(zSign, 0, 0);
6046             }
6047             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
6048             goto roundAndPack;
6049         }
6050         zExp = aExp;
6051         goto shiftRight1;
6052     }
6053     zSig0 = aSig + bSig;
6054     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
6055  shiftRight1:
6056     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
6057     zSig0 |= UINT64_C(0x8000000000000000);
6058     ++zExp;
6059  roundAndPack:
6060     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6061                                 zSign, zExp, zSig0, zSig1, status);
6062 }
6063
6064 /*----------------------------------------------------------------------------
6065 | Returns the result of subtracting the absolute values of the extended
6066 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6067 | difference is negated before being returned.  `zSign' is ignored if the
6068 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6069 | Standard for Binary Floating-Point Arithmetic.
6070 *----------------------------------------------------------------------------*/
6071
6072 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6073                                 float_status *status)
6074 {
6075     int32_t aExp, bExp, zExp;
6076     uint64_t aSig, bSig, zSig0, zSig1;
6077     int32_t expDiff;
6078
6079     aSig = extractFloatx80Frac( a );
6080     aExp = extractFloatx80Exp( a );
6081     bSig = extractFloatx80Frac( b );
6082     bExp = extractFloatx80Exp( b );
6083     expDiff = aExp - bExp;
6084     if ( 0 < expDiff ) goto aExpBigger;
6085     if ( expDiff < 0 ) goto bExpBigger;
6086     if ( aExp == 0x7FFF ) {
6087         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6088             return propagateFloatx80NaN(a, b, status);
6089         }
6090         float_raise(float_flag_invalid, status);
6091         return floatx80_default_nan(status);
6092     }
6093     if ( aExp == 0 ) {
6094         aExp = 1;
6095         bExp = 1;
6096     }
6097     zSig1 = 0;
6098     if ( bSig < aSig ) goto aBigger;
6099     if ( aSig < bSig ) goto bBigger;
6100     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6101  bExpBigger:
6102     if ( bExp == 0x7FFF ) {
6103         if ((uint64_t)(bSig << 1)) {
6104             return propagateFloatx80NaN(a, b, status);
6105         }
6106         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6107                             floatx80_infinity_low);
6108     }
6109     if ( aExp == 0 ) ++expDiff;
6110     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6111  bBigger:
6112     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6113     zExp = bExp;
6114     zSign ^= 1;
6115     goto normalizeRoundAndPack;
6116  aExpBigger:
6117     if ( aExp == 0x7FFF ) {
6118         if ((uint64_t)(aSig << 1)) {
6119             return propagateFloatx80NaN(a, b, status);
6120         }
6121         return a;
6122     }
6123     if ( bExp == 0 ) --expDiff;
6124     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6125  aBigger:
6126     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6127     zExp = aExp;
6128  normalizeRoundAndPack:
6129     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6130                                          zSign, zExp, zSig0, zSig1, status);
6131 }
6132
6133 /*----------------------------------------------------------------------------
6134 | Returns the result of adding the extended double-precision floating-point
6135 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6136 | Standard for Binary Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6138
6139 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6140 {
6141     bool aSign, bSign;
6142
6143     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6144         float_raise(float_flag_invalid, status);
6145         return floatx80_default_nan(status);
6146     }
6147     aSign = extractFloatx80Sign( a );
6148     bSign = extractFloatx80Sign( b );
6149     if ( aSign == bSign ) {
6150         return addFloatx80Sigs(a, b, aSign, status);
6151     }
6152     else {
6153         return subFloatx80Sigs(a, b, aSign, status);
6154     }
6155
6156 }
6157
6158 /*----------------------------------------------------------------------------
6159 | Returns the result of subtracting the extended double-precision floating-
6160 | point values `a' and `b'.  The operation is performed according to the
6161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6162 *----------------------------------------------------------------------------*/
6163
6164 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6165 {
6166     bool aSign, bSign;
6167
6168     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6169         float_raise(float_flag_invalid, status);
6170         return floatx80_default_nan(status);
6171     }
6172     aSign = extractFloatx80Sign( a );
6173     bSign = extractFloatx80Sign( b );
6174     if ( aSign == bSign ) {
6175         return subFloatx80Sigs(a, b, aSign, status);
6176     }
6177     else {
6178         return addFloatx80Sigs(a, b, aSign, status);
6179     }
6180
6181 }
6182
6183 /*----------------------------------------------------------------------------
6184 | Returns the result of multiplying the extended double-precision floating-
6185 | point values `a' and `b'.  The operation is performed according to the
6186 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6188
6189 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6190 {
6191     bool aSign, bSign, zSign;
6192     int32_t aExp, bExp, zExp;
6193     uint64_t aSig, bSig, zSig0, zSig1;
6194
6195     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6196         float_raise(float_flag_invalid, status);
6197         return floatx80_default_nan(status);
6198     }
6199     aSig = extractFloatx80Frac( a );
6200     aExp = extractFloatx80Exp( a );
6201     aSign = extractFloatx80Sign( a );
6202     bSig = extractFloatx80Frac( b );
6203     bExp = extractFloatx80Exp( b );
6204     bSign = extractFloatx80Sign( b );
6205     zSign = aSign ^ bSign;
6206     if ( aExp == 0x7FFF ) {
6207         if (    (uint64_t) ( aSig<<1 )
6208              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6209             return propagateFloatx80NaN(a, b, status);
6210         }
6211         if ( ( bExp | bSig ) == 0 ) goto invalid;
6212         return packFloatx80(zSign, floatx80_infinity_high,
6213                                    floatx80_infinity_low);
6214     }
6215     if ( bExp == 0x7FFF ) {
6216         if ((uint64_t)(bSig << 1)) {
6217             return propagateFloatx80NaN(a, b, status);
6218         }
6219         if ( ( aExp | aSig ) == 0 ) {
6220  invalid:
6221             float_raise(float_flag_invalid, status);
6222             return floatx80_default_nan(status);
6223         }
6224         return packFloatx80(zSign, floatx80_infinity_high,
6225                                    floatx80_infinity_low);
6226     }
6227     if ( aExp == 0 ) {
6228         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6229         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6230     }
6231     if ( bExp == 0 ) {
6232         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6233         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6234     }
6235     zExp = aExp + bExp - 0x3FFE;
6236     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6237     if ( 0 < (int64_t) zSig0 ) {
6238         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6239         --zExp;
6240     }
6241     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6242                                 zSign, zExp, zSig0, zSig1, status);
6243 }
6244
6245 /*----------------------------------------------------------------------------
6246 | Returns the result of dividing the extended double-precision floating-point
6247 | value `a' by the corresponding value `b'.  The operation is performed
6248 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6249 *----------------------------------------------------------------------------*/
6250
6251 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6252 {
6253     bool aSign, bSign, zSign;
6254     int32_t aExp, bExp, zExp;
6255     uint64_t aSig, bSig, zSig0, zSig1;
6256     uint64_t rem0, rem1, rem2, term0, term1, term2;
6257
6258     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6259         float_raise(float_flag_invalid, status);
6260         return floatx80_default_nan(status);
6261     }
6262     aSig = extractFloatx80Frac( a );
6263     aExp = extractFloatx80Exp( a );
6264     aSign = extractFloatx80Sign( a );
6265     bSig = extractFloatx80Frac( b );
6266     bExp = extractFloatx80Exp( b );
6267     bSign = extractFloatx80Sign( b );
6268     zSign = aSign ^ bSign;
6269     if ( aExp == 0x7FFF ) {
6270         if ((uint64_t)(aSig << 1)) {
6271             return propagateFloatx80NaN(a, b, status);
6272         }
6273         if ( bExp == 0x7FFF ) {
6274             if ((uint64_t)(bSig << 1)) {
6275                 return propagateFloatx80NaN(a, b, status);
6276             }
6277             goto invalid;
6278         }
6279         return packFloatx80(zSign, floatx80_infinity_high,
6280                                    floatx80_infinity_low);
6281     }
6282     if ( bExp == 0x7FFF ) {
6283         if ((uint64_t)(bSig << 1)) {
6284             return propagateFloatx80NaN(a, b, status);
6285         }
6286         return packFloatx80( zSign, 0, 0 );
6287     }
6288     if ( bExp == 0 ) {
6289         if ( bSig == 0 ) {
6290             if ( ( aExp | aSig ) == 0 ) {
6291  invalid:
6292                 float_raise(float_flag_invalid, status);
6293                 return floatx80_default_nan(status);
6294             }
6295             float_raise(float_flag_divbyzero, status);
6296             return packFloatx80(zSign, floatx80_infinity_high,
6297                                        floatx80_infinity_low);
6298         }
6299         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6300     }
6301     if ( aExp == 0 ) {
6302         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6303         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6304     }
6305     zExp = aExp - bExp + 0x3FFE;
6306     rem1 = 0;
6307     if ( bSig <= aSig ) {
6308         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6309         ++zExp;
6310     }
6311     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6312     mul64To128( bSig, zSig0, &term0, &term1 );
6313     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6314     while ( (int64_t) rem0 < 0 ) {
6315         --zSig0;
6316         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6317     }
6318     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6319     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6320         mul64To128( bSig, zSig1, &term1, &term2 );
6321         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6322         while ( (int64_t) rem1 < 0 ) {
6323             --zSig1;
6324             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6325         }
6326         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6327     }
6328     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6329                                 zSign, zExp, zSig0, zSig1, status);
6330 }
6331
6332 /*----------------------------------------------------------------------------
6333 | Returns the remainder of the extended double-precision floating-point value
6334 | `a' with respect to the corresponding value `b'.  The operation is performed
6335 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6336 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6337 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6338 | the absolute value of the integer quotient.
6339 *----------------------------------------------------------------------------*/
6340
6341 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6342                          float_status *status)
6343 {
6344     bool aSign, zSign;
6345     int32_t aExp, bExp, expDiff, aExpOrig;
6346     uint64_t aSig0, aSig1, bSig;
6347     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6348
6349     *quotient = 0;
6350     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6351         float_raise(float_flag_invalid, status);
6352         return floatx80_default_nan(status);
6353     }
6354     aSig0 = extractFloatx80Frac( a );
6355     aExpOrig = aExp = extractFloatx80Exp( a );
6356     aSign = extractFloatx80Sign( a );
6357     bSig = extractFloatx80Frac( b );
6358     bExp = extractFloatx80Exp( b );
6359     if ( aExp == 0x7FFF ) {
6360         if (    (uint64_t) ( aSig0<<1 )
6361              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6362             return propagateFloatx80NaN(a, b, status);
6363         }
6364         goto invalid;
6365     }
6366     if ( bExp == 0x7FFF ) {
6367         if ((uint64_t)(bSig << 1)) {
6368             return propagateFloatx80NaN(a, b, status);
6369         }
6370         if (aExp == 0 && aSig0 >> 63) {
6371             /*
6372              * Pseudo-denormal argument must be returned in normalized
6373              * form.
6374              */
6375             return packFloatx80(aSign, 1, aSig0);
6376         }
6377         return a;
6378     }
6379     if ( bExp == 0 ) {
6380         if ( bSig == 0 ) {
6381  invalid:
6382             float_raise(float_flag_invalid, status);
6383             return floatx80_default_nan(status);
6384         }
6385         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6386     }
6387     if ( aExp == 0 ) {
6388         if ( aSig0 == 0 ) return a;
6389         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6390     }
6391     zSign = aSign;
6392     expDiff = aExp - bExp;
6393     aSig1 = 0;
6394     if ( expDiff < 0 ) {
6395         if ( mod || expDiff < -1 ) {
6396             if (aExp == 1 && aExpOrig == 0) {
6397                 /*
6398                  * Pseudo-denormal argument must be returned in
6399                  * normalized form.
6400                  */
6401                 return packFloatx80(aSign, aExp, aSig0);
6402             }
6403             return a;
6404         }
6405         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6406         expDiff = 0;
6407     }
6408     *quotient = q = ( bSig <= aSig0 );
6409     if ( q ) aSig0 -= bSig;
6410     expDiff -= 64;
6411     while ( 0 < expDiff ) {
6412         q = estimateDiv128To64( aSig0, aSig1, bSig );
6413         q = ( 2 < q ) ? q - 2 : 0;
6414         mul64To128( bSig, q, &term0, &term1 );
6415         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6416         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6417         expDiff -= 62;
6418         *quotient <<= 62;
6419         *quotient += q;
6420     }
6421     expDiff += 64;
6422     if ( 0 < expDiff ) {
6423         q = estimateDiv128To64( aSig0, aSig1, bSig );
6424         q = ( 2 < q ) ? q - 2 : 0;
6425         q >>= 64 - expDiff;
6426         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6427         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6428         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6429         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6430             ++q;
6431             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6432         }
6433         if (expDiff < 64) {
6434             *quotient <<= expDiff;
6435         } else {
6436             *quotient = 0;
6437         }
6438         *quotient += q;
6439     }
6440     else {
6441         term1 = 0;
6442         term0 = bSig;
6443     }
6444     if (!mod) {
6445         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6446         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6447                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6448                         && ( q & 1 ) )
6449             ) {
6450             aSig0 = alternateASig0;
6451             aSig1 = alternateASig1;
6452             zSign = ! zSign;
6453             ++*quotient;
6454         }
6455     }
6456     return
6457         normalizeRoundAndPackFloatx80(
6458             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6459
6460 }
6461
6462 /*----------------------------------------------------------------------------
6463 | Returns the remainder of the extended double-precision floating-point value
6464 | `a' with respect to the corresponding value `b'.  The operation is performed
6465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6466 *----------------------------------------------------------------------------*/
6467
6468 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6469 {
6470     uint64_t quotient;
6471     return floatx80_modrem(a, b, false, &quotient, status);
6472 }
6473
6474 /*----------------------------------------------------------------------------
6475 | Returns the remainder of the extended double-precision floating-point value
6476 | `a' with respect to the corresponding value `b', with the quotient truncated
6477 | toward zero.
6478 *----------------------------------------------------------------------------*/
6479
6480 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6481 {
6482     uint64_t quotient;
6483     return floatx80_modrem(a, b, true, &quotient, status);
6484 }
6485
6486 /*----------------------------------------------------------------------------
6487 | Returns the square root of the extended double-precision floating-point
6488 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6489 | for Binary Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6491
6492 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6493 {
6494     bool aSign;
6495     int32_t aExp, zExp;
6496     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6497     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6498
6499     if (floatx80_invalid_encoding(a)) {
6500         float_raise(float_flag_invalid, status);
6501         return floatx80_default_nan(status);
6502     }
6503     aSig0 = extractFloatx80Frac( a );
6504     aExp = extractFloatx80Exp( a );
6505     aSign = extractFloatx80Sign( a );
6506     if ( aExp == 0x7FFF ) {
6507         if ((uint64_t)(aSig0 << 1)) {
6508             return propagateFloatx80NaN(a, a, status);
6509         }
6510         if ( ! aSign ) return a;
6511         goto invalid;
6512     }
6513     if ( aSign ) {
6514         if ( ( aExp | aSig0 ) == 0 ) return a;
6515  invalid:
6516         float_raise(float_flag_invalid, status);
6517         return floatx80_default_nan(status);
6518     }
6519     if ( aExp == 0 ) {
6520         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6521         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6522     }
6523     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6524     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6525     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6526     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6527     doubleZSig0 = zSig0<<1;
6528     mul64To128( zSig0, zSig0, &term0, &term1 );
6529     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6530     while ( (int64_t) rem0 < 0 ) {
6531         --zSig0;
6532         doubleZSig0 -= 2;
6533         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6534     }
6535     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6536     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6537         if ( zSig1 == 0 ) zSig1 = 1;
6538         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6539         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6540         mul64To128( zSig1, zSig1, &term2, &term3 );
6541         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6542         while ( (int64_t) rem1 < 0 ) {
6543             --zSig1;
6544             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6545             term3 |= 1;
6546             term2 |= doubleZSig0;
6547             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6548         }
6549         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6550     }
6551     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6552     zSig0 |= doubleZSig0;
6553     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6554                                 0, zExp, zSig0, zSig1, status);
6555 }
6556
6557 /*----------------------------------------------------------------------------
6558 | Returns the result of converting the quadruple-precision floating-point
6559 | value `a' to the 32-bit two's complement integer format.  The conversion
6560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6561 | Arithmetic---which means in particular that the conversion is rounded
6562 | according to the current rounding mode.  If `a' is a NaN, the largest
6563 | positive integer is returned.  Otherwise, if the conversion overflows, the
6564 | largest integer with the same sign as `a' is returned.
6565 *----------------------------------------------------------------------------*/
6566
6567 int32_t float128_to_int32(float128 a, float_status *status)
6568 {
6569     bool aSign;
6570     int32_t aExp, shiftCount;
6571     uint64_t aSig0, aSig1;
6572
6573     aSig1 = extractFloat128Frac1( a );
6574     aSig0 = extractFloat128Frac0( a );
6575     aExp = extractFloat128Exp( a );
6576     aSign = extractFloat128Sign( a );
6577     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6578     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6579     aSig0 |= ( aSig1 != 0 );
6580     shiftCount = 0x4028 - aExp;
6581     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6582     return roundAndPackInt32(aSign, aSig0, status);
6583
6584 }
6585
6586 /*----------------------------------------------------------------------------
6587 | Returns the result of converting the quadruple-precision floating-point
6588 | value `a' to the 32-bit two's complement integer format.  The conversion
6589 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6590 | Arithmetic, except that the conversion is always rounded toward zero.  If
6591 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6592 | conversion overflows, the largest integer with the same sign as `a' is
6593 | returned.
6594 *----------------------------------------------------------------------------*/
6595
6596 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6597 {
6598     bool aSign;
6599     int32_t aExp, shiftCount;
6600     uint64_t aSig0, aSig1, savedASig;
6601     int32_t z;
6602
6603     aSig1 = extractFloat128Frac1( a );
6604     aSig0 = extractFloat128Frac0( a );
6605     aExp = extractFloat128Exp( a );
6606     aSign = extractFloat128Sign( a );
6607     aSig0 |= ( aSig1 != 0 );
6608     if ( 0x401E < aExp ) {
6609         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6610         goto invalid;
6611     }
6612     else if ( aExp < 0x3FFF ) {
6613         if (aExp || aSig0) {
6614             float_raise(float_flag_inexact, status);
6615         }
6616         return 0;
6617     }
6618     aSig0 |= UINT64_C(0x0001000000000000);
6619     shiftCount = 0x402F - aExp;
6620     savedASig = aSig0;
6621     aSig0 >>= shiftCount;
6622     z = aSig0;
6623     if ( aSign ) z = - z;
6624     if ( ( z < 0 ) ^ aSign ) {
6625  invalid:
6626         float_raise(float_flag_invalid, status);
6627         return aSign ? INT32_MIN : INT32_MAX;
6628     }
6629     if ( ( aSig0<<shiftCount ) != savedASig ) {
6630         float_raise(float_flag_inexact, status);
6631     }
6632     return z;
6633
6634 }
6635
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of converting the quadruple-precision floating-point
6638 | value `a' to the 64-bit two's complement integer format.  The conversion
6639 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6640 | Arithmetic---which means in particular that the conversion is rounded
6641 | according to the current rounding mode.  If `a' is a NaN, the largest
6642 | positive integer is returned.  Otherwise, if the conversion overflows, the
6643 | largest integer with the same sign as `a' is returned.
6644 *----------------------------------------------------------------------------*/
6645
6646 int64_t float128_to_int64(float128 a, float_status *status)
6647 {
6648     bool aSign;
6649     int32_t aExp, shiftCount;
6650     uint64_t aSig0, aSig1;
6651
6652     aSig1 = extractFloat128Frac1( a );
6653     aSig0 = extractFloat128Frac0( a );
6654     aExp = extractFloat128Exp( a );
6655     aSign = extractFloat128Sign( a );
6656     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6657     shiftCount = 0x402F - aExp;
6658     if ( shiftCount <= 0 ) {
6659         if ( 0x403E < aExp ) {
6660             float_raise(float_flag_invalid, status);
6661             if (    ! aSign
6662                  || (    ( aExp == 0x7FFF )
6663                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6664                     )
6665                ) {
6666                 return INT64_MAX;
6667             }
6668             return INT64_MIN;
6669         }
6670         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6671     }
6672     else {
6673         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6674     }
6675     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6676
6677 }
6678
6679 /*----------------------------------------------------------------------------
6680 | Returns the result of converting the quadruple-precision floating-point
6681 | value `a' to the 64-bit two's complement integer format.  The conversion
6682 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6683 | Arithmetic, except that the conversion is always rounded toward zero.
6684 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6685 | the conversion overflows, the largest integer with the same sign as `a' is
6686 | returned.
6687 *----------------------------------------------------------------------------*/
6688
6689 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6690 {
6691     bool aSign;
6692     int32_t aExp, shiftCount;
6693     uint64_t aSig0, aSig1;
6694     int64_t z;
6695
6696     aSig1 = extractFloat128Frac1( a );
6697     aSig0 = extractFloat128Frac0( a );
6698     aExp = extractFloat128Exp( a );
6699     aSign = extractFloat128Sign( a );
6700     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6701     shiftCount = aExp - 0x402F;
6702     if ( 0 < shiftCount ) {
6703         if ( 0x403E <= aExp ) {
6704             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6705             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6706                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6707                 if (aSig1) {
6708                     float_raise(float_flag_inexact, status);
6709                 }
6710             }
6711             else {
6712                 float_raise(float_flag_invalid, status);
6713                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6714                     return INT64_MAX;
6715                 }
6716             }
6717             return INT64_MIN;
6718         }
6719         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6720         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6721             float_raise(float_flag_inexact, status);
6722         }
6723     }
6724     else {
6725         if ( aExp < 0x3FFF ) {
6726             if ( aExp | aSig0 | aSig1 ) {
6727                 float_raise(float_flag_inexact, status);
6728             }
6729             return 0;
6730         }
6731         z = aSig0>>( - shiftCount );
6732         if (    aSig1
6733              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6734             float_raise(float_flag_inexact, status);
6735         }
6736     }
6737     if ( aSign ) z = - z;
6738     return z;
6739
6740 }
6741
6742 /*----------------------------------------------------------------------------
6743 | Returns the result of converting the quadruple-precision floating-point value
6744 | `a' to the 64-bit unsigned integer format.  The conversion is
6745 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6746 | Arithmetic---which means in particular that the conversion is rounded
6747 | according to the current rounding mode.  If `a' is a NaN, the largest
6748 | positive integer is returned.  If the conversion overflows, the
6749 | largest unsigned integer is returned.  If 'a' is negative, the value is
6750 | rounded and zero is returned; negative values that do not round to zero
6751 | will raise the inexact exception.
6752 *----------------------------------------------------------------------------*/
6753
6754 uint64_t float128_to_uint64(float128 a, float_status *status)
6755 {
6756     bool aSign;
6757     int aExp;
6758     int shiftCount;
6759     uint64_t aSig0, aSig1;
6760
6761     aSig0 = extractFloat128Frac0(a);
6762     aSig1 = extractFloat128Frac1(a);
6763     aExp = extractFloat128Exp(a);
6764     aSign = extractFloat128Sign(a);
6765     if (aSign && (aExp > 0x3FFE)) {
6766         float_raise(float_flag_invalid, status);
6767         if (float128_is_any_nan(a)) {
6768             return UINT64_MAX;
6769         } else {
6770             return 0;
6771         }
6772     }
6773     if (aExp) {
6774         aSig0 |= UINT64_C(0x0001000000000000);
6775     }
6776     shiftCount = 0x402F - aExp;
6777     if (shiftCount <= 0) {
6778         if (0x403E < aExp) {
6779             float_raise(float_flag_invalid, status);
6780             return UINT64_MAX;
6781         }
6782         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6783     } else {
6784         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6785     }
6786     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6787 }
6788
6789 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6790 {
6791     uint64_t v;
6792     signed char current_rounding_mode = status->float_rounding_mode;
6793
6794     set_float_rounding_mode(float_round_to_zero, status);
6795     v = float128_to_uint64(a, status);
6796     set_float_rounding_mode(current_rounding_mode, status);
6797
6798     return v;
6799 }
6800
6801 /*----------------------------------------------------------------------------
6802 | Returns the result of converting the quadruple-precision floating-point
6803 | value `a' to the 32-bit unsigned integer format.  The conversion
6804 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6805 | Arithmetic except that the conversion is always rounded toward zero.
6806 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6807 | if the conversion overflows, the largest unsigned integer is returned.
6808 | If 'a' is negative, the value is rounded and zero is returned; negative
6809 | values that do not round to zero will raise the inexact exception.
6810 *----------------------------------------------------------------------------*/
6811
6812 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6813 {
6814     uint64_t v;
6815     uint32_t res;
6816     int old_exc_flags = get_float_exception_flags(status);
6817
6818     v = float128_to_uint64_round_to_zero(a, status);
6819     if (v > 0xffffffff) {
6820         res = 0xffffffff;
6821     } else {
6822         return v;
6823     }
6824     set_float_exception_flags(old_exc_flags, status);
6825     float_raise(float_flag_invalid, status);
6826     return res;
6827 }
6828
6829 /*----------------------------------------------------------------------------
6830 | Returns the result of converting the quadruple-precision floating-point value
6831 | `a' to the 32-bit unsigned integer format.  The conversion is
6832 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6833 | Arithmetic---which means in particular that the conversion is rounded
6834 | according to the current rounding mode.  If `a' is a NaN, the largest
6835 | positive integer is returned.  If the conversion overflows, the
6836 | largest unsigned integer is returned.  If 'a' is negative, the value is
6837 | rounded and zero is returned; negative values that do not round to zero
6838 | will raise the inexact exception.
6839 *----------------------------------------------------------------------------*/
6840
6841 uint32_t float128_to_uint32(float128 a, float_status *status)
6842 {
6843     uint64_t v;
6844     uint32_t res;
6845     int old_exc_flags = get_float_exception_flags(status);
6846
6847     v = float128_to_uint64(a, status);
6848     if (v > 0xffffffff) {
6849         res = 0xffffffff;
6850     } else {
6851         return v;
6852     }
6853     set_float_exception_flags(old_exc_flags, status);
6854     float_raise(float_flag_invalid, status);
6855     return res;
6856 }
6857
6858 /*----------------------------------------------------------------------------
6859 | Returns the result of converting the quadruple-precision floating-point
6860 | value `a' to the extended double-precision floating-point format.  The
6861 | conversion is performed according to the IEC/IEEE Standard for Binary
6862 | Floating-Point Arithmetic.
6863 *----------------------------------------------------------------------------*/
6864
6865 floatx80 float128_to_floatx80(float128 a, float_status *status)
6866 {
6867     bool aSign;
6868     int32_t aExp;
6869     uint64_t aSig0, aSig1;
6870
6871     aSig1 = extractFloat128Frac1( a );
6872     aSig0 = extractFloat128Frac0( a );
6873     aExp = extractFloat128Exp( a );
6874     aSign = extractFloat128Sign( a );
6875     if ( aExp == 0x7FFF ) {
6876         if ( aSig0 | aSig1 ) {
6877             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6878                                                status);
6879             return floatx80_silence_nan(res, status);
6880         }
6881         return packFloatx80(aSign, floatx80_infinity_high,
6882                                    floatx80_infinity_low);
6883     }
6884     if ( aExp == 0 ) {
6885         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6886         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6887     }
6888     else {
6889         aSig0 |= UINT64_C(0x0001000000000000);
6890     }
6891     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6892     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6893
6894 }
6895
6896 /*----------------------------------------------------------------------------
6897 | Returns the remainder of the quadruple-precision floating-point value `a'
6898 | with respect to the corresponding value `b'.  The operation is performed
6899 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6900 *----------------------------------------------------------------------------*/
6901
6902 float128 float128_rem(float128 a, float128 b, float_status *status)
6903 {
6904     bool aSign, zSign;
6905     int32_t aExp, bExp, expDiff;
6906     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6907     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6908     int64_t sigMean0;
6909
6910     aSig1 = extractFloat128Frac1( a );
6911     aSig0 = extractFloat128Frac0( a );
6912     aExp = extractFloat128Exp( a );
6913     aSign = extractFloat128Sign( a );
6914     bSig1 = extractFloat128Frac1( b );
6915     bSig0 = extractFloat128Frac0( b );
6916     bExp = extractFloat128Exp( b );
6917     if ( aExp == 0x7FFF ) {
6918         if (    ( aSig0 | aSig1 )
6919              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6920             return propagateFloat128NaN(a, b, status);
6921         }
6922         goto invalid;
6923     }
6924     if ( bExp == 0x7FFF ) {
6925         if (bSig0 | bSig1) {
6926             return propagateFloat128NaN(a, b, status);
6927         }
6928         return a;
6929     }
6930     if ( bExp == 0 ) {
6931         if ( ( bSig0 | bSig1 ) == 0 ) {
6932  invalid:
6933             float_raise(float_flag_invalid, status);
6934             return float128_default_nan(status);
6935         }
6936         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6937     }
6938     if ( aExp == 0 ) {
6939         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6940         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6941     }
6942     expDiff = aExp - bExp;
6943     if ( expDiff < -1 ) return a;
6944     shortShift128Left(
6945         aSig0 | UINT64_C(0x0001000000000000),
6946         aSig1,
6947         15 - ( expDiff < 0 ),
6948         &aSig0,
6949         &aSig1
6950     );
6951     shortShift128Left(
6952         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6953     q = le128( bSig0, bSig1, aSig0, aSig1 );
6954     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6955     expDiff -= 64;
6956     while ( 0 < expDiff ) {
6957         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6958         q = ( 4 < q ) ? q - 4 : 0;
6959         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6960         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6961         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6962         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6963         expDiff -= 61;
6964     }
6965     if ( -64 < expDiff ) {
6966         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6967         q = ( 4 < q ) ? q - 4 : 0;
6968         q >>= - expDiff;
6969         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6970         expDiff += 52;
6971         if ( expDiff < 0 ) {
6972             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6973         }
6974         else {
6975             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6976         }
6977         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6978         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6979     }
6980     else {
6981         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6982         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6983     }
6984     do {
6985         alternateASig0 = aSig0;
6986         alternateASig1 = aSig1;
6987         ++q;
6988         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6989     } while ( 0 <= (int64_t) aSig0 );
6990     add128(
6991         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6992     if (    ( sigMean0 < 0 )
6993          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6994         aSig0 = alternateASig0;
6995         aSig1 = alternateASig1;
6996     }
6997     zSign = ( (int64_t) aSig0 < 0 );
6998     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6999     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7000                                          status);
7001 }
7002
7003 /*----------------------------------------------------------------------------
7004 | Returns the square root of the quadruple-precision floating-point value `a'.
7005 | The operation is performed according to the IEC/IEEE Standard for Binary
7006 | Floating-Point Arithmetic.
7007 *----------------------------------------------------------------------------*/
7008
7009 float128 float128_sqrt(float128 a, float_status *status)
7010 {
7011     bool aSign;
7012     int32_t aExp, zExp;
7013     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7014     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7015
7016     aSig1 = extractFloat128Frac1( a );
7017     aSig0 = extractFloat128Frac0( a );
7018     aExp = extractFloat128Exp( a );
7019     aSign = extractFloat128Sign( a );
7020     if ( aExp == 0x7FFF ) {
7021         if (aSig0 | aSig1) {
7022             return propagateFloat128NaN(a, a, status);
7023         }
7024         if ( ! aSign ) return a;
7025         goto invalid;
7026     }
7027     if ( aSign ) {
7028         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7029  invalid:
7030         float_raise(float_flag_invalid, status);
7031         return float128_default_nan(status);
7032     }
7033     if ( aExp == 0 ) {
7034         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7035         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7036     }
7037     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7038     aSig0 |= UINT64_C(0x0001000000000000);
7039     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7040     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7041     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7042     doubleZSig0 = zSig0<<1;
7043     mul64To128( zSig0, zSig0, &term0, &term1 );
7044     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7045     while ( (int64_t) rem0 < 0 ) {
7046         --zSig0;
7047         doubleZSig0 -= 2;
7048         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7049     }
7050     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7051     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7052         if ( zSig1 == 0 ) zSig1 = 1;
7053         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7054         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7055         mul64To128( zSig1, zSig1, &term2, &term3 );
7056         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7057         while ( (int64_t) rem1 < 0 ) {
7058             --zSig1;
7059             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7060             term3 |= 1;
7061             term2 |= doubleZSig0;
7062             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7063         }
7064         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7065     }
7066     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7067     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7068
7069 }
7070
7071 static inline FloatRelation
7072 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7073                           float_status *status)
7074 {
7075     bool aSign, bSign;
7076
7077     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7078         float_raise(float_flag_invalid, status);
7079         return float_relation_unordered;
7080     }
7081     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7082           ( extractFloatx80Frac( a )<<1 ) ) ||
7083         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7084           ( extractFloatx80Frac( b )<<1 ) )) {
7085         if (!is_quiet ||
7086             floatx80_is_signaling_nan(a, status) ||
7087             floatx80_is_signaling_nan(b, status)) {
7088             float_raise(float_flag_invalid, status);
7089         }
7090         return float_relation_unordered;
7091     }
7092     aSign = extractFloatx80Sign( a );
7093     bSign = extractFloatx80Sign( b );
7094     if ( aSign != bSign ) {
7095
7096         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7097              ( ( a.low | b.low ) == 0 ) ) {
7098             /* zero case */
7099             return float_relation_equal;
7100         } else {
7101             return 1 - (2 * aSign);
7102         }
7103     } else {
7104         /* Normalize pseudo-denormals before comparison.  */
7105         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7106             ++a.high;
7107         }
7108         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7109             ++b.high;
7110         }
7111         if (a.low == b.low && a.high == b.high) {
7112             return float_relation_equal;
7113         } else {
7114             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7115         }
7116     }
7117 }
7118
7119 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7120 {
7121     return floatx80_compare_internal(a, b, 0, status);
7122 }
7123
7124 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7125                                      float_status *status)
7126 {
7127     return floatx80_compare_internal(a, b, 1, status);
7128 }
7129
7130 static inline FloatRelation
7131 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7132                           float_status *status)
7133 {
7134     bool aSign, bSign;
7135
7136     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7137           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7138         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7139           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7140         if (!is_quiet ||
7141             float128_is_signaling_nan(a, status) ||
7142             float128_is_signaling_nan(b, status)) {
7143             float_raise(float_flag_invalid, status);
7144         }
7145         return float_relation_unordered;
7146     }
7147     aSign = extractFloat128Sign( a );
7148     bSign = extractFloat128Sign( b );
7149     if ( aSign != bSign ) {
7150         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7151             /* zero case */
7152             return float_relation_equal;
7153         } else {
7154             return 1 - (2 * aSign);
7155         }
7156     } else {
7157         if (a.low == b.low && a.high == b.high) {
7158             return float_relation_equal;
7159         } else {
7160             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7161         }
7162     }
7163 }
7164
7165 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7166 {
7167     return float128_compare_internal(a, b, 0, status);
7168 }
7169
7170 FloatRelation float128_compare_quiet(float128 a, float128 b,
7171                                      float_status *status)
7172 {
7173     return float128_compare_internal(a, b, 1, status);
7174 }
7175
7176 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7177 {
7178     bool aSign;
7179     int32_t aExp;
7180     uint64_t aSig;
7181
7182     if (floatx80_invalid_encoding(a)) {
7183         float_raise(float_flag_invalid, status);
7184         return floatx80_default_nan(status);
7185     }
7186     aSig = extractFloatx80Frac( a );
7187     aExp = extractFloatx80Exp( a );
7188     aSign = extractFloatx80Sign( a );
7189
7190     if ( aExp == 0x7FFF ) {
7191         if ( aSig<<1 ) {
7192             return propagateFloatx80NaN(a, a, status);
7193         }
7194         return a;
7195     }
7196
7197     if (aExp == 0) {
7198         if (aSig == 0) {
7199             return a;
7200         }
7201         aExp++;
7202     }
7203
7204     if (n > 0x10000) {
7205         n = 0x10000;
7206     } else if (n < -0x10000) {
7207         n = -0x10000;
7208     }
7209
7210     aExp += n;
7211     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7212                                          aSign, aExp, aSig, 0, status);
7213 }
7214
7215 float128 float128_scalbn(float128 a, int n, float_status *status)
7216 {
7217     bool aSign;
7218     int32_t aExp;
7219     uint64_t aSig0, aSig1;
7220
7221     aSig1 = extractFloat128Frac1( a );
7222     aSig0 = extractFloat128Frac0( a );
7223     aExp = extractFloat128Exp( a );
7224     aSign = extractFloat128Sign( a );
7225     if ( aExp == 0x7FFF ) {
7226         if ( aSig0 | aSig1 ) {
7227             return propagateFloat128NaN(a, a, status);
7228         }
7229         return a;
7230     }
7231     if (aExp != 0) {
7232         aSig0 |= UINT64_C(0x0001000000000000);
7233     } else if (aSig0 == 0 && aSig1 == 0) {
7234         return a;
7235     } else {
7236         aExp++;
7237     }
7238
7239     if (n > 0x10000) {
7240         n = 0x10000;
7241     } else if (n < -0x10000) {
7242         n = -0x10000;
7243     }
7244
7245     aExp += n - 1;
7246     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7247                                          , status);
7248
7249 }
7250
7251 static void __attribute__((constructor)) softfloat_init(void)
7252 {
7253     union_float64 ua, ub, uc, ur;
7254
7255     if (QEMU_NO_HARDFLOAT) {
7256         return;
7257     }
7258     /*
7259      * Test that the host's FMA is not obviously broken. For example,
7260      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7261      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7262      */
7263     ua.s = 0x0020000000000001ULL;
7264     ub.s = 0x3ca0000000000000ULL;
7265     uc.s = 0x0020000000000000ULL;
7266     ur.h = fma(ua.h, ub.h, uc.h);
7267     if (ur.s != 0x0020000000000001ULL) {
7268         force_soft_fma = true;
7269     }
7270 }