4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    inexact flag
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
541 uint64_t frac_hm
; /* high-middle */
542 uint64_t frac_lm
; /* low-middle */
546 /* These apply to the most significant word of each FloatPartsN. */
547 #define DECOMPOSED_BINARY_POINT 63
548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
550 /* Structure holding all of the relevant parameters for a format.
551 * exp_size: the size of the exponent field
552 * exp_bias: the offset applied to the exponent field
553 * exp_max: the maximum normalised exponent
554 * frac_size: the size of the fraction field
555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
556 * The following are computed based the size of fraction
557 * frac_lsb: least significant bit of fraction
558 * frac_lsbm1: the bit below the least significant bit (for rounding)
559 * round_mask/roundeven_mask: masks used for rounding
560 * The following optional modifiers are available:
561 * arm_althp: handle ARM Alternative Half Precision
572 uint64_t roundeven_mask
;
576 /* Expand fields based on the size of exponent and fraction */
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size       = E,                             \
    .exp_bias       = ((1 << E) - 1) >> 1,           \
    .exp_max        = (1 << E) - 1,                  \
    .frac_size      = F,                             \
    .frac_shift     = (-F - 1) & 63,                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1, \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
588 static const FloatFmt float16_params
= {
592 static const FloatFmt float16_params_ahp
= {
597 static const FloatFmt bfloat16_params
= {
601 static const FloatFmt float32_params
= {
605 static const FloatFmt float64_params
= {
609 static const FloatFmt float128_params
= {
610 FLOAT_PARAMS(15, 112)
613 /* Unpack a float to parts, but do not canonicalize. */
614 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
616 const int f_size
= fmt
->frac_size
;
617 const int e_size
= fmt
->exp_size
;
619 *r
= (FloatParts64
) {
620 .cls
= float_class_unclassified
,
621 .sign
= extract64(raw
, f_size
+ e_size
, 1),
622 .exp
= extract64(raw
, f_size
, e_size
),
623 .frac
= extract64(raw
, 0, f_size
)
627 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
629 unpack_raw64(p
, &float16_params
, f
);
632 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
634 unpack_raw64(p
, &bfloat16_params
, f
);
637 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
639 unpack_raw64(p
, &float32_params
, f
);
642 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
644 unpack_raw64(p
, &float64_params
, f
);
647 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
649 const int f_size
= float128_params
.frac_size
- 64;
650 const int e_size
= float128_params
.exp_size
;
652 *p
= (FloatParts128
) {
653 .cls
= float_class_unclassified
,
654 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
655 .exp
= extract64(f
.high
, f_size
, e_size
),
656 .frac_hi
= extract64(f
.high
, 0, f_size
),
661 /* Pack a float from parts, but do not canonicalize. */
662 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
664 const int f_size
= fmt
->frac_size
;
665 const int e_size
= fmt
->exp_size
;
668 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
669 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
670 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
674 static inline float16
float16_pack_raw(const FloatParts64
*p
)
676 return make_float16(pack_raw64(p
, &float16_params
));
679 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
681 return pack_raw64(p
, &bfloat16_params
);
684 static inline float32
float32_pack_raw(const FloatParts64
*p
)
686 return make_float32(pack_raw64(p
, &float32_params
));
689 static inline float64
float64_pack_raw(const FloatParts64
*p
)
691 return make_float64(pack_raw64(p
, &float64_params
));
694 static float128
float128_pack_raw(const FloatParts128
*p
)
696 const int f_size
= float128_params
.frac_size
- 64;
697 const int e_size
= float128_params
.exp_size
;
700 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
701 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
702 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
703 return make_float128(hi
, p
->frac_lo
);
706 /*----------------------------------------------------------------------------
707 | Functions and definitions to determine: (1) whether tininess for underflow
708 | is detected before or after rounding by default, (2) what (if anything)
709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
711 | are propagated from function inputs to output. These details are target-
713 *----------------------------------------------------------------------------*/
714 #include "softfloat-specialize.c.inc"
716 #define PARTS_GENERIC_64_128(NAME, P) \
717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
719 #define PARTS_GENERIC_64_128_256(NAME, P) \
720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
721 (FloatParts128 *, parts128_##NAME), parts64_##NAME)
723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
726 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
727 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
731 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
733 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
738 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
739 FloatParts64
*c
, float_status
*s
,
740 int ab_mask
, int abc_mask
);
741 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
745 int ab_mask
, int abc_mask
);
747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
750 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
751 const FloatFmt
*fmt
);
752 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
753 const FloatFmt
*fmt
);
755 #define parts_canonicalize(A, S, F) \
756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
758 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
759 const FloatFmt
*fmt
);
760 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
761 const FloatFmt
*fmt
);
763 #define parts_uncanon(A, S, F) \
764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
766 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
767 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
768 static void parts256_add_normal(FloatParts256
*a
, FloatParts256
*b
);
770 #define parts_add_normal(A, B) \
771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B)
773 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
774 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
775 static bool parts256_sub_normal(FloatParts256
*a
, FloatParts256
*b
);
777 #define parts_sub_normal(A, B) \
778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)
780 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
781 float_status
*s
, bool subtract
);
782 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
783 float_status
*s
, bool subtract
);
785 #define parts_addsub(A, B, S, Z) \
786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
788 static FloatParts64
*parts64_mul(FloatParts64
*a
, FloatParts64
*b
,
790 static FloatParts128
*parts128_mul(FloatParts128
*a
, FloatParts128
*b
,
793 #define parts_mul(A, B, S) \
794 PARTS_GENERIC_64_128(mul, A)(A, B, S)
796 static FloatParts64
*parts64_muladd(FloatParts64
*a
, FloatParts64
*b
,
797 FloatParts64
*c
, int flags
,
799 static FloatParts128
*parts128_muladd(FloatParts128
*a
, FloatParts128
*b
,
800 FloatParts128
*c
, int flags
,
803 #define parts_muladd(A, B, C, Z, S) \
804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
807 * Helper functions for softfloat-parts.c.inc, per-size operations.
810 #define FRAC_GENERIC_64_128(NAME, P) \
811 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
813 #define FRAC_GENERIC_64_128_256(NAME, P) \
814 QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
815 (FloatParts128 *, frac128_##NAME), frac64_##NAME)
817 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
819 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
822 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
825 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
826 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
830 static bool frac256_add(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
833 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
834 r
->frac_lm
= uadd64_carry(a
->frac_lm
, b
->frac_lm
, &c
);
835 r
->frac_hm
= uadd64_carry(a
->frac_hm
, b
->frac_hm
, &c
);
836 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
840 #define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B)
842 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
844 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
847 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
849 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
850 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
853 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
855 static void frac64_allones(FloatParts64
*a
)
860 static void frac128_allones(FloatParts128
*a
)
862 a
->frac_hi
= a
->frac_lo
= -1;
865 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
867 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
869 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
872 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
874 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
876 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
881 return ta
< tb
? -1 : 1;
884 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
886 static void frac64_clear(FloatParts64
*a
)
891 static void frac128_clear(FloatParts128
*a
)
893 a
->frac_hi
= a
->frac_lo
= 0;
896 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
898 static bool frac64_eqz(FloatParts64
*a
)
903 static bool frac128_eqz(FloatParts128
*a
)
905 return (a
->frac_hi
| a
->frac_lo
) == 0;
908 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
910 static void frac64_mulw(FloatParts128
*r
, FloatParts64
*a
, FloatParts64
*b
)
912 mulu64(&r
->frac_lo
, &r
->frac_hi
, a
->frac
, b
->frac
);
915 static void frac128_mulw(FloatParts256
*r
, FloatParts128
*a
, FloatParts128
*b
)
917 mul128To256(a
->frac_hi
, a
->frac_lo
, b
->frac_hi
, b
->frac_lo
,
918 &r
->frac_hi
, &r
->frac_hm
, &r
->frac_lm
, &r
->frac_lo
);
921 #define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)
923 static void frac64_neg(FloatParts64
*a
)
928 static void frac128_neg(FloatParts128
*a
)
931 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
932 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
935 static void frac256_neg(FloatParts256
*a
)
938 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
939 a
->frac_lm
= usub64_borrow(0, a
->frac_lm
, &c
);
940 a
->frac_hm
= usub64_borrow(0, a
->frac_hm
, &c
);
941 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
944 #define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)
946 static int frac64_normalize(FloatParts64
*a
)
949 int shift
= clz64(a
->frac
);
956 static int frac128_normalize(FloatParts128
*a
)
959 int shl
= clz64(a
->frac_hi
);
962 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
963 a
->frac_lo
= (a
->frac_lo
<< shl
);
966 } else if (a
->frac_lo
) {
967 int shl
= clz64(a
->frac_lo
);
968 a
->frac_hi
= (a
->frac_lo
<< shl
);
975 static int frac256_normalize(FloatParts256
*a
)
977 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
978 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
990 a0
= a1
, a1
= a2
, a2
= a3
, a3
= 0;
993 a0
= a2
, a1
= a3
, a2
= 0, a3
= 0;
996 a0
= a3
, a1
= 0, a2
= 0, a3
= 0;
999 a0
= 0, a1
= 0, a2
= 0, a3
= 0;
1010 a0
= (a0
<< shl
) | (a1
>> shr
);
1011 a1
= (a1
<< shl
) | (a2
>> shr
);
1012 a2
= (a2
<< shl
) | (a3
>> shr
);
1023 #define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)
1025 static void frac64_shl(FloatParts64
*a
, int c
)
1030 static void frac128_shl(FloatParts128
*a
, int c
)
1032 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1035 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
1037 static void frac64_shr(FloatParts64
*a
, int c
)
1042 static void frac128_shr(FloatParts128
*a
, int c
)
1044 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1047 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
1049 static void frac64_shrjam(FloatParts64
*a
, int c
)
1051 shift64RightJamming(a
->frac
, c
, &a
->frac
);
1054 static void frac128_shrjam(FloatParts128
*a
, int c
)
1056 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
1059 static void frac256_shrjam(FloatParts256
*a
, int c
)
1061 uint64_t a0
= a
->frac_hi
, a1
= a
->frac_hm
;
1062 uint64_t a2
= a
->frac_lm
, a3
= a
->frac_lo
;
1063 uint64_t sticky
= 0;
1066 if (unlikely(c
== 0)) {
1068 } else if (likely(c
< 64)) {
1070 } else if (likely(c
< 256)) {
1071 if (unlikely(c
& 128)) {
1073 a3
= a1
, a2
= a0
, a1
= 0, a0
= 0;
1075 if (unlikely(c
& 64)) {
1077 a3
= a2
, a2
= a1
, a1
= a0
, a0
= 0;
1084 sticky
= a0
| a1
| a2
| a3
;
1085 a0
= a1
= a2
= a3
= 0;
1090 sticky
|= a3
<< invc
;
1091 a3
= (a3
>> c
) | (a2
<< invc
);
1092 a2
= (a2
>> c
) | (a1
<< invc
);
1093 a1
= (a1
>> c
) | (a0
<< invc
);
1097 a
->frac_lo
= a3
| (sticky
!= 0);
1103 #define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
1105 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
1107 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
1110 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
1113 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1114 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1118 static bool frac256_sub(FloatParts256
*r
, FloatParts256
*a
, FloatParts256
*b
)
1121 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
1122 r
->frac_lm
= usub64_borrow(a
->frac_lm
, b
->frac_lm
, &c
);
1123 r
->frac_hm
= usub64_borrow(a
->frac_hm
, b
->frac_hm
, &c
);
1124 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
1128 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)
1130 static void frac64_truncjam(FloatParts64
*r
, FloatParts128
*a
)
1132 r
->frac
= a
->frac_hi
| (a
->frac_lo
!= 0);
1135 static void frac128_truncjam(FloatParts128
*r
, FloatParts256
*a
)
1137 r
->frac_hi
= a
->frac_hi
;
1138 r
->frac_lo
= a
->frac_hm
| ((a
->frac_lm
| a
->frac_lo
) != 0);
1141 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A)
1143 static void frac64_widen(FloatParts128
*r
, FloatParts64
*a
)
1145 r
->frac_hi
= a
->frac
;
1149 static void frac128_widen(FloatParts256
*r
, FloatParts128
*a
)
1151 r
->frac_hi
= a
->frac_hi
;
1152 r
->frac_hm
= a
->frac_lo
;
1157 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B)
1159 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
1160 #define FloatPartsN glue(FloatParts,N)
1161 #define FloatPartsW glue(FloatParts,W)
1166 #include "softfloat-parts-addsub.c.inc"
1167 #include "softfloat-parts.c.inc"
1174 #include "softfloat-parts-addsub.c.inc"
1175 #include "softfloat-parts.c.inc"
1181 #include "softfloat-parts-addsub.c.inc"
1190 * Pack/unpack routines with a specific FloatFmt.
/*
 * Unpack the raw float16 F into *P and canonicalize it, using PARAMS to
 * select the half-precision format (IEEE vs ARM-alternative — see callers).
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}
1200 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
1203 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
1206 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
1209 bfloat16_unpack_raw(p
, f
);
1210 parts_canonicalize(p
, s
, &bfloat16_params
);
1213 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1215 const FloatFmt
*params
)
1217 parts_uncanon(p
, s
, params
);
1218 return float16_pack_raw(p
);
1221 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1224 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1227 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1230 parts_uncanon(p
, s
, &bfloat16_params
);
1231 return bfloat16_pack_raw(p
);
1234 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1237 float32_unpack_raw(p
, f
);
1238 parts_canonicalize(p
, s
, &float32_params
);
1241 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1244 parts_uncanon(p
, s
, &float32_params
);
1245 return float32_pack_raw(p
);
1248 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1251 float64_unpack_raw(p
, f
);
1252 parts_canonicalize(p
, s
, &float64_params
);
1255 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1258 parts_uncanon(p
, s
, &float64_params
);
1259 return float64_pack_raw(p
);
1262 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1265 float128_unpack_raw(p
, f
);
1266 parts_canonicalize(p
, s
, &float128_params
);
1269 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1272 parts_uncanon(p
, s
, &float128_params
);
1273 return float128_pack_raw(p
);
1277 * Addition and subtraction
1280 static float16 QEMU_FLATTEN
1281 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1283 FloatParts64 pa
, pb
, *pr
;
1285 float16_unpack_canonical(&pa
, a
, status
);
1286 float16_unpack_canonical(&pb
, b
, status
);
1287 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1289 return float16_round_pack_canonical(pr
, status
);
/* float16 addition; forwards to float16_addsub with subtract=false. */
float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}
/* float16 subtraction; forwards to float16_addsub with subtract=true. */
float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}
1302 static float32 QEMU_SOFTFLOAT_ATTR
1303 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1305 FloatParts64 pa
, pb
, *pr
;
1307 float32_unpack_canonical(&pa
, a
, status
);
1308 float32_unpack_canonical(&pb
, b
, status
);
1309 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1311 return float32_round_pack_canonical(pr
, status
);
/* Softfloat float32 addition; forwards to soft_f32_addsub. */
static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}
/* Softfloat float32 subtraction; forwards to soft_f32_addsub. */
static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}
1324 static float64 QEMU_SOFTFLOAT_ATTR
1325 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1327 FloatParts64 pa
, pb
, *pr
;
1329 float64_unpack_canonical(&pa
, a
, status
);
1330 float64_unpack_canonical(&pb
, b
, status
);
1331 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1333 return float64_round_pack_canonical(pr
, status
);
/* Softfloat float64 addition; forwards to soft_f64_addsub. */
static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}
/* Softfloat float64 subtraction; forwards to soft_f64_addsub. */
static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}
1346 static float hard_f32_add(float a
, float b
)
1351 static float hard_f32_sub(float a
, float b
)
1356 static double hard_f64_add(double a
, double b
)
1361 static double hard_f64_sub(double a
, double b
)
1366 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1368 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1369 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1371 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1374 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1376 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1377 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1379 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
/*
 * Dispatch float32 add/sub through float32_gen2 with the shared
 * add/sub/mul operand checks (HARD is tried when the checks pass,
 * otherwise SOFT is used).
 */
static float32
float32_addsub(float32 a, float32 b, float_status *s,
               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}
/*
 * Dispatch float64 add/sub through float64_gen2 with the shared
 * add/sub/mul operand checks.
 */
static float64
float64_addsub(float64 a, float64 b, float_status *s,
               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
/* float32 addition, with hardfloat fast path. */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}
/* float32 subtraction, with hardfloat fast path. */
float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}
/* float64 addition, with hardfloat fast path. */
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}
/* float64 subtraction, with hardfloat fast path. */
float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1421 static bfloat16 QEMU_FLATTEN
1422 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1424 FloatParts64 pa
, pb
, *pr
;
1426 bfloat16_unpack_canonical(&pa
, a
, status
);
1427 bfloat16_unpack_canonical(&pb
, b
, status
);
1428 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1430 return bfloat16_round_pack_canonical(pr
, status
);
/* bfloat16 addition; forwards to bfloat16_addsub. */
bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}
/* bfloat16 subtraction; forwards to bfloat16_addsub. */
bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}
1443 static float128 QEMU_FLATTEN
1444 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1446 FloatParts128 pa
, pb
, *pr
;
1448 float128_unpack_canonical(&pa
, a
, status
);
1449 float128_unpack_canonical(&pb
, b
, status
);
1450 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1452 return float128_round_pack_canonical(pr
, status
);
/* float128 addition; forwards to float128_addsub. */
float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}
/* float128 subtraction; forwards to float128_addsub. */
float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}
1469 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1471 FloatParts64 pa
, pb
, *pr
;
1473 float16_unpack_canonical(&pa
, a
, status
);
1474 float16_unpack_canonical(&pb
, b
, status
);
1475 pr
= parts_mul(&pa
, &pb
, status
);
1477 return float16_round_pack_canonical(pr
, status
);
1480 static float32 QEMU_SOFTFLOAT_ATTR
1481 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1483 FloatParts64 pa
, pb
, *pr
;
1485 float32_unpack_canonical(&pa
, a
, status
);
1486 float32_unpack_canonical(&pb
, b
, status
);
1487 pr
= parts_mul(&pa
, &pb
, status
);
1489 return float32_round_pack_canonical(pr
, status
);
1492 static float64 QEMU_SOFTFLOAT_ATTR
1493 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1495 FloatParts64 pa
, pb
, *pr
;
1497 float64_unpack_canonical(&pa
, a
, status
);
1498 float64_unpack_canonical(&pb
, b
, status
);
1499 pr
= parts_mul(&pa
, &pb
, status
);
1501 return float64_round_pack_canonical(pr
, status
);
1504 static float hard_f32_mul(float a
, float b
)
1509 static double hard_f64_mul(double a
, double b
)
/* float32 multiply: hardfloat fast path via float32_gen2, shared
 * add/sub/mul operand checks. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}
/* float64 multiply: hardfloat fast path via float64_gen2, shared
 * add/sub/mul operand checks. */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1528 bfloat16 QEMU_FLATTEN
1529 bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1531 FloatParts64 pa
, pb
, *pr
;
1533 bfloat16_unpack_canonical(&pa
, a
, status
);
1534 bfloat16_unpack_canonical(&pb
, b
, status
);
1535 pr
= parts_mul(&pa
, &pb
, status
);
1537 return bfloat16_round_pack_canonical(pr
, status
);
1540 float128 QEMU_FLATTEN
1541 float128_mul(float128 a
, float128 b
, float_status
*status
)
1543 FloatParts128 pa
, pb
, *pr
;
1545 float128_unpack_canonical(&pa
, a
, status
);
1546 float128_unpack_canonical(&pb
, b
, status
);
1547 pr
= parts_mul(&pa
, &pb
, status
);
1549 return float128_round_pack_canonical(pr
, status
);
1553 * Fused multiply-add
1556 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1557 int flags
, float_status
*status
)
1559 FloatParts64 pa
, pb
, pc
, *pr
;
1561 float16_unpack_canonical(&pa
, a
, status
);
1562 float16_unpack_canonical(&pb
, b
, status
);
1563 float16_unpack_canonical(&pc
, c
, status
);
1564 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1566 return float16_round_pack_canonical(pr
, status
);
1569 static float32 QEMU_SOFTFLOAT_ATTR
1570 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1571 float_status
*status
)
1573 FloatParts64 pa
, pb
, pc
, *pr
;
1575 float32_unpack_canonical(&pa
, a
, status
);
1576 float32_unpack_canonical(&pb
, b
, status
);
1577 float32_unpack_canonical(&pc
, c
, status
);
1578 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1580 return float32_round_pack_canonical(pr
, status
);
1583 static float64 QEMU_SOFTFLOAT_ATTR
1584 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1585 float_status
*status
)
1587 FloatParts64 pa
, pb
, pc
, *pr
;
1589 float64_unpack_canonical(&pa
, a
, status
);
1590 float64_unpack_canonical(&pb
, b
, status
);
1591 float64_unpack_canonical(&pc
, c
, status
);
1592 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1594 return float64_round_pack_canonical(pr
, status
);
/*
 * When set, always take the softfloat FMA path.
 * NOTE(review): not written anywhere in this chunk — presumably a
 * test/benchmark knob; confirm where it is assigned.
 */
static bool force_soft_fma;
1599 float32 QEMU_FLATTEN
1600 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1602 union_float32 ua
, ub
, uc
, ur
;
1608 if (unlikely(!can_use_fpu(s
))) {
1611 if (unlikely(flags
& float_muladd_halve_result
)) {
1615 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1616 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1620 if (unlikely(force_soft_fma
)) {
1625 * When (a || b) == 0, there's no need to check for under/over flow,
1626 * since we know the addend is (normal || 0) and the product is 0.
1628 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1632 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1633 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1634 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1636 if (flags
& float_muladd_negate_c
) {
1641 union_float32 ua_orig
= ua
;
1642 union_float32 uc_orig
= uc
;
1644 if (flags
& float_muladd_negate_product
) {
1647 if (flags
& float_muladd_negate_c
) {
1651 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1653 if (unlikely(f32_is_inf(ur
))) {
1654 float_raise(float_flag_overflow
, s
);
1655 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1661 if (flags
& float_muladd_negate_result
) {
1662 return float32_chs(ur
.s
);
1667 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1670 float64 QEMU_FLATTEN
1671 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1673 union_float64 ua
, ub
, uc
, ur
;
1679 if (unlikely(!can_use_fpu(s
))) {
1682 if (unlikely(flags
& float_muladd_halve_result
)) {
1686 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1687 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1691 if (unlikely(force_soft_fma
)) {
1696 * When (a || b) == 0, there's no need to check for under/over flow,
1697 * since we know the addend is (normal || 0) and the product is 0.
1699 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1703 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1704 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1705 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1707 if (flags
& float_muladd_negate_c
) {
1712 union_float64 ua_orig
= ua
;
1713 union_float64 uc_orig
= uc
;
1715 if (flags
& float_muladd_negate_product
) {
1718 if (flags
& float_muladd_negate_c
) {
1722 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1724 if (unlikely(f64_is_inf(ur
))) {
1725 float_raise(float_flag_overflow
, s
);
1726 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1732 if (flags
& float_muladd_negate_result
) {
1733 return float64_chs(ur
.s
);
1738 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1741 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1742 int flags
, float_status
*status
)
1744 FloatParts64 pa
, pb
, pc
, *pr
;
1746 bfloat16_unpack_canonical(&pa
, a
, status
);
1747 bfloat16_unpack_canonical(&pb
, b
, status
);
1748 bfloat16_unpack_canonical(&pc
, c
, status
);
1749 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1751 return bfloat16_round_pack_canonical(pr
, status
);
1754 float128 QEMU_FLATTEN
float128_muladd(float128 a
, float128 b
, float128 c
,
1755 int flags
, float_status
*status
)
1757 FloatParts128 pa
, pb
, pc
, *pr
;
1759 float128_unpack_canonical(&pa
, a
, status
);
1760 float128_unpack_canonical(&pb
, b
, status
);
1761 float128_unpack_canonical(&pc
, c
, status
);
1762 pr
= parts_muladd(&pa
, &pb
, &pc
, flags
, status
);
1764 return float128_round_pack_canonical(pr
, status
);
1768 * Returns the result of dividing the floating-point value `a' by the
1769 * corresponding value `b'. The operation is performed according to
1770 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1773 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1775 bool sign
= a
.sign
^ b
.sign
;
1777 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1778 uint64_t n0
, n1
, q
, r
;
1779 int exp
= a
.exp
- b
.exp
;
1782 * We want a 2*N / N-bit division to produce exactly an N-bit
1783 * result, so that we do not lose any precision and so that we
1784 * do not have to renormalize afterward. If A.frac < B.frac,
1785 * then division would produce an (N-1)-bit result; shift A left
1786 * by one to produce the an N-bit result, and decrement the
1787 * exponent to match.
1789 * The udiv_qrnnd algorithm that we're using requires normalization,
1790 * i.e. the msb of the denominator must be set, which is already true.
1792 if (a
.frac
< b
.frac
) {
1794 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1796 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1798 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1800 /* Set lsb if there is a remainder, to set inexact. */
1801 a
.frac
= q
| (r
!= 0);
1806 /* handle all the NaN cases */
1807 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1808 return *parts_pick_nan(&a
, &b
, s
);
1810 /* 0/0 or Inf/Inf */
1813 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1814 float_raise(float_flag_invalid
, s
);
1815 parts_default_nan(&a
, s
);
1818 /* Inf / x or 0 / x */
1819 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1824 if (b
.cls
== float_class_zero
) {
1825 float_raise(float_flag_divbyzero
, s
);
1826 a
.cls
= float_class_inf
;
1831 if (b
.cls
== float_class_inf
) {
1832 a
.cls
= float_class_zero
;
1836 g_assert_not_reached();
1839 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1841 FloatParts64 pa
, pb
, pr
;
1843 float16_unpack_canonical(&pa
, a
, status
);
1844 float16_unpack_canonical(&pb
, b
, status
);
1845 pr
= div_floats(pa
, pb
, status
);
1847 return float16_round_pack_canonical(&pr
, status
);
1850 static float32 QEMU_SOFTFLOAT_ATTR
1851 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1853 FloatParts64 pa
, pb
, pr
;
1855 float32_unpack_canonical(&pa
, a
, status
);
1856 float32_unpack_canonical(&pb
, b
, status
);
1857 pr
= div_floats(pa
, pb
, status
);
1859 return float32_round_pack_canonical(&pr
, status
);
1862 static float64 QEMU_SOFTFLOAT_ATTR
1863 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1865 FloatParts64 pa
, pb
, pr
;
1867 float64_unpack_canonical(&pa
, a
, status
);
1868 float64_unpack_canonical(&pb
, b
, status
);
1869 pr
= div_floats(pa
, pb
, status
);
1871 return float64_round_pack_canonical(&pr
, status
);
1874 static float hard_f32_div(float a
, float b
)
1879 static double hard_f64_div(double a
, double b
)
1884 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1886 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1887 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1888 fpclassify(b
.h
) == FP_NORMAL
;
1890 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1893 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1895 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1896 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1897 fpclassify(b
.h
) == FP_NORMAL
;
1899 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1902 static bool f32_div_post(union_float32 a
, union_float32 b
)
1904 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1905 return fpclassify(a
.h
) != FP_ZERO
;
1907 return !float32_is_zero(a
.s
);
1910 static bool f64_div_post(union_float64 a
, union_float64 b
)
1912 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1913 return fpclassify(a
.h
) != FP_ZERO
;
1915 return !float64_is_zero(a
.s
);
/* float32 division: hardfloat fast path via float32_gen2 with the
 * division-specific pre/post checks. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}
/* float64 division: hardfloat fast path via float64_gen2 with the
 * division-specific pre/post checks. */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}
1933 * Returns the result of dividing the bfloat16
1934 * value `a' by the corresponding value `b'.
1937 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1939 FloatParts64 pa
, pb
, pr
;
1941 bfloat16_unpack_canonical(&pa
, a
, status
);
1942 bfloat16_unpack_canonical(&pb
, b
, status
);
1943 pr
= div_floats(pa
, pb
, status
);
1945 return bfloat16_round_pack_canonical(&pr
, status
);
1949 * Float to Float conversions
1951 * Returns the result of converting one float format to another. The
1952 * conversion is performed according to the IEC/IEEE Standard for
1953 * Binary Floating-Point Arithmetic.
1955 * The float_to_float helper only needs to take care of raising
1956 * invalid exceptions and handling the conversion on NaNs.
1959 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1962 if (dstf
->arm_althp
) {
1964 case float_class_qnan
:
1965 case float_class_snan
:
1966 /* There is no NaN in the destination format. Raise Invalid
1967 * and return a zero with the sign of the input NaN.
1969 float_raise(float_flag_invalid
, s
);
1970 a
.cls
= float_class_zero
;
1975 case float_class_inf
:
1976 /* There is no Inf in the destination format. Raise Invalid
1977 * and return the maximum normal with the correct sign.
1979 float_raise(float_flag_invalid
, s
);
1980 a
.cls
= float_class_normal
;
1981 a
.exp
= dstf
->exp_max
;
1982 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1988 } else if (is_nan(a
.cls
)) {
1989 parts_return_nan(&a
, s
);
1994 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1996 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1997 FloatParts64 pa
, pr
;
1999 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2000 pr
= float_to_float(pa
, &float32_params
, s
);
2001 return float32_round_pack_canonical(&pr
, s
);
2004 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2006 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2007 FloatParts64 pa
, pr
;
2009 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2010 pr
= float_to_float(pa
, &float64_params
, s
);
2011 return float64_round_pack_canonical(&pr
, s
);
2014 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2016 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2017 FloatParts64 pa
, pr
;
2019 float32_unpack_canonical(&pa
, a
, s
);
2020 pr
= float_to_float(pa
, fmt16
, s
);
2021 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2024 static float64 QEMU_SOFTFLOAT_ATTR
2025 soft_float32_to_float64(float32 a
, float_status
*s
)
2027 FloatParts64 pa
, pr
;
2029 float32_unpack_canonical(&pa
, a
, s
);
2030 pr
= float_to_float(pa
, &float64_params
, s
);
2031 return float64_round_pack_canonical(&pr
, s
);
2034 float64
float32_to_float64(float32 a
, float_status
*s
)
2036 if (likely(float32_is_normal(a
))) {
2037 /* Widening conversion can never produce inexact results. */
2043 } else if (float32_is_zero(a
)) {
2044 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2046 return soft_float32_to_float64(a
, s
);
2050 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2052 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2053 FloatParts64 pa
, pr
;
2055 float64_unpack_canonical(&pa
, a
, s
);
2056 pr
= float_to_float(pa
, fmt16
, s
);
2057 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2060 float32
float64_to_float32(float64 a
, float_status
*s
)
2062 FloatParts64 pa
, pr
;
2064 float64_unpack_canonical(&pa
, a
, s
);
2065 pr
= float_to_float(pa
, &float32_params
, s
);
2066 return float32_round_pack_canonical(&pr
, s
);
2069 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2071 FloatParts64 pa
, pr
;
2073 bfloat16_unpack_canonical(&pa
, a
, s
);
2074 pr
= float_to_float(pa
, &float32_params
, s
);
2075 return float32_round_pack_canonical(&pr
, s
);
2078 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2080 FloatParts64 pa
, pr
;
2082 bfloat16_unpack_canonical(&pa
, a
, s
);
2083 pr
= float_to_float(pa
, &float64_params
, s
);
2084 return float64_round_pack_canonical(&pr
, s
);
2087 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2089 FloatParts64 pa
, pr
;
2091 float32_unpack_canonical(&pa
, a
, s
);
2092 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2093 return bfloat16_round_pack_canonical(&pr
, s
);
2096 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2098 FloatParts64 pa
, pr
;
2100 float64_unpack_canonical(&pa
, a
, s
);
2101 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2102 return bfloat16_round_pack_canonical(&pr
, s
);
2106 * Rounds the floating-point value `a' to an integer, and returns the
2107 * result as a floating-point value. The operation is performed
2108 * according to the IEC/IEEE Standard for Binary Floating-Point
2112 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2113 int scale
, float_status
*s
)
2116 case float_class_qnan
:
2117 case float_class_snan
:
2118 parts_return_nan(&a
, s
);
2121 case float_class_zero
:
2122 case float_class_inf
:
2123 /* already "integral" */
2126 case float_class_normal
:
2127 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2130 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2131 /* already integral */
2136 /* all fractional */
2137 float_raise(float_flag_inexact
, s
);
2139 case float_round_nearest_even
:
2140 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2142 case float_round_ties_away
:
2143 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2145 case float_round_to_zero
:
2148 case float_round_up
:
2151 case float_round_down
:
2154 case float_round_to_odd
:
2158 g_assert_not_reached();
2162 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2165 a
.cls
= float_class_zero
;
2168 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2169 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2170 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2171 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2175 case float_round_nearest_even
:
2176 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2178 case float_round_ties_away
:
2181 case float_round_to_zero
:
2184 case float_round_up
:
2185 inc
= a
.sign
? 0 : rnd_mask
;
2187 case float_round_down
:
2188 inc
= a
.sign
? rnd_mask
: 0;
2190 case float_round_to_odd
:
2191 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2194 g_assert_not_reached();
2197 if (a
.frac
& rnd_mask
) {
2198 float_raise(float_flag_inexact
, s
);
2199 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2201 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2204 a
.frac
&= ~rnd_mask
;
2209 g_assert_not_reached();
2214 float16
float16_round_to_int(float16 a
, float_status
*s
)
2216 FloatParts64 pa
, pr
;
2218 float16_unpack_canonical(&pa
, a
, s
);
2219 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2220 return float16_round_pack_canonical(&pr
, s
);
2223 float32
float32_round_to_int(float32 a
, float_status
*s
)
2225 FloatParts64 pa
, pr
;
2227 float32_unpack_canonical(&pa
, a
, s
);
2228 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2229 return float32_round_pack_canonical(&pr
, s
);
2232 float64
float64_round_to_int(float64 a
, float_status
*s
)
2234 FloatParts64 pa
, pr
;
2236 float64_unpack_canonical(&pa
, a
, s
);
2237 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2238 return float64_round_pack_canonical(&pr
, s
);
2242 * Rounds the bfloat16 value `a' to an integer, and returns the
2243 * result as a bfloat16 value.
2246 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2248 FloatParts64 pa
, pr
;
2250 bfloat16_unpack_canonical(&pa
, a
, s
);
2251 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2252 return bfloat16_round_pack_canonical(&pr
, s
);
2256 * Returns the result of converting the floating-point value `a' to
2257 * the two's complement integer format. The conversion is performed
2258 * according to the IEC/IEEE Standard for Binary Floating-Point
2259 * Arithmetic---which means in particular that the conversion is
2260 * rounded according to the current rounding mode. If `a' is a NaN,
2261 * the largest positive integer is returned. Otherwise, if the
2262 * conversion overflows, the largest integer with the same sign as `a'
2266 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2267 int scale
, int64_t min
, int64_t max
,
2271 int orig_flags
= get_float_exception_flags(s
);
2272 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2275 case float_class_snan
:
2276 case float_class_qnan
:
2277 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2279 case float_class_inf
:
2280 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2281 return p
.sign
? min
: max
;
2282 case float_class_zero
:
2284 case float_class_normal
:
2285 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2286 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2291 if (r
<= -(uint64_t) min
) {
2294 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2301 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2306 g_assert_not_reached();
2310 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2315 float16_unpack_canonical(&p
, a
, s
);
2316 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2319 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2324 float16_unpack_canonical(&p
, a
, s
);
2325 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2328 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2333 float16_unpack_canonical(&p
, a
, s
);
2334 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2337 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2342 float16_unpack_canonical(&p
, a
, s
);
2343 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2346 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2351 float32_unpack_canonical(&p
, a
, s
);
2352 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2355 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2360 float32_unpack_canonical(&p
, a
, s
);
2361 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2364 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2369 float32_unpack_canonical(&p
, a
, s
);
2370 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2373 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2378 float64_unpack_canonical(&p
, a
, s
);
2379 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2382 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2387 float64_unpack_canonical(&p
, a
, s
);
2388 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2391 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2396 float64_unpack_canonical(&p
, a
, s
);
2397 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
/* float16 -> int8 using the current rounding mode. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int16 using the current rounding mode. */
int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int32 using the current rounding mode. */
int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int64 using the current rounding mode. */
int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int16 using the current rounding mode. */
int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int32 using the current rounding mode. */
int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float32 -> int64 using the current rounding mode. */
int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int16 using the current rounding mode. */
int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int32 using the current rounding mode. */
int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float64 -> int64 using the current rounding mode. */
int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* float16 -> int16, rounding toward zero (truncation). */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}
2455 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2457 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2460 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2462 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2465 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2467 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2470 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2472 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2475 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2477 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2480 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2482 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2485 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2487 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2490 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2492 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the bfloat16 value `a' to
 * the two's complement integer format.
 */
2500 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2505 bfloat16_unpack_canonical(&p
, a
, s
);
2506 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2509 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2514 bfloat16_unpack_canonical(&p
, a
, s
);
2515 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2518 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2523 bfloat16_unpack_canonical(&p
, a
, s
);
2524 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2527 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2529 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2532 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2534 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2537 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2539 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2542 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2544 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2547 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2549 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2552 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2554 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  the 'a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the inexact exception
 *  flag.
 */
2570 static uint64_t round_to_uint_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2571 int scale
, uint64_t max
,
2574 int orig_flags
= get_float_exception_flags(s
);
2575 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2579 case float_class_snan
:
2580 case float_class_qnan
:
2581 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2583 case float_class_inf
:
2584 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2585 return p
.sign
? 0 : max
;
2586 case float_class_zero
:
2588 case float_class_normal
:
2590 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2594 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2595 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2597 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2601 /* For uint64 this will never trip, but if p.exp is too large
2602 * to shift a decomposed fraction we shall have exited via the
2606 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2611 g_assert_not_reached();
2615 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2620 float16_unpack_canonical(&p
, a
, s
);
2621 return round_to_uint_and_pack(p
, rmode
, scale
, UINT8_MAX
, s
);
2624 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2629 float16_unpack_canonical(&p
, a
, s
);
2630 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2633 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2638 float16_unpack_canonical(&p
, a
, s
);
2639 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2642 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2647 float16_unpack_canonical(&p
, a
, s
);
2648 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2651 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2656 float32_unpack_canonical(&p
, a
, s
);
2657 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2660 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2665 float32_unpack_canonical(&p
, a
, s
);
2666 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2669 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2674 float32_unpack_canonical(&p
, a
, s
);
2675 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2678 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2683 float64_unpack_canonical(&p
, a
, s
);
2684 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2687 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2692 float64_unpack_canonical(&p
, a
, s
);
2693 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2696 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2701 float64_unpack_canonical(&p
, a
, s
);
2702 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2705 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2707 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2710 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2712 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2715 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2717 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2720 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2722 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2725 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2727 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2730 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2732 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2735 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2737 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2740 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2742 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2745 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2747 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2750 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2752 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2755 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2757 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2760 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2762 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2765 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2767 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2770 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2772 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2775 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2777 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2780 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2782 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2785 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2787 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2790 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2792 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2795 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2797 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 *  Returns the result of converting the bfloat16 value `a' to
 *  the unsigned integer format.
 */
2805 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2806 int scale
, float_status
*s
)
2810 bfloat16_unpack_canonical(&p
, a
, s
);
2811 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2814 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2815 int scale
, float_status
*s
)
2819 bfloat16_unpack_canonical(&p
, a
, s
);
2820 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2823 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2824 int scale
, float_status
*s
)
2828 bfloat16_unpack_canonical(&p
, a
, s
);
2829 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2832 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2834 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2837 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2839 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2842 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2844 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2847 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2849 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2852 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2854 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2857 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2859 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
2870 static FloatParts64
int_to_float(int64_t a
, int scale
, float_status
*status
)
2872 FloatParts64 r
= { .sign
= false };
2875 r
.cls
= float_class_zero
;
2880 r
.cls
= float_class_normal
;
2886 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2888 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2889 r
.frac
= f
<< shift
;
2895 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2897 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2898 return float16_round_pack_canonical(&pa
, status
);
2901 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2903 return int64_to_float16_scalbn(a
, scale
, status
);
2906 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2908 return int64_to_float16_scalbn(a
, scale
, status
);
2911 float16
int64_to_float16(int64_t a
, float_status
*status
)
2913 return int64_to_float16_scalbn(a
, 0, status
);
2916 float16
int32_to_float16(int32_t a
, float_status
*status
)
2918 return int64_to_float16_scalbn(a
, 0, status
);
2921 float16
int16_to_float16(int16_t a
, float_status
*status
)
2923 return int64_to_float16_scalbn(a
, 0, status
);
2926 float16
int8_to_float16(int8_t a
, float_status
*status
)
2928 return int64_to_float16_scalbn(a
, 0, status
);
2931 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2933 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2934 return float32_round_pack_canonical(&pa
, status
);
2937 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2939 return int64_to_float32_scalbn(a
, scale
, status
);
2942 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2944 return int64_to_float32_scalbn(a
, scale
, status
);
2947 float32
int64_to_float32(int64_t a
, float_status
*status
)
2949 return int64_to_float32_scalbn(a
, 0, status
);
2952 float32
int32_to_float32(int32_t a
, float_status
*status
)
2954 return int64_to_float32_scalbn(a
, 0, status
);
2957 float32
int16_to_float32(int16_t a
, float_status
*status
)
2959 return int64_to_float32_scalbn(a
, 0, status
);
2962 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2964 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2965 return float64_round_pack_canonical(&pa
, status
);
2968 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2970 return int64_to_float64_scalbn(a
, scale
, status
);
2973 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2975 return int64_to_float64_scalbn(a
, scale
, status
);
2978 float64
int64_to_float64(int64_t a
, float_status
*status
)
2980 return int64_to_float64_scalbn(a
, 0, status
);
2983 float64
int32_to_float64(int32_t a
, float_status
*status
)
2985 return int64_to_float64_scalbn(a
, 0, status
);
2988 float64
int16_to_float64(int16_t a
, float_status
*status
)
2990 return int64_to_float64_scalbn(a
, 0, status
);
/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */
2998 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3000 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3001 return bfloat16_round_pack_canonical(&pa
, status
);
3004 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3006 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3009 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3011 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3014 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3016 return int64_to_bfloat16_scalbn(a
, 0, status
);
3019 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3021 return int64_to_bfloat16_scalbn(a
, 0, status
);
3024 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3026 return int64_to_bfloat16_scalbn(a
, 0, status
);
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
3037 static FloatParts64
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
3039 FloatParts64 r
= { .sign
= false };
3043 r
.cls
= float_class_zero
;
3045 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
3047 r
.cls
= float_class_normal
;
3048 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
3049 r
.frac
= a
<< shift
;
3055 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3057 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3058 return float16_round_pack_canonical(&pa
, status
);
3061 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3063 return uint64_to_float16_scalbn(a
, scale
, status
);
3066 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3068 return uint64_to_float16_scalbn(a
, scale
, status
);
3071 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3073 return uint64_to_float16_scalbn(a
, 0, status
);
3076 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3078 return uint64_to_float16_scalbn(a
, 0, status
);
3081 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3083 return uint64_to_float16_scalbn(a
, 0, status
);
3086 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3088 return uint64_to_float16_scalbn(a
, 0, status
);
3091 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3093 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3094 return float32_round_pack_canonical(&pa
, status
);
3097 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3099 return uint64_to_float32_scalbn(a
, scale
, status
);
3102 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3104 return uint64_to_float32_scalbn(a
, scale
, status
);
3107 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3109 return uint64_to_float32_scalbn(a
, 0, status
);
3112 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3114 return uint64_to_float32_scalbn(a
, 0, status
);
3117 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3119 return uint64_to_float32_scalbn(a
, 0, status
);
3122 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3124 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3125 return float64_round_pack_canonical(&pa
, status
);
3128 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3130 return uint64_to_float64_scalbn(a
, scale
, status
);
3133 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3135 return uint64_to_float64_scalbn(a
, scale
, status
);
3138 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3140 return uint64_to_float64_scalbn(a
, 0, status
);
3143 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3145 return uint64_to_float64_scalbn(a
, 0, status
);
3148 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3150 return uint64_to_float64_scalbn(a
, 0, status
);
/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */
3158 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3160 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3161 return bfloat16_round_pack_canonical(&pa
, status
);
3164 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3166 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3169 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3171 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3174 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3176 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3179 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3181 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3184 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3186 return uint64_to_bfloat16_scalbn(a
, 0, status
);
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
3205 static FloatParts64
minmax_floats(FloatParts64 a
, FloatParts64 b
, bool ismin
,
3206 bool ieee
, bool ismag
, float_status
*s
)
3208 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3210 /* Takes two floating-point values `a' and `b', one of
3211 * which is a NaN, and returns the appropriate NaN
3212 * result. If either `a' or `b' is a signaling NaN,
3213 * the invalid exception is raised.
3215 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3216 return *parts_pick_nan(&a
, &b
, s
);
3217 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3219 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3223 return *parts_pick_nan(&a
, &b
, s
);
3228 case float_class_normal
:
3231 case float_class_inf
:
3234 case float_class_zero
:
3238 g_assert_not_reached();
3242 case float_class_normal
:
3245 case float_class_inf
:
3248 case float_class_zero
:
3252 g_assert_not_reached();
3256 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3257 bool a_less
= a_exp
< b_exp
;
3258 if (a_exp
== b_exp
) {
3259 a_less
= a
.frac
< b
.frac
;
3261 return a_less
^ ismin
? b
: a
;
3264 if (a
.sign
== b
.sign
) {
3265 bool a_less
= a_exp
< b_exp
;
3266 if (a_exp
== b_exp
) {
3267 a_less
= a
.frac
< b
.frac
;
3269 return a
.sign
^ a_less
^ ismin
? b
: a
;
3271 return a
.sign
^ ismin
? b
: a
;
3276 #define MINMAX(sz, name, ismin, isiee, ismag) \
3277 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3280 FloatParts64 pa, pb, pr; \
3281 float ## sz ## _unpack_canonical(&pa, a, s); \
3282 float ## sz ## _unpack_canonical(&pb, b, s); \
3283 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3284 return float ## sz ## _round_pack_canonical(&pr, s); \
3287 MINMAX(16, min
, true, false, false)
3288 MINMAX(16, minnum
, true, true, false)
3289 MINMAX(16, minnummag
, true, true, true)
3290 MINMAX(16, max
, false, false, false)
3291 MINMAX(16, maxnum
, false, true, false)
3292 MINMAX(16, maxnummag
, false, true, true)
3294 MINMAX(32, min
, true, false, false)
3295 MINMAX(32, minnum
, true, true, false)
3296 MINMAX(32, minnummag
, true, true, true)
3297 MINMAX(32, max
, false, false, false)
3298 MINMAX(32, maxnum
, false, true, false)
3299 MINMAX(32, maxnummag
, false, true, true)
3301 MINMAX(64, min
, true, false, false)
3302 MINMAX(64, minnum
, true, true, false)
3303 MINMAX(64, minnummag
, true, true, true)
3304 MINMAX(64, max
, false, false, false)
3305 MINMAX(64, maxnum
, false, true, false)
3306 MINMAX(64, maxnummag
, false, true, true)
3310 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3311 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3313 FloatParts64 pa, pb, pr; \
3314 bfloat16_unpack_canonical(&pa, a, s); \
3315 bfloat16_unpack_canonical(&pb, b, s); \
3316 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3317 return bfloat16_round_pack_canonical(&pr, s); \
3320 BF16_MINMAX(min
, true, false, false)
3321 BF16_MINMAX(minnum
, true, true, false)
3322 BF16_MINMAX(minnummag
, true, true, true)
3323 BF16_MINMAX(max
, false, false, false)
3324 BF16_MINMAX(maxnum
, false, true, false)
3325 BF16_MINMAX(maxnummag
, false, true, true)
3329 /* Floating point compare */
3330 static FloatRelation
compare_floats(FloatParts64 a
, FloatParts64 b
, bool is_quiet
,
3333 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3335 a
.cls
== float_class_snan
||
3336 b
.cls
== float_class_snan
) {
3337 float_raise(float_flag_invalid
, s
);
3339 return float_relation_unordered
;
3342 if (a
.cls
== float_class_zero
) {
3343 if (b
.cls
== float_class_zero
) {
3344 return float_relation_equal
;
3346 return b
.sign
? float_relation_greater
: float_relation_less
;
3347 } else if (b
.cls
== float_class_zero
) {
3348 return a
.sign
? float_relation_less
: float_relation_greater
;
3351 /* The only really important thing about infinity is its sign. If
3352 * both are infinities the sign marks the smallest of the two.
3354 if (a
.cls
== float_class_inf
) {
3355 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3356 return float_relation_equal
;
3358 return a
.sign
? float_relation_less
: float_relation_greater
;
3359 } else if (b
.cls
== float_class_inf
) {
3360 return b
.sign
? float_relation_greater
: float_relation_less
;
3363 if (a
.sign
!= b
.sign
) {
3364 return a
.sign
? float_relation_less
: float_relation_greater
;
3367 if (a
.exp
== b
.exp
) {
3368 if (a
.frac
== b
.frac
) {
3369 return float_relation_equal
;
3372 return a
.frac
> b
.frac
?
3373 float_relation_less
: float_relation_greater
;
3375 return a
.frac
> b
.frac
?
3376 float_relation_greater
: float_relation_less
;
3380 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3382 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3387 #define COMPARE(name, attr, sz) \
3389 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3391 FloatParts64 pa, pb; \
3392 float ## sz ## _unpack_canonical(&pa, a, s); \
3393 float ## sz ## _unpack_canonical(&pb, b, s); \
3394 return compare_floats(pa, pb, is_quiet, s); \
3397 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3398 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3399 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3403 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3405 return soft_f16_compare(a
, b
, false, s
);
3408 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3410 return soft_f16_compare(a
, b
, true, s
);
3413 static FloatRelation QEMU_FLATTEN
3414 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3416 union_float32 ua
, ub
;
3421 if (QEMU_NO_HARDFLOAT
) {
3425 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3426 if (isgreaterequal(ua
.h
, ub
.h
)) {
3427 if (isgreater(ua
.h
, ub
.h
)) {
3428 return float_relation_greater
;
3430 return float_relation_equal
;
3432 if (likely(isless(ua
.h
, ub
.h
))) {
3433 return float_relation_less
;
3435 /* The only condition remaining is unordered.
3436 * Fall through to set flags.
3439 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3442 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3444 return f32_compare(a
, b
, false, s
);
3447 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3449 return f32_compare(a
, b
, true, s
);
3452 static FloatRelation QEMU_FLATTEN
3453 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3455 union_float64 ua
, ub
;
3460 if (QEMU_NO_HARDFLOAT
) {
3464 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3465 if (isgreaterequal(ua
.h
, ub
.h
)) {
3466 if (isgreater(ua
.h
, ub
.h
)) {
3467 return float_relation_greater
;
3469 return float_relation_equal
;
3471 if (likely(isless(ua
.h
, ub
.h
))) {
3472 return float_relation_less
;
3474 /* The only condition remaining is unordered.
3475 * Fall through to set flags.
3478 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3481 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3483 return f64_compare(a
, b
, false, s
);
3486 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3488 return f64_compare(a
, b
, true, s
);
3491 static FloatRelation QEMU_FLATTEN
3492 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3494 FloatParts64 pa
, pb
;
3496 bfloat16_unpack_canonical(&pa
, a
, s
);
3497 bfloat16_unpack_canonical(&pb
, b
, s
);
3498 return compare_floats(pa
, pb
, is_quiet
, s
);
3501 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3503 return soft_bf16_compare(a
, b
, false, s
);
3506 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3508 return soft_bf16_compare(a
, b
, true, s
);
3511 /* Multiply A by 2 raised to the power N. */
3512 static FloatParts64
scalbn_decomposed(FloatParts64 a
, int n
, float_status
*s
)
3514 if (unlikely(is_nan(a
.cls
))) {
3515 parts_return_nan(&a
, s
);
3517 if (a
.cls
== float_class_normal
) {
3518 /* The largest float type (even though not supported by FloatParts64)
3519 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3520 * still allows rounding to infinity, without allowing overflow
3521 * within the int32_t that backs FloatParts64.exp.
3523 n
= MIN(MAX(n
, -0x10000), 0x10000);
3529 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3531 FloatParts64 pa
, pr
;
3533 float16_unpack_canonical(&pa
, a
, status
);
3534 pr
= scalbn_decomposed(pa
, n
, status
);
3535 return float16_round_pack_canonical(&pr
, status
);
3538 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3540 FloatParts64 pa
, pr
;
3542 float32_unpack_canonical(&pa
, a
, status
);
3543 pr
= scalbn_decomposed(pa
, n
, status
);
3544 return float32_round_pack_canonical(&pr
, status
);
3547 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3549 FloatParts64 pa
, pr
;
3551 float64_unpack_canonical(&pa
, a
, status
);
3552 pr
= scalbn_decomposed(pa
, n
, status
);
3553 return float64_round_pack_canonical(&pr
, status
);
3556 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3558 FloatParts64 pa
, pr
;
3560 bfloat16_unpack_canonical(&pa
, a
, status
);
3561 pr
= scalbn_decomposed(pa
, n
, status
);
3562 return bfloat16_round_pack_canonical(&pr
, status
);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */
3577 static FloatParts64
sqrt_float(FloatParts64 a
, float_status
*s
, const FloatFmt
*p
)
3579 uint64_t a_frac
, r_frac
, s_frac
;
3582 if (is_nan(a
.cls
)) {
3583 parts_return_nan(&a
, s
);
3586 if (a
.cls
== float_class_zero
) {
3587 return a
; /* sqrt(+-0) = +-0 */
3590 float_raise(float_flag_invalid
, s
);
3591 parts_default_nan(&a
, s
);
3594 if (a
.cls
== float_class_inf
) {
3595 return a
; /* sqrt(+inf) = +inf */
3598 assert(a
.cls
== float_class_normal
);
3600 /* We need two overflow bits at the top. Adding room for that is a
3601 * right shift. If the exponent is odd, we can discard the low bit
3602 * by multiplying the fraction by 2; that's a left shift. Combine
3603 * those and we shift right by 1 if the exponent is odd, otherwise 2.
3605 a_frac
= a
.frac
>> (2 - (a
.exp
& 1));
3608 /* Bit-by-bit computation of sqrt. */
3612 /* Iterate from implicit bit down to the 3 extra bits to compute a
3613 * properly rounded result. Remember we've inserted two more bits
3614 * at the top, so these positions are two less.
3616 bit
= DECOMPOSED_BINARY_POINT
- 2;
3617 last_bit
= MAX(p
->frac_shift
- 4, 0);
3619 uint64_t q
= 1ULL << bit
;
3620 uint64_t t_frac
= s_frac
+ q
;
3621 if (t_frac
<= a_frac
) {
3622 s_frac
= t_frac
+ q
;
3627 } while (--bit
>= last_bit
);
3629 /* Undo the right shift done above. If there is any remaining
3630 * fraction, the result is inexact. Set the sticky bit.
3632 a
.frac
= (r_frac
<< 2) + (a_frac
!= 0);
3637 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3639 FloatParts64 pa
, pr
;
3641 float16_unpack_canonical(&pa
, a
, status
);
3642 pr
= sqrt_float(pa
, status
, &float16_params
);
3643 return float16_round_pack_canonical(&pr
, status
);
3646 static float32 QEMU_SOFTFLOAT_ATTR
3647 soft_f32_sqrt(float32 a
, float_status
*status
)
3649 FloatParts64 pa
, pr
;
3651 float32_unpack_canonical(&pa
, a
, status
);
3652 pr
= sqrt_float(pa
, status
, &float32_params
);
3653 return float32_round_pack_canonical(&pr
, status
);
3656 static float64 QEMU_SOFTFLOAT_ATTR
3657 soft_f64_sqrt(float64 a
, float_status
*status
)
3659 FloatParts64 pa
, pr
;
3661 float64_unpack_canonical(&pa
, a
, status
);
3662 pr
= sqrt_float(pa
, status
, &float64_params
);
3663 return float64_round_pack_canonical(&pr
, status
);
/*
 * Single-precision sqrt with a hardfloat fast path: when the host FPU can
 * be trusted (can_use_fpu) and the input is a non-negative zero or normal
 * number, use the host's sqrtf(); otherwise fall back to soft-float so
 * that flags and special values are handled exactly.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        /* Only +zero / +normal take the fast path; NaN, inf, denormal and
         * negative inputs need the soft path for correct flag behaviour. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
/*
 * Double-precision sqrt with a hardfloat fast path; mirrors float32_sqrt.
 * Non-negative zero/normal inputs use the host's sqrt(), everything else
 * goes through the soft-float implementation.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        /* Only +zero / +normal take the fast path; NaN, inf, denormal and
         * negative inputs need the soft path for correct flag behaviour. */
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3720 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3722 FloatParts64 pa
, pr
;
3724 bfloat16_unpack_canonical(&pa
, a
, status
);
3725 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3726 return bfloat16_round_pack_canonical(&pr
, status
);
3729 /*----------------------------------------------------------------------------
3730 | The pattern for a default generated NaN.
3731 *----------------------------------------------------------------------------*/
3733 float16
float16_default_nan(float_status
*status
)
3737 parts_default_nan(&p
, status
);
3738 p
.frac
>>= float16_params
.frac_shift
;
3739 return float16_pack_raw(&p
);
3742 float32
float32_default_nan(float_status
*status
)
3746 parts_default_nan(&p
, status
);
3747 p
.frac
>>= float32_params
.frac_shift
;
3748 return float32_pack_raw(&p
);
3751 float64
float64_default_nan(float_status
*status
)
3755 parts_default_nan(&p
, status
);
3756 p
.frac
>>= float64_params
.frac_shift
;
3757 return float64_pack_raw(&p
);
3760 float128
float128_default_nan(float_status
*status
)
3764 parts_default_nan(&p
, status
);
3765 frac_shr(&p
, float128_params
.frac_shift
);
3766 return float128_pack_raw(&p
);
3769 bfloat16
bfloat16_default_nan(float_status
*status
)
3773 parts_default_nan(&p
, status
);
3774 p
.frac
>>= bfloat16_params
.frac_shift
;
3775 return bfloat16_pack_raw(&p
);
3778 /*----------------------------------------------------------------------------
3779 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3780 *----------------------------------------------------------------------------*/
3782 float16
float16_silence_nan(float16 a
, float_status
*status
)
3786 float16_unpack_raw(&p
, a
);
3787 p
.frac
<<= float16_params
.frac_shift
;
3788 parts_silence_nan(&p
, status
);
3789 p
.frac
>>= float16_params
.frac_shift
;
3790 return float16_pack_raw(&p
);
3793 float32
float32_silence_nan(float32 a
, float_status
*status
)
3797 float32_unpack_raw(&p
, a
);
3798 p
.frac
<<= float32_params
.frac_shift
;
3799 parts_silence_nan(&p
, status
);
3800 p
.frac
>>= float32_params
.frac_shift
;
3801 return float32_pack_raw(&p
);
3804 float64
float64_silence_nan(float64 a
, float_status
*status
)
3808 float64_unpack_raw(&p
, a
);
3809 p
.frac
<<= float64_params
.frac_shift
;
3810 parts_silence_nan(&p
, status
);
3811 p
.frac
>>= float64_params
.frac_shift
;
3812 return float64_pack_raw(&p
);
3815 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3819 bfloat16_unpack_raw(&p
, a
);
3820 p
.frac
<<= bfloat16_params
.frac_shift
;
3821 parts_silence_nan(&p
, status
);
3822 p
.frac
>>= bfloat16_params
.frac_shift
;
3823 return bfloat16_pack_raw(&p
);
3826 float128
float128_silence_nan(float128 a
, float_status
*status
)
3830 float128_unpack_raw(&p
, a
);
3831 frac_shl(&p
, float128_params
.frac_shift
);
3832 parts_silence_nan(&p
, status
);
3833 frac_shr(&p
, float128_params
.frac_shift
);
3834 return float128_pack_raw(&p
);
3837 /*----------------------------------------------------------------------------
3838 | If `a' is denormal and we are in flush-to-zero mode then set the
3839 | input-denormal exception and return zero. Otherwise just return the value.
3840 *----------------------------------------------------------------------------*/
3842 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3844 if (p
.exp
== 0 && p
.frac
!= 0) {
3845 float_raise(float_flag_input_denormal
, status
);
3852 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3854 if (status
->flush_inputs_to_zero
) {
3857 float16_unpack_raw(&p
, a
);
3858 if (parts_squash_denormal(p
, status
)) {
3859 return float16_set_sign(float16_zero
, p
.sign
);
3865 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3867 if (status
->flush_inputs_to_zero
) {
3870 float32_unpack_raw(&p
, a
);
3871 if (parts_squash_denormal(p
, status
)) {
3872 return float32_set_sign(float32_zero
, p
.sign
);
3878 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3880 if (status
->flush_inputs_to_zero
) {
3883 float64_unpack_raw(&p
, a
);
3884 if (parts_squash_denormal(p
, status
)) {
3885 return float64_set_sign(float64_zero
, p
.sign
);
3891 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3893 if (status
->flush_inputs_to_zero
) {
3896 bfloat16_unpack_raw(&p
, a
);
3897 if (parts_squash_denormal(p
, status
)) {
3898 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3904 /*----------------------------------------------------------------------------
3905 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3906 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3907 | input. If `zSign' is 1, the input is negated before being converted to an
3908 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3909 | is simply rounded to an integer, with the inexact exception raised if the
3910 | input cannot be represented exactly as an integer. However, if the fixed-
3911 | point input is too large, the invalid exception is raised and the largest
3912 | positive or negative integer is returned.
3913 *----------------------------------------------------------------------------*/
static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the value to add below the binary point (bits 6..0). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;      /* half an ulp */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result bit would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if any bits above 32, or if the sign flipped on negation. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
3963 /*----------------------------------------------------------------------------
3964 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3965 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3966 | and returns the properly rounded 64-bit integer corresponding to the input.
3967 | If `zSign' is 1, the input is negated before being converted to an integer.
3968 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3969 | the inexact exception raised if the input cannot be represented exactly as
3970 | an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/
static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the bits below the binary point; decide whether the
     * integer part absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);  /* top fraction bit set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;    /* wrapped past 2^64 */
        /* Ties-to-even: clear the low bit on an exact half-way case. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Overflow if the sign flipped the wrong way on negation. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;
}
4025 /*----------------------------------------------------------------------------
4026 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4027 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4028 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4029 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4030 | with the inexact exception raised if the input cannot be represented exactly
4031 | as an integer. However, if the fixed-point input is too large, the invalid
4032 | exception is raised and the largest unsigned integer is returned.
4033 *----------------------------------------------------------------------------*/
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the bits below the binary point; decide whether the
     * integer part absZ0 is bumped by one. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);   /* top fraction bit set */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Wrapped past 2^64: saturate to the largest unsigned value. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Ties-to-even: clear the low bit on an exact half-way case. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}
4085 /*----------------------------------------------------------------------------
4086 | Normalizes the subnormal single-precision floating-point value represented
4087 | by the denormalized significand `aSig'. The normalized exponent and
4088 | significand are stored at the locations pointed to by `zExpPtr' and
4089 | `zSigPtr', respectively.
4090 *----------------------------------------------------------------------------*/
4093 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
4097 shiftCount
= clz32(aSig
) - 8;
4098 *zSigPtr
= aSig
<<shiftCount
;
4099 *zExpPtr
= 1 - shiftCount
;
4103 /*----------------------------------------------------------------------------
4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4105 | and significand `zSig', and returns the proper single-precision floating-
4106 | point value corresponding to the abstract input. Ordinarily, the abstract
4107 | value is simply rounded and packed into the single-precision format, with
4108 | the inexact exception raised if the abstract input cannot be represented
4109 | exactly. However, if the abstract value is too large, the overflow and
4110 | inexact exceptions are raised and an infinity or maximal finite value is
4111 | returned. If the abstract value is too small, the input value is rounded to
4112 | a subnormal number, and the underflow and inexact exceptions are raised if
4113 | the abstract input cannot be represented exactly as a subnormal single-
4114 | precision floating-point number.
4115 | The input significand `zSig' has its binary point between bits 30
4116 | and 29, which is 7 bits to the left of the usual location. This shifted
4117 | significand must be normalized or smaller. If `zSig' is not normalized,
4118 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4119 | and it must not require rounding. In the usual case that `zSig' is
4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4122 | Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Value added below the 7 round bits (bits 6..0 of zSig). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    if ( 0xFD <= (uint16_t) zExp ) {        /* exponent out of normal range */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and truncating modes yield the
             * largest finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess detected before rounding, or clearly below the
             * smallest normal even after the increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );
}
4202 /*----------------------------------------------------------------------------
4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4204 | and significand `zSig', and returns the proper single-precision floating-
4205 | point value corresponding to the abstract input. This routine is just like
4206 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4207 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4208 | floating-point exponent.
4209 *----------------------------------------------------------------------------*/
4212 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4213 float_status
*status
)
4217 shiftCount
= clz32(zSig
) - 1;
4218 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4223 /*----------------------------------------------------------------------------
4224 | Normalizes the subnormal double-precision floating-point value represented
4225 | by the denormalized significand `aSig'. The normalized exponent and
4226 | significand are stored at the locations pointed to by `zExpPtr' and
4227 | `zSigPtr', respectively.
4228 *----------------------------------------------------------------------------*/
4231 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4235 shiftCount
= clz64(aSig
) - 11;
4236 *zSigPtr
= aSig
<<shiftCount
;
4237 *zExpPtr
= 1 - shiftCount
;
4241 /*----------------------------------------------------------------------------
4242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4243 | double-precision floating-point value, returning the result. After being
4244 | shifted into the proper positions, the three fields are simply added
4245 | together to form the result. This means that any integer portion of `zSig'
4246 | will be added into the exponent. Since a properly normalized significand
4247 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4248 | than the desired result exponent whenever `zSig' is a complete, normalized
4250 *----------------------------------------------------------------------------*/
4252 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4255 return make_float64(
4256 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4260 /*----------------------------------------------------------------------------
4261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4262 | and significand `zSig', and returns the proper double-precision floating-
4263 | point value corresponding to the abstract input. Ordinarily, the abstract
4264 | value is simply rounded and packed into the double-precision format, with
4265 | the inexact exception raised if the abstract input cannot be represented
4266 | exactly. However, if the abstract value is too large, the overflow and
4267 | inexact exceptions are raised and an infinity or maximal finite value is
4268 | returned. If the abstract value is too small, the input value is rounded to
4269 | a subnormal number, and the underflow and inexact exceptions are raised if
4270 | the abstract input cannot be represented exactly as a subnormal double-
4271 | precision floating-point number.
4272 | The input significand `zSig' has its binary point between bits 62
4273 | and 61, which is 10 bits to the left of the usual location. This shifted
4274 | significand must be normalized or smaller. If `zSig' is not normalized,
4275 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4276 | and it must not require rounding. In the usual case that `zSig' is
4277 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4278 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4279 | Binary Floating-Point Arithmetic.
4280 *----------------------------------------------------------------------------*/
static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Value added below the 10 round bits (bits 9..0 of zSig). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    if ( 0x7FD <= (uint16_t) zExp ) {       /* exponent out of normal range */
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and truncating modes yield the
             * largest finite value instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected before rounding, or clearly below the
             * smallest normal even after the increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: clear the low bit on an exact half-way case. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );
}
4358 /*----------------------------------------------------------------------------
4359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4360 | and significand `zSig', and returns the proper double-precision floating-
4361 | point value corresponding to the abstract input. This routine is just like
4362 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4364 | floating-point exponent.
4365 *----------------------------------------------------------------------------*/
4368 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4369 float_status
*status
)
4373 shiftCount
= clz64(zSig
) - 1;
4374 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4379 /*----------------------------------------------------------------------------
4380 | Normalizes the subnormal extended double-precision floating-point value
4381 | represented by the denormalized significand `aSig'. The normalized exponent
4382 | and significand are stored at the locations pointed to by `zExpPtr' and
4383 | `zSigPtr', respectively.
4384 *----------------------------------------------------------------------------*/
4386 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4391 shiftCount
= clz64(aSig
);
4392 *zSigPtr
= aSig
<<shiftCount
;
4393 *zExpPtr
= 1 - shiftCount
;
4396 /*----------------------------------------------------------------------------
4397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4398 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4399 | and returns the proper extended double-precision floating-point value
4400 | corresponding to the abstract input. Ordinarily, the abstract value is
4401 | rounded and packed into the extended double-precision format, with the
4402 | inexact exception raised if the abstract input cannot be represented
4403 | exactly. However, if the abstract value is too large, the overflow and
4404 | inexact exceptions are raised and an infinity or maximal finite value is
4405 | returned. If the abstract value is too small, the input value is rounded to
4406 | a subnormal number, and the underflow and inexact exceptions are raised if
4407 | the abstract input cannot be represented exactly as a subnormal extended
4408 | double-precision floating-point number.
4409 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4410 | number of bits as single or double precision, respectively. Otherwise, the
4411 | result is rounded to the full precision of the extended double-precision
4413 | The input significand must be normalized or smaller. If the input
4414 | significand is not normalized, `zExp' must be 0; in that case, the result
4415 | returned is a subnormal number, and it must not require rounding. The
4416 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4417 | Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select the rounding position: full 80-bit precision, or reduced
     * 64-/32-bit significand precision (x87 precision control). */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold any zSig1 bits into the sticky (jam) bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {  /* exponent out of range */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to clear the result LSB too. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Carry out of the significand: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes saturate to the largest finite value. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-derive the increment: zSig1 changed in the shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even on an exact half-way case. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Ties-to-even on an exact half-way case. */
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );
}
4608 /*----------------------------------------------------------------------------
4609 | Takes an abstract floating-point value having sign `zSign', exponent
4610 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4611 | and returns the proper extended double-precision floating-point value
4612 | corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/
floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word and adjust zExp. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Left-justify the 128-bit significand and compensate the exponent. */
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);
}
4637 /*----------------------------------------------------------------------------
4638 | Returns the least-significant 64 fraction bits of the quadruple-precision
4639 | floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac1( float128 a )
{
    /* The least-significant 64 fraction bits are the entire low word. */
    return a.low;
}
4649 /*----------------------------------------------------------------------------
4650 | Returns the most-significant 48 fraction bits of the quadruple-precision
4651 | floating-point value `a'.
4652 *----------------------------------------------------------------------------*/
static inline uint64_t extractFloat128Frac0( float128 a )
{
    /* The upper 48 fraction bits occupy the low 48 bits of the high word. */
    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
}
4661 /*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/
static inline int32_t extractFloat128Exp( float128 a )
{
    /* The 15-bit biased exponent sits in bits 62..48 of the high word. */
    return ( a.high>>48 ) & 0x7FFF;
}
4673 /*----------------------------------------------------------------------------
4674 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4675 *----------------------------------------------------------------------------*/
static inline bool extractFloat128Sign(float128 a)
{
    /* The sign is the top bit of the high word. */
    return a.high >> 63;
}
4682 /*----------------------------------------------------------------------------
4683 | Normalizes the subnormal quadruple-precision floating-point value
4684 | represented by the denormalized significand formed by the concatenation of
4685 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4686 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4687 | significand are stored at the location pointed to by `zSig0Ptr', and the
4688 | least significant 64 bits of the normalized significand are stored at the
4689 | location pointed to by `zSig1Ptr'.
4690 *----------------------------------------------------------------------------*/
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* All significant bits are in the low word; it must be shifted up
         * past the 64-bit boundary into the high word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* aSig1's leading bit lands inside the high word's 48-bit
             * fraction field; split it across both words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }
}
4723 /*----------------------------------------------------------------------------
4724 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4725 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4726 | floating-point value, returning the result. After being shifted into the
4727 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4728 | added together to form the most significant 32 bits of the result. This
4729 | means that any integer portion of `zSig0' will be added into the exponent.
4730 | Since a properly normalized significand will have an integer portion equal
4731 | to 1, the `zExp' input should be 1 less than the desired result exponent
4732 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4734 *----------------------------------------------------------------------------*/
4736 static inline float128
4737 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4742 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4746 /*----------------------------------------------------------------------------
4747 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4748 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4749 | and `zSig2', and returns the proper quadruple-precision floating-point value
4750 | corresponding to the abstract input. Ordinarily, the abstract value is
4751 | simply rounded and packed into the quadruple-precision format, with the
4752 | inexact exception raised if the abstract input cannot be represented
4753 | exactly. However, if the abstract value is too large, the overflow and
4754 | inexact exceptions are raised and an infinity or maximal finite value is
4755 | returned. If the abstract value is too small, the input value is rounded to
4756 | a subnormal number, and the underflow and inexact exceptions are raised if
4757 | the abstract input cannot be represented exactly as a subnormal quadruple-
4758 | precision floating-point number.
4759 | The input significand must be normalized or smaller. If the input
4760 | significand is not normalized, `zExp' must be 0; in that case, the result
4761 | returned is a subnormal number, and it must not require rounding. In the
4762 | usual case that the input significand is normalized, `zExp' must be 1 less
4763 | than the ``true'' floating-point exponent. The handling of underflow and
4764 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4767 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4768 uint64_t zSig0
, uint64_t zSig1
,
4769 uint64_t zSig2
, float_status
*status
)
4771 int8_t roundingMode
;
4772 bool roundNearestEven
, increment
, isTiny
;
4774 roundingMode
= status
->float_rounding_mode
;
4775 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4776 switch (roundingMode
) {
4777 case float_round_nearest_even
:
4778 case float_round_ties_away
:
4779 increment
= ((int64_t)zSig2
< 0);
4781 case float_round_to_zero
:
4784 case float_round_up
:
4785 increment
= !zSign
&& zSig2
;
4787 case float_round_down
:
4788 increment
= zSign
&& zSig2
;
4790 case float_round_to_odd
:
4791 increment
= !(zSig1
& 0x1) && zSig2
;
4796 if ( 0x7FFD <= (uint32_t) zExp
) {
4797 if ( ( 0x7FFD < zExp
)
4798 || ( ( zExp
== 0x7FFD )
4800 UINT64_C(0x0001FFFFFFFFFFFF),
4801 UINT64_C(0xFFFFFFFFFFFFFFFF),
4808 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4809 if ( ( roundingMode
== float_round_to_zero
)
4810 || ( zSign
&& ( roundingMode
== float_round_up
) )
4811 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4812 || (roundingMode
== float_round_to_odd
)
4818 UINT64_C(0x0000FFFFFFFFFFFF),
4819 UINT64_C(0xFFFFFFFFFFFFFFFF)
4822 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4825 if (status
->flush_to_zero
) {
4826 float_raise(float_flag_output_denormal
, status
);
4827 return packFloat128(zSign
, 0, 0, 0);
4829 isTiny
= status
->tininess_before_rounding
4832 || lt128(zSig0
, zSig1
,
4833 UINT64_C(0x0001FFFFFFFFFFFF),
4834 UINT64_C(0xFFFFFFFFFFFFFFFF));
4835 shift128ExtraRightJamming(
4836 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4838 if (isTiny
&& zSig2
) {
4839 float_raise(float_flag_underflow
, status
);
4841 switch (roundingMode
) {
4842 case float_round_nearest_even
:
4843 case float_round_ties_away
:
4844 increment
= ((int64_t)zSig2
< 0);
4846 case float_round_to_zero
:
4849 case float_round_up
:
4850 increment
= !zSign
&& zSig2
;
4852 case float_round_down
:
4853 increment
= zSign
&& zSig2
;
4855 case float_round_to_odd
:
4856 increment
= !(zSig1
& 0x1) && zSig2
;
4864 float_raise(float_flag_inexact
, status
);
4867 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4868 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4873 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4875 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4879 /*----------------------------------------------------------------------------
4880 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4881 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4882 | returns the proper quadruple-precision floating-point value corresponding
4883 | to the abstract input. This routine is just like `roundAndPackFloat128'
4884 | except that the input significand has fewer bits and does not have to be
4885 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4887 *----------------------------------------------------------------------------*/
4889 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4890 uint64_t zSig0
, uint64_t zSig1
,
4891 float_status
*status
)
4901 shiftCount
= clz64(zSig0
) - 15;
4902 if ( 0 <= shiftCount
) {
4904 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4907 shift128ExtraRightJamming(
4908 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4911 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the 32-bit two's complement integer `a'
4918 | to the extended double-precision floating-point format. The conversion
4919 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4921 *----------------------------------------------------------------------------*/
4923 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4930 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4932 absA
= zSign
? - a
: a
;
4933 shiftCount
= clz32(absA
) + 32;
4935 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of converting the 32-bit two's complement integer `a' to
4941 | the quadruple-precision floating-point format. The conversion is performed
4942 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4945 float128
int32_to_float128(int32_t a
, float_status
*status
)
4952 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4954 absA
= zSign
? - a
: a
;
4955 shiftCount
= clz32(absA
) + 17;
4957 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4961 /*----------------------------------------------------------------------------
4962 | Returns the result of converting the 64-bit two's complement integer `a'
4963 | to the extended double-precision floating-point format. The conversion
4964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4966 *----------------------------------------------------------------------------*/
4968 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4974 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4976 absA
= zSign
? - a
: a
;
4977 shiftCount
= clz64(absA
);
4978 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of converting the 64-bit two's complement integer `a' to
4984 | the quadruple-precision floating-point format. The conversion is performed
4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 float128
int64_to_float128(int64_t a
, float_status
*status
)
4994 uint64_t zSig0
, zSig1
;
4996 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4998 absA
= zSign
? - a
: a
;
4999 shiftCount
= clz64(absA
) + 49;
5000 zExp
= 0x406E - shiftCount
;
5001 if ( 64 <= shiftCount
) {
5010 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
5011 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of converting the 64-bit unsigned integer `a'
5017 | to the quadruple-precision floating-point format. The conversion is performed
5018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5021 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
5024 return float128_zero
;
5026 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
5029 /*----------------------------------------------------------------------------
5030 | Returns the result of converting the single-precision floating-point value
5031 | `a' to the extended double-precision floating-point format. The conversion
5032 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5034 *----------------------------------------------------------------------------*/
5036 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5042 a
= float32_squash_input_denormal(a
, status
);
5043 aSig
= extractFloat32Frac( a
);
5044 aExp
= extractFloat32Exp( a
);
5045 aSign
= extractFloat32Sign( a
);
5046 if ( aExp
== 0xFF ) {
5048 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5050 return floatx80_silence_nan(res
, status
);
5052 return packFloatx80(aSign
,
5053 floatx80_infinity_high
,
5054 floatx80_infinity_low
);
5057 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5058 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5061 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5065 /*----------------------------------------------------------------------------
5066 | Returns the result of converting the single-precision floating-point value
5067 | `a' to the double-precision floating-point format. The conversion is
5068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5070 *----------------------------------------------------------------------------*/
5072 float128
float32_to_float128(float32 a
, float_status
*status
)
5078 a
= float32_squash_input_denormal(a
, status
);
5079 aSig
= extractFloat32Frac( a
);
5080 aExp
= extractFloat32Exp( a
);
5081 aSign
= extractFloat32Sign( a
);
5082 if ( aExp
== 0xFF ) {
5084 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5086 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5089 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5090 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5093 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5097 /*----------------------------------------------------------------------------
5098 | Returns the remainder of the single-precision floating-point value `a'
5099 | with respect to the corresponding value `b'. The operation is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5103 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5106 int aExp
, bExp
, expDiff
;
5107 uint32_t aSig
, bSig
;
5109 uint64_t aSig64
, bSig64
, q64
;
5110 uint32_t alternateASig
;
5112 a
= float32_squash_input_denormal(a
, status
);
5113 b
= float32_squash_input_denormal(b
, status
);
5115 aSig
= extractFloat32Frac( a
);
5116 aExp
= extractFloat32Exp( a
);
5117 aSign
= extractFloat32Sign( a
);
5118 bSig
= extractFloat32Frac( b
);
5119 bExp
= extractFloat32Exp( b
);
5120 if ( aExp
== 0xFF ) {
5121 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5122 return propagateFloat32NaN(a
, b
, status
);
5124 float_raise(float_flag_invalid
, status
);
5125 return float32_default_nan(status
);
5127 if ( bExp
== 0xFF ) {
5129 return propagateFloat32NaN(a
, b
, status
);
5135 float_raise(float_flag_invalid
, status
);
5136 return float32_default_nan(status
);
5138 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5141 if ( aSig
== 0 ) return a
;
5142 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5144 expDiff
= aExp
- bExp
;
5147 if ( expDiff
< 32 ) {
5150 if ( expDiff
< 0 ) {
5151 if ( expDiff
< -1 ) return a
;
5154 q
= ( bSig
<= aSig
);
5155 if ( q
) aSig
-= bSig
;
5156 if ( 0 < expDiff
) {
5157 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5160 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5168 if ( bSig
<= aSig
) aSig
-= bSig
;
5169 aSig64
= ( (uint64_t) aSig
)<<40;
5170 bSig64
= ( (uint64_t) bSig
)<<40;
5172 while ( 0 < expDiff
) {
5173 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5174 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5175 aSig64
= - ( ( bSig
* q64
)<<38 );
5179 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5180 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5181 q
= q64
>>( 64 - expDiff
);
5183 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5186 alternateASig
= aSig
;
5189 } while ( 0 <= (int32_t) aSig
);
5190 sigMean
= aSig
+ alternateASig
;
5191 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5192 aSig
= alternateASig
;
5194 zSign
= ( (int32_t) aSig
< 0 );
5195 if ( zSign
) aSig
= - aSig
;
5196 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5201 /*----------------------------------------------------------------------------
5202 | Returns the binary exponential of the single-precision floating-point value
5203 | `a'. The operation is performed according to the IEC/IEEE Standard for
5204 | Binary Floating-Point Arithmetic.
5206 | Uses the following identities:
5208 | 1. -------------------------------------------------------------------------
5212 | 2. -------------------------------------------------------------------------
5215 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5217 *----------------------------------------------------------------------------*/
5219 static const float64 float32_exp2_coefficients
[15] =
5221 const_float64( 0x3ff0000000000000ll
), /* 1 */
5222 const_float64( 0x3fe0000000000000ll
), /* 2 */
5223 const_float64( 0x3fc5555555555555ll
), /* 3 */
5224 const_float64( 0x3fa5555555555555ll
), /* 4 */
5225 const_float64( 0x3f81111111111111ll
), /* 5 */
5226 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5227 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5228 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5229 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5230 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5231 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5232 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5233 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5234 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5235 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5238 float32
float32_exp2(float32 a
, float_status
*status
)
5245 a
= float32_squash_input_denormal(a
, status
);
5247 aSig
= extractFloat32Frac( a
);
5248 aExp
= extractFloat32Exp( a
);
5249 aSign
= extractFloat32Sign( a
);
5251 if ( aExp
== 0xFF) {
5253 return propagateFloat32NaN(a
, float32_zero
, status
);
5255 return (aSign
) ? float32_zero
: a
;
5258 if (aSig
== 0) return float32_one
;
5261 float_raise(float_flag_inexact
, status
);
5263 /* ******************************* */
5264 /* using float64 for approximation */
5265 /* ******************************* */
5266 x
= float32_to_float64(a
, status
);
5267 x
= float64_mul(x
, float64_ln2
, status
);
5271 for (i
= 0 ; i
< 15 ; i
++) {
5274 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5275 r
= float64_add(r
, f
, status
);
5277 xn
= float64_mul(xn
, x
, status
);
5280 return float64_to_float32(r
, status
);
5283 /*----------------------------------------------------------------------------
5284 | Returns the binary log of the single-precision floating-point value `a'.
5285 | The operation is performed according to the IEC/IEEE Standard for Binary
5286 | Floating-Point Arithmetic.
5287 *----------------------------------------------------------------------------*/
5288 float32
float32_log2(float32 a
, float_status
*status
)
5292 uint32_t aSig
, zSig
, i
;
5294 a
= float32_squash_input_denormal(a
, status
);
5295 aSig
= extractFloat32Frac( a
);
5296 aExp
= extractFloat32Exp( a
);
5297 aSign
= extractFloat32Sign( a
);
5300 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5301 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5304 float_raise(float_flag_invalid
, status
);
5305 return float32_default_nan(status
);
5307 if ( aExp
== 0xFF ) {
5309 return propagateFloat32NaN(a
, float32_zero
, status
);
5319 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5320 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5321 if ( aSig
& 0x01000000 ) {
5330 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5333 /*----------------------------------------------------------------------------
5334 | Returns the result of converting the double-precision floating-point value
5335 | `a' to the extended double-precision floating-point format. The conversion
5336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338 *----------------------------------------------------------------------------*/
5340 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5346 a
= float64_squash_input_denormal(a
, status
);
5347 aSig
= extractFloat64Frac( a
);
5348 aExp
= extractFloat64Exp( a
);
5349 aSign
= extractFloat64Sign( a
);
5350 if ( aExp
== 0x7FF ) {
5352 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5354 return floatx80_silence_nan(res
, status
);
5356 return packFloatx80(aSign
,
5357 floatx80_infinity_high
,
5358 floatx80_infinity_low
);
5361 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5362 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5366 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of converting the double-precision floating-point value
5372 | `a' to the quadruple-precision floating-point format. The conversion is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5375 *----------------------------------------------------------------------------*/
5377 float128
float64_to_float128(float64 a
, float_status
*status
)
5381 uint64_t aSig
, zSig0
, zSig1
;
5383 a
= float64_squash_input_denormal(a
, status
);
5384 aSig
= extractFloat64Frac( a
);
5385 aExp
= extractFloat64Exp( a
);
5386 aSign
= extractFloat64Sign( a
);
5387 if ( aExp
== 0x7FF ) {
5389 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5391 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5394 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5395 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5398 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5399 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5404 /*----------------------------------------------------------------------------
5405 | Returns the remainder of the double-precision floating-point value `a'
5406 | with respect to the corresponding value `b'. The operation is performed
5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5410 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5413 int aExp
, bExp
, expDiff
;
5414 uint64_t aSig
, bSig
;
5415 uint64_t q
, alternateASig
;
5418 a
= float64_squash_input_denormal(a
, status
);
5419 b
= float64_squash_input_denormal(b
, status
);
5420 aSig
= extractFloat64Frac( a
);
5421 aExp
= extractFloat64Exp( a
);
5422 aSign
= extractFloat64Sign( a
);
5423 bSig
= extractFloat64Frac( b
);
5424 bExp
= extractFloat64Exp( b
);
5425 if ( aExp
== 0x7FF ) {
5426 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5427 return propagateFloat64NaN(a
, b
, status
);
5429 float_raise(float_flag_invalid
, status
);
5430 return float64_default_nan(status
);
5432 if ( bExp
== 0x7FF ) {
5434 return propagateFloat64NaN(a
, b
, status
);
5440 float_raise(float_flag_invalid
, status
);
5441 return float64_default_nan(status
);
5443 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5446 if ( aSig
== 0 ) return a
;
5447 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5449 expDiff
= aExp
- bExp
;
5450 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5451 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5452 if ( expDiff
< 0 ) {
5453 if ( expDiff
< -1 ) return a
;
5456 q
= ( bSig
<= aSig
);
5457 if ( q
) aSig
-= bSig
;
5459 while ( 0 < expDiff
) {
5460 q
= estimateDiv128To64( aSig
, 0, bSig
);
5461 q
= ( 2 < q
) ? q
- 2 : 0;
5462 aSig
= - ( ( bSig
>>2 ) * q
);
5466 if ( 0 < expDiff
) {
5467 q
= estimateDiv128To64( aSig
, 0, bSig
);
5468 q
= ( 2 < q
) ? q
- 2 : 0;
5471 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5478 alternateASig
= aSig
;
5481 } while ( 0 <= (int64_t) aSig
);
5482 sigMean
= aSig
+ alternateASig
;
5483 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5484 aSig
= alternateASig
;
5486 zSign
= ( (int64_t) aSig
< 0 );
5487 if ( zSign
) aSig
= - aSig
;
5488 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5492 /*----------------------------------------------------------------------------
5493 | Returns the binary log of the double-precision floating-point value `a'.
5494 | The operation is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
5497 float64
float64_log2(float64 a
, float_status
*status
)
5501 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5502 a
= float64_squash_input_denormal(a
, status
);
5504 aSig
= extractFloat64Frac( a
);
5505 aExp
= extractFloat64Exp( a
);
5506 aSign
= extractFloat64Sign( a
);
5509 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5510 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5513 float_raise(float_flag_invalid
, status
);
5514 return float64_default_nan(status
);
5516 if ( aExp
== 0x7FF ) {
5518 return propagateFloat64NaN(a
, float64_zero
, status
);
5524 aSig
|= UINT64_C(0x0010000000000000);
5526 zSig
= (uint64_t)aExp
<< 52;
5527 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5528 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5529 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5530 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5538 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5541 /*----------------------------------------------------------------------------
5542 | Returns the result of converting the extended double-precision floating-
5543 | point value `a' to the 32-bit two's complement integer format. The
5544 | conversion is performed according to the IEC/IEEE Standard for Binary
5545 | Floating-Point Arithmetic---which means in particular that the conversion
5546 | is rounded according to the current rounding mode. If `a' is a NaN, the
5547 | largest positive integer is returned. Otherwise, if the conversion
5548 | overflows, the largest integer with the same sign as `a' is returned.
5549 *----------------------------------------------------------------------------*/
5551 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5554 int32_t aExp
, shiftCount
;
5557 if (floatx80_invalid_encoding(a
)) {
5558 float_raise(float_flag_invalid
, status
);
5561 aSig
= extractFloatx80Frac( a
);
5562 aExp
= extractFloatx80Exp( a
);
5563 aSign
= extractFloatx80Sign( a
);
5564 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5565 shiftCount
= 0x4037 - aExp
;
5566 if ( shiftCount
<= 0 ) shiftCount
= 1;
5567 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5568 return roundAndPackInt32(aSign
, aSig
, status
);
5572 /*----------------------------------------------------------------------------
5573 | Returns the result of converting the extended double-precision floating-
5574 | point value `a' to the 32-bit two's complement integer format. The
5575 | conversion is performed according to the IEC/IEEE Standard for Binary
5576 | Floating-Point Arithmetic, except that the conversion is always rounded
5577 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5578 | Otherwise, if the conversion overflows, the largest integer with the same
5579 | sign as `a' is returned.
5580 *----------------------------------------------------------------------------*/
5582 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5585 int32_t aExp
, shiftCount
;
5586 uint64_t aSig
, savedASig
;
5589 if (floatx80_invalid_encoding(a
)) {
5590 float_raise(float_flag_invalid
, status
);
5593 aSig
= extractFloatx80Frac( a
);
5594 aExp
= extractFloatx80Exp( a
);
5595 aSign
= extractFloatx80Sign( a
);
5596 if ( 0x401E < aExp
) {
5597 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5600 else if ( aExp
< 0x3FFF ) {
5602 float_raise(float_flag_inexact
, status
);
5606 shiftCount
= 0x403E - aExp
;
5608 aSig
>>= shiftCount
;
5610 if ( aSign
) z
= - z
;
5611 if ( ( z
< 0 ) ^ aSign
) {
5613 float_raise(float_flag_invalid
, status
);
5614 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5616 if ( ( aSig
<<shiftCount
) != savedASig
) {
5617 float_raise(float_flag_inexact
, status
);
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the extended double-precision floating-
5625 | point value `a' to the 64-bit two's complement integer format. The
5626 | conversion is performed according to the IEC/IEEE Standard for Binary
5627 | Floating-Point Arithmetic---which means in particular that the conversion
5628 | is rounded according to the current rounding mode. If `a' is a NaN,
5629 | the largest positive integer is returned. Otherwise, if the conversion
5630 | overflows, the largest integer with the same sign as `a' is returned.
5631 *----------------------------------------------------------------------------*/
5633 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5636 int32_t aExp
, shiftCount
;
5637 uint64_t aSig
, aSigExtra
;
5639 if (floatx80_invalid_encoding(a
)) {
5640 float_raise(float_flag_invalid
, status
);
5643 aSig
= extractFloatx80Frac( a
);
5644 aExp
= extractFloatx80Exp( a
);
5645 aSign
= extractFloatx80Sign( a
);
5646 shiftCount
= 0x403E - aExp
;
5647 if ( shiftCount
<= 0 ) {
5649 float_raise(float_flag_invalid
, status
);
5650 if (!aSign
|| floatx80_is_any_nan(a
)) {
5658 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5660 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of converting the extended double-precision floating-
5666 | point value `a' to the 64-bit two's complement integer format. The
5667 | conversion is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic, except that the conversion is always rounded
5669 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5670 | Otherwise, if the conversion overflows, the largest integer with the same
5671 | sign as `a' is returned.
5672 *----------------------------------------------------------------------------*/
5674 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5677 int32_t aExp
, shiftCount
;
5681 if (floatx80_invalid_encoding(a
)) {
5682 float_raise(float_flag_invalid
, status
);
5685 aSig
= extractFloatx80Frac( a
);
5686 aExp
= extractFloatx80Exp( a
);
5687 aSign
= extractFloatx80Sign( a
);
5688 shiftCount
= aExp
- 0x403E;
5689 if ( 0 <= shiftCount
) {
5690 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5691 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5692 float_raise(float_flag_invalid
, status
);
5693 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5699 else if ( aExp
< 0x3FFF ) {
5701 float_raise(float_flag_inexact
, status
);
5705 z
= aSig
>>( - shiftCount
);
5706 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5707 float_raise(float_flag_inexact
, status
);
5709 if ( aSign
) z
= - z
;
5714 /*----------------------------------------------------------------------------
5715 | Returns the result of converting the extended double-precision floating-
5716 | point value `a' to the single-precision floating-point format. The
5717 | conversion is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5721 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5727 if (floatx80_invalid_encoding(a
)) {
5728 float_raise(float_flag_invalid
, status
);
5729 return float32_default_nan(status
);
5731 aSig
= extractFloatx80Frac( a
);
5732 aExp
= extractFloatx80Exp( a
);
5733 aSign
= extractFloatx80Sign( a
);
5734 if ( aExp
== 0x7FFF ) {
5735 if ( (uint64_t) ( aSig
<<1 ) ) {
5736 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5738 return float32_silence_nan(res
, status
);
5740 return packFloat32( aSign
, 0xFF, 0 );
5742 shift64RightJamming( aSig
, 33, &aSig
);
5743 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5744 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of converting the extended double-precision floating-
5750 | point value `a' to the double-precision floating-point format. The
5751 | conversion is performed according to the IEC/IEEE Standard for Binary
5752 | Floating-Point Arithmetic.
5753 *----------------------------------------------------------------------------*/
5755 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5759 uint64_t aSig
, zSig
;
5761 if (floatx80_invalid_encoding(a
)) {
5762 float_raise(float_flag_invalid
, status
);
5763 return float64_default_nan(status
);
5765 aSig
= extractFloatx80Frac( a
);
5766 aExp
= extractFloatx80Exp( a
);
5767 aSign
= extractFloatx80Sign( a
);
5768 if ( aExp
== 0x7FFF ) {
5769 if ( (uint64_t) ( aSig
<<1 ) ) {
5770 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5772 return float64_silence_nan(res
, status
);
5774 return packFloat64( aSign
, 0x7FF, 0 );
5776 shift64RightJamming( aSig
, 1, &zSig
);
5777 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5778 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5782 /*----------------------------------------------------------------------------
5783 | Returns the result of converting the extended double-precision floating-
5784 | point value `a' to the quadruple-precision floating-point format. The
5785 | conversion is performed according to the IEC/IEEE Standard for Binary
5786 | Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
5789 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5793 uint64_t aSig
, zSig0
, zSig1
;
5795 if (floatx80_invalid_encoding(a
)) {
5796 float_raise(float_flag_invalid
, status
);
5797 return float128_default_nan(status
);
5799 aSig
= extractFloatx80Frac( a
);
5800 aExp
= extractFloatx80Exp( a
);
5801 aSign
= extractFloatx80Sign( a
);
5802 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5803 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5805 return float128_silence_nan(res
, status
);
5807 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5808 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5812 /*----------------------------------------------------------------------------
5813 | Rounds the extended double-precision floating-point value `a'
5814 | to the precision provided by floatx80_rounding_precision and returns the
5815 | result as an extended double-precision floating-point value.
5816 | The operation is performed according to the IEC/IEEE Standard for Binary
5817 | Floating-Point Arithmetic.
5818 *----------------------------------------------------------------------------*/
5820 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5822 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5823 extractFloatx80Sign(a
),
5824 extractFloatx80Exp(a
),
5825 extractFloatx80Frac(a
), 0, status
);
5828 /*----------------------------------------------------------------------------
5829 | Rounds the extended double-precision floating-point value `a' to an integer,
5830 | and returns the result as an extended quadruple-precision floating-point
5831 | value. The operation is performed according to the IEC/IEEE Standard for
5832 | Binary Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5835 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5839 uint64_t lastBitMask
, roundBitsMask
;
5842 if (floatx80_invalid_encoding(a
)) {
5843 float_raise(float_flag_invalid
, status
);
5844 return floatx80_default_nan(status
);
5846 aExp
= extractFloatx80Exp( a
);
5847 if ( 0x403E <= aExp
) {
5848 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5849 return propagateFloatx80NaN(a
, a
, status
);
5853 if ( aExp
< 0x3FFF ) {
5855 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5858 float_raise(float_flag_inexact
, status
);
5859 aSign
= extractFloatx80Sign( a
);
5860 switch (status
->float_rounding_mode
) {
5861 case float_round_nearest_even
:
5862 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5865 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5868 case float_round_ties_away
:
5869 if (aExp
== 0x3FFE) {
5870 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5873 case float_round_down
:
5876 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5877 : packFloatx80( 0, 0, 0 );
5878 case float_round_up
:
5880 aSign
? packFloatx80( 1, 0, 0 )
5881 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5883 case float_round_to_zero
:
5886 g_assert_not_reached();
5888 return packFloatx80( aSign
, 0, 0 );
5891 lastBitMask
<<= 0x403E - aExp
;
5892 roundBitsMask
= lastBitMask
- 1;
5894 switch (status
->float_rounding_mode
) {
5895 case float_round_nearest_even
:
5896 z
.low
+= lastBitMask
>>1;
5897 if ((z
.low
& roundBitsMask
) == 0) {
5898 z
.low
&= ~lastBitMask
;
5901 case float_round_ties_away
:
5902 z
.low
+= lastBitMask
>> 1;
5904 case float_round_to_zero
:
5906 case float_round_up
:
5907 if (!extractFloatx80Sign(z
)) {
5908 z
.low
+= roundBitsMask
;
5911 case float_round_down
:
5912 if (extractFloatx80Sign(z
)) {
5913 z
.low
+= roundBitsMask
;
5919 z
.low
&= ~ roundBitsMask
;
5922 z
.low
= UINT64_C(0x8000000000000000);
5924 if (z
.low
!= a
.low
) {
5925 float_raise(float_flag_inexact
, status
);
5931 /*----------------------------------------------------------------------------
5932 | Returns the result of adding the absolute values of the extended double-
5933 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5934 | negated before being returned. `zSign' is ignored if the result is a NaN.
5935 | The addition is performed according to the IEC/IEEE Standard for Binary
5936 | Floating-Point Arithmetic.
5937 *----------------------------------------------------------------------------*/
5939 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5940 float_status
*status
)
5942 int32_t aExp
, bExp
, zExp
;
5943 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5946 aSig
= extractFloatx80Frac( a
);
5947 aExp
= extractFloatx80Exp( a
);
5948 bSig
= extractFloatx80Frac( b
);
5949 bExp
= extractFloatx80Exp( b
);
5950 expDiff
= aExp
- bExp
;
5951 if ( 0 < expDiff
) {
5952 if ( aExp
== 0x7FFF ) {
5953 if ((uint64_t)(aSig
<< 1)) {
5954 return propagateFloatx80NaN(a
, b
, status
);
5958 if ( bExp
== 0 ) --expDiff
;
5959 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5962 else if ( expDiff
< 0 ) {
5963 if ( bExp
== 0x7FFF ) {
5964 if ((uint64_t)(bSig
<< 1)) {
5965 return propagateFloatx80NaN(a
, b
, status
);
5967 return packFloatx80(zSign
,
5968 floatx80_infinity_high
,
5969 floatx80_infinity_low
);
5971 if ( aExp
== 0 ) ++expDiff
;
5972 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5976 if ( aExp
== 0x7FFF ) {
5977 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5978 return propagateFloatx80NaN(a
, b
, status
);
5983 zSig0
= aSig
+ bSig
;
5985 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5986 /* At least one of the values is a pseudo-denormal,
5987 * and there is a carry out of the result. */
5992 return packFloatx80(zSign
, 0, 0);
5994 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
6000 zSig0
= aSig
+ bSig
;
6001 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
6003 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6004 zSig0
|= UINT64_C(0x8000000000000000);
6007 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6008 zSign
, zExp
, zSig0
, zSig1
, status
);
6011 /*----------------------------------------------------------------------------
6012 | Returns the result of subtracting the absolute values of the extended
6013 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6014 | difference is negated before being returned. `zSign' is ignored if the
6015 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6016 | Standard for Binary Floating-Point Arithmetic.
6017 *----------------------------------------------------------------------------*/
6019 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
6020 float_status
*status
)
6022 int32_t aExp
, bExp
, zExp
;
6023 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6026 aSig
= extractFloatx80Frac( a
);
6027 aExp
= extractFloatx80Exp( a
);
6028 bSig
= extractFloatx80Frac( b
);
6029 bExp
= extractFloatx80Exp( b
);
6030 expDiff
= aExp
- bExp
;
6031 if ( 0 < expDiff
) goto aExpBigger
;
6032 if ( expDiff
< 0 ) goto bExpBigger
;
6033 if ( aExp
== 0x7FFF ) {
6034 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6035 return propagateFloatx80NaN(a
, b
, status
);
6037 float_raise(float_flag_invalid
, status
);
6038 return floatx80_default_nan(status
);
6045 if ( bSig
< aSig
) goto aBigger
;
6046 if ( aSig
< bSig
) goto bBigger
;
6047 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6049 if ( bExp
== 0x7FFF ) {
6050 if ((uint64_t)(bSig
<< 1)) {
6051 return propagateFloatx80NaN(a
, b
, status
);
6053 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6054 floatx80_infinity_low
);
6056 if ( aExp
== 0 ) ++expDiff
;
6057 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6059 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6062 goto normalizeRoundAndPack
;
6064 if ( aExp
== 0x7FFF ) {
6065 if ((uint64_t)(aSig
<< 1)) {
6066 return propagateFloatx80NaN(a
, b
, status
);
6070 if ( bExp
== 0 ) --expDiff
;
6071 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6073 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6075 normalizeRoundAndPack
:
6076 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6077 zSign
, zExp
, zSig0
, zSig1
, status
);
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of adding the extended double-precision floating-point
6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6086 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6090 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6091 float_raise(float_flag_invalid
, status
);
6092 return floatx80_default_nan(status
);
6094 aSign
= extractFloatx80Sign( a
);
6095 bSign
= extractFloatx80Sign( b
);
6096 if ( aSign
== bSign
) {
6097 return addFloatx80Sigs(a
, b
, aSign
, status
);
6100 return subFloatx80Sigs(a
, b
, aSign
, status
);
6105 /*----------------------------------------------------------------------------
6106 | Returns the result of subtracting the extended double-precision floating-
6107 | point values `a' and `b'. The operation is performed according to the
6108 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6109 *----------------------------------------------------------------------------*/
6111 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6115 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6116 float_raise(float_flag_invalid
, status
);
6117 return floatx80_default_nan(status
);
6119 aSign
= extractFloatx80Sign( a
);
6120 bSign
= extractFloatx80Sign( b
);
6121 if ( aSign
== bSign
) {
6122 return subFloatx80Sigs(a
, b
, aSign
, status
);
6125 return addFloatx80Sigs(a
, b
, aSign
, status
);
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of multiplying the extended double-precision floating-
6132 | point values `a' and `b'. The operation is performed according to the
6133 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6134 *----------------------------------------------------------------------------*/
6136 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6138 bool aSign
, bSign
, zSign
;
6139 int32_t aExp
, bExp
, zExp
;
6140 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6142 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6143 float_raise(float_flag_invalid
, status
);
6144 return floatx80_default_nan(status
);
6146 aSig
= extractFloatx80Frac( a
);
6147 aExp
= extractFloatx80Exp( a
);
6148 aSign
= extractFloatx80Sign( a
);
6149 bSig
= extractFloatx80Frac( b
);
6150 bExp
= extractFloatx80Exp( b
);
6151 bSign
= extractFloatx80Sign( b
);
6152 zSign
= aSign
^ bSign
;
6153 if ( aExp
== 0x7FFF ) {
6154 if ( (uint64_t) ( aSig
<<1 )
6155 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6156 return propagateFloatx80NaN(a
, b
, status
);
6158 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6159 return packFloatx80(zSign
, floatx80_infinity_high
,
6160 floatx80_infinity_low
);
6162 if ( bExp
== 0x7FFF ) {
6163 if ((uint64_t)(bSig
<< 1)) {
6164 return propagateFloatx80NaN(a
, b
, status
);
6166 if ( ( aExp
| aSig
) == 0 ) {
6168 float_raise(float_flag_invalid
, status
);
6169 return floatx80_default_nan(status
);
6171 return packFloatx80(zSign
, floatx80_infinity_high
,
6172 floatx80_infinity_low
);
6175 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6176 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6179 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6180 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6182 zExp
= aExp
+ bExp
- 0x3FFE;
6183 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6184 if ( 0 < (int64_t) zSig0
) {
6185 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6188 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6189 zSign
, zExp
, zSig0
, zSig1
, status
);
6192 /*----------------------------------------------------------------------------
6193 | Returns the result of dividing the extended double-precision floating-point
6194 | value `a' by the corresponding value `b'. The operation is performed
6195 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6196 *----------------------------------------------------------------------------*/
6198 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6200 bool aSign
, bSign
, zSign
;
6201 int32_t aExp
, bExp
, zExp
;
6202 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6203 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6205 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6206 float_raise(float_flag_invalid
, status
);
6207 return floatx80_default_nan(status
);
6209 aSig
= extractFloatx80Frac( a
);
6210 aExp
= extractFloatx80Exp( a
);
6211 aSign
= extractFloatx80Sign( a
);
6212 bSig
= extractFloatx80Frac( b
);
6213 bExp
= extractFloatx80Exp( b
);
6214 bSign
= extractFloatx80Sign( b
);
6215 zSign
= aSign
^ bSign
;
6216 if ( aExp
== 0x7FFF ) {
6217 if ((uint64_t)(aSig
<< 1)) {
6218 return propagateFloatx80NaN(a
, b
, status
);
6220 if ( bExp
== 0x7FFF ) {
6221 if ((uint64_t)(bSig
<< 1)) {
6222 return propagateFloatx80NaN(a
, b
, status
);
6226 return packFloatx80(zSign
, floatx80_infinity_high
,
6227 floatx80_infinity_low
);
6229 if ( bExp
== 0x7FFF ) {
6230 if ((uint64_t)(bSig
<< 1)) {
6231 return propagateFloatx80NaN(a
, b
, status
);
6233 return packFloatx80( zSign
, 0, 0 );
6237 if ( ( aExp
| aSig
) == 0 ) {
6239 float_raise(float_flag_invalid
, status
);
6240 return floatx80_default_nan(status
);
6242 float_raise(float_flag_divbyzero
, status
);
6243 return packFloatx80(zSign
, floatx80_infinity_high
,
6244 floatx80_infinity_low
);
6246 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6249 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6250 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6252 zExp
= aExp
- bExp
+ 0x3FFE;
6254 if ( bSig
<= aSig
) {
6255 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6258 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6259 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6260 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6261 while ( (int64_t) rem0
< 0 ) {
6263 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6265 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6266 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6267 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6268 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6269 while ( (int64_t) rem1
< 0 ) {
6271 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6273 zSig1
|= ( ( rem1
| rem2
) != 0 );
6275 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6276 zSign
, zExp
, zSig0
, zSig1
, status
);
6279 /*----------------------------------------------------------------------------
6280 | Returns the remainder of the extended double-precision floating-point value
6281 | `a' with respect to the corresponding value `b'. The operation is performed
6282 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6283 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6284 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6285 | the absolute value of the integer quotient.
6286 *----------------------------------------------------------------------------*/
6288 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6289 float_status
*status
)
6292 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6293 uint64_t aSig0
, aSig1
, bSig
;
6294 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6297 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6298 float_raise(float_flag_invalid
, status
);
6299 return floatx80_default_nan(status
);
6301 aSig0
= extractFloatx80Frac( a
);
6302 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6303 aSign
= extractFloatx80Sign( a
);
6304 bSig
= extractFloatx80Frac( b
);
6305 bExp
= extractFloatx80Exp( b
);
6306 if ( aExp
== 0x7FFF ) {
6307 if ( (uint64_t) ( aSig0
<<1 )
6308 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6309 return propagateFloatx80NaN(a
, b
, status
);
6313 if ( bExp
== 0x7FFF ) {
6314 if ((uint64_t)(bSig
<< 1)) {
6315 return propagateFloatx80NaN(a
, b
, status
);
6317 if (aExp
== 0 && aSig0
>> 63) {
6319 * Pseudo-denormal argument must be returned in normalized
6322 return packFloatx80(aSign
, 1, aSig0
);
6329 float_raise(float_flag_invalid
, status
);
6330 return floatx80_default_nan(status
);
6332 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6335 if ( aSig0
== 0 ) return a
;
6336 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6339 expDiff
= aExp
- bExp
;
6341 if ( expDiff
< 0 ) {
6342 if ( mod
|| expDiff
< -1 ) {
6343 if (aExp
== 1 && aExpOrig
== 0) {
6345 * Pseudo-denormal argument must be returned in
6348 return packFloatx80(aSign
, aExp
, aSig0
);
6352 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6355 *quotient
= q
= ( bSig
<= aSig0
);
6356 if ( q
) aSig0
-= bSig
;
6358 while ( 0 < expDiff
) {
6359 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6360 q
= ( 2 < q
) ? q
- 2 : 0;
6361 mul64To128( bSig
, q
, &term0
, &term1
);
6362 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6363 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6369 if ( 0 < expDiff
) {
6370 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6371 q
= ( 2 < q
) ? q
- 2 : 0;
6373 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6374 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6375 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6376 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6378 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6381 *quotient
<<= expDiff
;
6392 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6393 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6394 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6397 aSig0
= alternateASig0
;
6398 aSig1
= alternateASig1
;
6404 normalizeRoundAndPackFloatx80(
6405 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6409 /*----------------------------------------------------------------------------
6410 | Returns the remainder of the extended double-precision floating-point value
6411 | `a' with respect to the corresponding value `b'. The operation is performed
6412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6413 *----------------------------------------------------------------------------*/
6415 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6418 return floatx80_modrem(a
, b
, false, "ient
, status
);
/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b', with the quotient truncated
| toward zero.
*----------------------------------------------------------------------------*/
6427 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6430 return floatx80_modrem(a
, b
, true, "ient
, status
);
6433 /*----------------------------------------------------------------------------
6434 | Returns the square root of the extended double-precision floating-point
6435 | value `a'. The operation is performed according to the IEC/IEEE Standard
6436 | for Binary Floating-Point Arithmetic.
6437 *----------------------------------------------------------------------------*/
6439 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6443 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6444 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6446 if (floatx80_invalid_encoding(a
)) {
6447 float_raise(float_flag_invalid
, status
);
6448 return floatx80_default_nan(status
);
6450 aSig0
= extractFloatx80Frac( a
);
6451 aExp
= extractFloatx80Exp( a
);
6452 aSign
= extractFloatx80Sign( a
);
6453 if ( aExp
== 0x7FFF ) {
6454 if ((uint64_t)(aSig0
<< 1)) {
6455 return propagateFloatx80NaN(a
, a
, status
);
6457 if ( ! aSign
) return a
;
6461 if ( ( aExp
| aSig0
) == 0 ) return a
;
6463 float_raise(float_flag_invalid
, status
);
6464 return floatx80_default_nan(status
);
6467 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6468 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6470 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6471 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6472 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6473 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6474 doubleZSig0
= zSig0
<<1;
6475 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6476 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6477 while ( (int64_t) rem0
< 0 ) {
6480 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6482 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6483 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6484 if ( zSig1
== 0 ) zSig1
= 1;
6485 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6486 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6487 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6488 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6489 while ( (int64_t) rem1
< 0 ) {
6491 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6493 term2
|= doubleZSig0
;
6494 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6496 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6498 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6499 zSig0
|= doubleZSig0
;
6500 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6501 0, zExp
, zSig0
, zSig1
, status
);
6504 /*----------------------------------------------------------------------------
6505 | Returns the result of converting the quadruple-precision floating-point
6506 | value `a' to the 32-bit two's complement integer format. The conversion
6507 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6508 | Arithmetic---which means in particular that the conversion is rounded
6509 | according to the current rounding mode. If `a' is a NaN, the largest
6510 | positive integer is returned. Otherwise, if the conversion overflows, the
6511 | largest integer with the same sign as `a' is returned.
6512 *----------------------------------------------------------------------------*/
6514 int32_t float128_to_int32(float128 a
, float_status
*status
)
6517 int32_t aExp
, shiftCount
;
6518 uint64_t aSig0
, aSig1
;
6520 aSig1
= extractFloat128Frac1( a
);
6521 aSig0
= extractFloat128Frac0( a
);
6522 aExp
= extractFloat128Exp( a
);
6523 aSign
= extractFloat128Sign( a
);
6524 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6525 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6526 aSig0
|= ( aSig1
!= 0 );
6527 shiftCount
= 0x4028 - aExp
;
6528 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6529 return roundAndPackInt32(aSign
, aSig0
, status
);
6533 /*----------------------------------------------------------------------------
6534 | Returns the result of converting the quadruple-precision floating-point
6535 | value `a' to the 32-bit two's complement integer format. The conversion
6536 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6537 | Arithmetic, except that the conversion is always rounded toward zero. If
6538 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6539 | conversion overflows, the largest integer with the same sign as `a' is
6541 *----------------------------------------------------------------------------*/
6543 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6546 int32_t aExp
, shiftCount
;
6547 uint64_t aSig0
, aSig1
, savedASig
;
6550 aSig1
= extractFloat128Frac1( a
);
6551 aSig0
= extractFloat128Frac0( a
);
6552 aExp
= extractFloat128Exp( a
);
6553 aSign
= extractFloat128Sign( a
);
6554 aSig0
|= ( aSig1
!= 0 );
6555 if ( 0x401E < aExp
) {
6556 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6559 else if ( aExp
< 0x3FFF ) {
6560 if (aExp
|| aSig0
) {
6561 float_raise(float_flag_inexact
, status
);
6565 aSig0
|= UINT64_C(0x0001000000000000);
6566 shiftCount
= 0x402F - aExp
;
6568 aSig0
>>= shiftCount
;
6570 if ( aSign
) z
= - z
;
6571 if ( ( z
< 0 ) ^ aSign
) {
6573 float_raise(float_flag_invalid
, status
);
6574 return aSign
? INT32_MIN
: INT32_MAX
;
6576 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6577 float_raise(float_flag_inexact
, status
);
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 64-bit two's complement integer format. The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic---which means in particular that the conversion is rounded
6588 | according to the current rounding mode. If `a' is a NaN, the largest
6589 | positive integer is returned. Otherwise, if the conversion overflows, the
6590 | largest integer with the same sign as `a' is returned.
6591 *----------------------------------------------------------------------------*/
6593 int64_t float128_to_int64(float128 a
, float_status
*status
)
6596 int32_t aExp
, shiftCount
;
6597 uint64_t aSig0
, aSig1
;
6599 aSig1
= extractFloat128Frac1( a
);
6600 aSig0
= extractFloat128Frac0( a
);
6601 aExp
= extractFloat128Exp( a
);
6602 aSign
= extractFloat128Sign( a
);
6603 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6604 shiftCount
= 0x402F - aExp
;
6605 if ( shiftCount
<= 0 ) {
6606 if ( 0x403E < aExp
) {
6607 float_raise(float_flag_invalid
, status
);
6609 || ( ( aExp
== 0x7FFF )
6610 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6617 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6620 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6622 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6626 /*----------------------------------------------------------------------------
6627 | Returns the result of converting the quadruple-precision floating-point
6628 | value `a' to the 64-bit two's complement integer format. The conversion
6629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6630 | Arithmetic, except that the conversion is always rounded toward zero.
6631 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6632 | the conversion overflows, the largest integer with the same sign as `a' is
6634 *----------------------------------------------------------------------------*/
6636 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6639 int32_t aExp
, shiftCount
;
6640 uint64_t aSig0
, aSig1
;
6643 aSig1
= extractFloat128Frac1( a
);
6644 aSig0
= extractFloat128Frac0( a
);
6645 aExp
= extractFloat128Exp( a
);
6646 aSign
= extractFloat128Sign( a
);
6647 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6648 shiftCount
= aExp
- 0x402F;
6649 if ( 0 < shiftCount
) {
6650 if ( 0x403E <= aExp
) {
6651 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6652 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6653 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6655 float_raise(float_flag_inexact
, status
);
6659 float_raise(float_flag_invalid
, status
);
6660 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6666 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6667 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6668 float_raise(float_flag_inexact
, status
);
6672 if ( aExp
< 0x3FFF ) {
6673 if ( aExp
| aSig0
| aSig1
) {
6674 float_raise(float_flag_inexact
, status
);
6678 z
= aSig0
>>( - shiftCount
);
6680 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6681 float_raise(float_flag_inexact
, status
);
6684 if ( aSign
) z
= - z
;
6689 /*----------------------------------------------------------------------------
6690 | Returns the result of converting the quadruple-precision floating-point value
6691 | `a' to the 64-bit unsigned integer format. The conversion is
6692 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6693 | Arithmetic---which means in particular that the conversion is rounded
6694 | according to the current rounding mode. If `a' is a NaN, the largest
6695 | positive integer is returned. If the conversion overflows, the
6696 | largest unsigned integer is returned. If 'a' is negative, the value is
6697 | rounded and zero is returned; negative values that do not round to zero
6698 | will raise the inexact exception.
6699 *----------------------------------------------------------------------------*/
6701 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6706 uint64_t aSig0
, aSig1
;
6708 aSig0
= extractFloat128Frac0(a
);
6709 aSig1
= extractFloat128Frac1(a
);
6710 aExp
= extractFloat128Exp(a
);
6711 aSign
= extractFloat128Sign(a
);
6712 if (aSign
&& (aExp
> 0x3FFE)) {
6713 float_raise(float_flag_invalid
, status
);
6714 if (float128_is_any_nan(a
)) {
6721 aSig0
|= UINT64_C(0x0001000000000000);
6723 shiftCount
= 0x402F - aExp
;
6724 if (shiftCount
<= 0) {
6725 if (0x403E < aExp
) {
6726 float_raise(float_flag_invalid
, status
);
6729 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6731 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6733 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6736 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6739 signed char current_rounding_mode
= status
->float_rounding_mode
;
6741 set_float_rounding_mode(float_round_to_zero
, status
);
6742 v
= float128_to_uint64(a
, status
);
6743 set_float_rounding_mode(current_rounding_mode
, status
);
6748 /*----------------------------------------------------------------------------
6749 | Returns the result of converting the quadruple-precision floating-point
6750 | value `a' to the 32-bit unsigned integer format. The conversion
6751 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6752 | Arithmetic except that the conversion is always rounded toward zero.
6753 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6754 | if the conversion overflows, the largest unsigned integer is returned.
6755 | If 'a' is negative, the value is rounded and zero is returned; negative
6756 | values that do not round to zero will raise the inexact exception.
6757 *----------------------------------------------------------------------------*/
6759 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6763 int old_exc_flags
= get_float_exception_flags(status
);
6765 v
= float128_to_uint64_round_to_zero(a
, status
);
6766 if (v
> 0xffffffff) {
6771 set_float_exception_flags(old_exc_flags
, status
);
6772 float_raise(float_flag_invalid
, status
);
6776 /*----------------------------------------------------------------------------
6777 | Returns the result of converting the quadruple-precision floating-point value
6778 | `a' to the 32-bit unsigned integer format. The conversion is
6779 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6780 | Arithmetic---which means in particular that the conversion is rounded
6781 | according to the current rounding mode. If `a' is a NaN, the largest
6782 | positive integer is returned. If the conversion overflows, the
6783 | largest unsigned integer is returned. If 'a' is negative, the value is
6784 | rounded and zero is returned; negative values that do not round to zero
6785 | will raise the inexact exception.
6786 *----------------------------------------------------------------------------*/
6788 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6792 int old_exc_flags
= get_float_exception_flags(status
);
6794 v
= float128_to_uint64(a
, status
);
6795 if (v
> 0xffffffff) {
6800 set_float_exception_flags(old_exc_flags
, status
);
6801 float_raise(float_flag_invalid
, status
);
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the single-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6810 *----------------------------------------------------------------------------*/
6812 float32
float128_to_float32(float128 a
, float_status
*status
)
6816 uint64_t aSig0
, aSig1
;
6819 aSig1
= extractFloat128Frac1( a
);
6820 aSig0
= extractFloat128Frac0( a
);
6821 aExp
= extractFloat128Exp( a
);
6822 aSign
= extractFloat128Sign( a
);
6823 if ( aExp
== 0x7FFF ) {
6824 if ( aSig0
| aSig1
) {
6825 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6827 return packFloat32( aSign
, 0xFF, 0 );
6829 aSig0
|= ( aSig1
!= 0 );
6830 shift64RightJamming( aSig0
, 18, &aSig0
);
6832 if ( aExp
|| zSig
) {
6836 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6840 /*----------------------------------------------------------------------------
6841 | Returns the result of converting the quadruple-precision floating-point
6842 | value `a' to the double-precision floating-point format. The conversion
6843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
6845 *----------------------------------------------------------------------------*/
6847 float64
float128_to_float64(float128 a
, float_status
*status
)
6851 uint64_t aSig0
, aSig1
;
6853 aSig1
= extractFloat128Frac1( a
);
6854 aSig0
= extractFloat128Frac0( a
);
6855 aExp
= extractFloat128Exp( a
);
6856 aSign
= extractFloat128Sign( a
);
6857 if ( aExp
== 0x7FFF ) {
6858 if ( aSig0
| aSig1
) {
6859 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6861 return packFloat64( aSign
, 0x7FF, 0 );
6863 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6864 aSig0
|= ( aSig1
!= 0 );
6865 if ( aExp
|| aSig0
) {
6866 aSig0
|= UINT64_C(0x4000000000000000);
6869 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6873 /*----------------------------------------------------------------------------
6874 | Returns the result of converting the quadruple-precision floating-point
6875 | value `a' to the extended double-precision floating-point format. The
6876 | conversion is performed according to the IEC/IEEE Standard for Binary
6877 | Floating-Point Arithmetic.
6878 *----------------------------------------------------------------------------*/
6880 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6884 uint64_t aSig0
, aSig1
;
6886 aSig1
= extractFloat128Frac1( a
);
6887 aSig0
= extractFloat128Frac0( a
);
6888 aExp
= extractFloat128Exp( a
);
6889 aSign
= extractFloat128Sign( a
);
6890 if ( aExp
== 0x7FFF ) {
6891 if ( aSig0
| aSig1
) {
6892 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6894 return floatx80_silence_nan(res
, status
);
6896 return packFloatx80(aSign
, floatx80_infinity_high
,
6897 floatx80_infinity_low
);
6900 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6901 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6904 aSig0
|= UINT64_C(0x0001000000000000);
6906 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6907 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6911 /*----------------------------------------------------------------------------
6912 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6913 | returns the result as a quadruple-precision floating-point value. The
6914 | operation is performed according to the IEC/IEEE Standard for Binary
6915 | Floating-Point Arithmetic.
6916 *----------------------------------------------------------------------------*/
6918 float128
float128_round_to_int(float128 a
, float_status
*status
)
6922 uint64_t lastBitMask
, roundBitsMask
;
6925 aExp
= extractFloat128Exp( a
);
6926 if ( 0x402F <= aExp
) {
6927 if ( 0x406F <= aExp
) {
6928 if ( ( aExp
== 0x7FFF )
6929 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6931 return propagateFloat128NaN(a
, a
, status
);
6936 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6937 roundBitsMask
= lastBitMask
- 1;
6939 switch (status
->float_rounding_mode
) {
6940 case float_round_nearest_even
:
6941 if ( lastBitMask
) {
6942 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6943 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6946 if ( (int64_t) z
.low
< 0 ) {
6948 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6952 case float_round_ties_away
:
6954 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6956 if ((int64_t) z
.low
< 0) {
6961 case float_round_to_zero
:
6963 case float_round_up
:
6964 if (!extractFloat128Sign(z
)) {
6965 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6968 case float_round_down
:
6969 if (extractFloat128Sign(z
)) {
6970 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6973 case float_round_to_odd
:
6975 * Note that if lastBitMask == 0, the last bit is the lsb
6976 * of high, and roundBitsMask == -1.
6978 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6979 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6985 z
.low
&= ~ roundBitsMask
;
6988 if ( aExp
< 0x3FFF ) {
6989 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6990 float_raise(float_flag_inexact
, status
);
6991 aSign
= extractFloat128Sign( a
);
6992 switch (status
->float_rounding_mode
) {
6993 case float_round_nearest_even
:
6994 if ( ( aExp
== 0x3FFE )
6995 && ( extractFloat128Frac0( a
)
6996 | extractFloat128Frac1( a
) )
6998 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7001 case float_round_ties_away
:
7002 if (aExp
== 0x3FFE) {
7003 return packFloat128(aSign
, 0x3FFF, 0, 0);
7006 case float_round_down
:
7008 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7009 : packFloat128( 0, 0, 0, 0 );
7010 case float_round_up
:
7012 aSign
? packFloat128( 1, 0, 0, 0 )
7013 : packFloat128( 0, 0x3FFF, 0, 0 );
7015 case float_round_to_odd
:
7016 return packFloat128(aSign
, 0x3FFF, 0, 0);
7018 case float_round_to_zero
:
7021 return packFloat128( aSign
, 0, 0, 0 );
7024 lastBitMask
<<= 0x402F - aExp
;
7025 roundBitsMask
= lastBitMask
- 1;
7028 switch (status
->float_rounding_mode
) {
7029 case float_round_nearest_even
:
7030 z
.high
+= lastBitMask
>>1;
7031 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7032 z
.high
&= ~ lastBitMask
;
7035 case float_round_ties_away
:
7036 z
.high
+= lastBitMask
>>1;
7038 case float_round_to_zero
:
7040 case float_round_up
:
7041 if (!extractFloat128Sign(z
)) {
7042 z
.high
|= ( a
.low
!= 0 );
7043 z
.high
+= roundBitsMask
;
7046 case float_round_down
:
7047 if (extractFloat128Sign(z
)) {
7048 z
.high
|= (a
.low
!= 0);
7049 z
.high
+= roundBitsMask
;
7052 case float_round_to_odd
:
7053 if ((z
.high
& lastBitMask
) == 0) {
7054 z
.high
|= (a
.low
!= 0);
7055 z
.high
+= roundBitsMask
;
7061 z
.high
&= ~ roundBitsMask
;
7063 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7064 float_raise(float_flag_inexact
, status
);
7070 /*----------------------------------------------------------------------------
7071 | Returns the result of dividing the quadruple-precision floating-point value
7072 | `a' by the corresponding value `b'. The operation is performed according to
7073 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7074 *----------------------------------------------------------------------------*/
7076 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7078 bool aSign
, bSign
, zSign
;
7079 int32_t aExp
, bExp
, zExp
;
7080 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7081 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7083 aSig1
= extractFloat128Frac1( a
);
7084 aSig0
= extractFloat128Frac0( a
);
7085 aExp
= extractFloat128Exp( a
);
7086 aSign
= extractFloat128Sign( a
);
7087 bSig1
= extractFloat128Frac1( b
);
7088 bSig0
= extractFloat128Frac0( b
);
7089 bExp
= extractFloat128Exp( b
);
7090 bSign
= extractFloat128Sign( b
);
7091 zSign
= aSign
^ bSign
;
7092 if ( aExp
== 0x7FFF ) {
7093 if (aSig0
| aSig1
) {
7094 return propagateFloat128NaN(a
, b
, status
);
7096 if ( bExp
== 0x7FFF ) {
7097 if (bSig0
| bSig1
) {
7098 return propagateFloat128NaN(a
, b
, status
);
7102 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7104 if ( bExp
== 0x7FFF ) {
7105 if (bSig0
| bSig1
) {
7106 return propagateFloat128NaN(a
, b
, status
);
7108 return packFloat128( zSign
, 0, 0, 0 );
7111 if ( ( bSig0
| bSig1
) == 0 ) {
7112 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7114 float_raise(float_flag_invalid
, status
);
7115 return float128_default_nan(status
);
7117 float_raise(float_flag_divbyzero
, status
);
7118 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7120 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7123 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7124 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7126 zExp
= aExp
- bExp
+ 0x3FFD;
7128 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7130 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7131 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7132 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7135 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7136 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7137 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7138 while ( (int64_t) rem0
< 0 ) {
7140 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7142 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7143 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7144 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7145 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7146 while ( (int64_t) rem1
< 0 ) {
7148 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7150 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7152 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7153 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7157 /*----------------------------------------------------------------------------
7158 | Returns the remainder of the quadruple-precision floating-point value `a'
7159 | with respect to the corresponding value `b'. The operation is performed
7160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7161 *----------------------------------------------------------------------------*/
7163 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7166 int32_t aExp
, bExp
, expDiff
;
7167 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7168 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7171 aSig1
= extractFloat128Frac1( a
);
7172 aSig0
= extractFloat128Frac0( a
);
7173 aExp
= extractFloat128Exp( a
);
7174 aSign
= extractFloat128Sign( a
);
7175 bSig1
= extractFloat128Frac1( b
);
7176 bSig0
= extractFloat128Frac0( b
);
7177 bExp
= extractFloat128Exp( b
);
7178 if ( aExp
== 0x7FFF ) {
7179 if ( ( aSig0
| aSig1
)
7180 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7181 return propagateFloat128NaN(a
, b
, status
);
7185 if ( bExp
== 0x7FFF ) {
7186 if (bSig0
| bSig1
) {
7187 return propagateFloat128NaN(a
, b
, status
);
7192 if ( ( bSig0
| bSig1
) == 0 ) {
7194 float_raise(float_flag_invalid
, status
);
7195 return float128_default_nan(status
);
7197 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7200 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7201 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7203 expDiff
= aExp
- bExp
;
7204 if ( expDiff
< -1 ) return a
;
7206 aSig0
| UINT64_C(0x0001000000000000),
7208 15 - ( expDiff
< 0 ),
7213 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7214 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7215 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7217 while ( 0 < expDiff
) {
7218 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7219 q
= ( 4 < q
) ? q
- 4 : 0;
7220 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7221 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7222 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7223 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7226 if ( -64 < expDiff
) {
7227 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7228 q
= ( 4 < q
) ? q
- 4 : 0;
7230 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7232 if ( expDiff
< 0 ) {
7233 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7236 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7238 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7239 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7242 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7243 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7246 alternateASig0
= aSig0
;
7247 alternateASig1
= aSig1
;
7249 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7250 } while ( 0 <= (int64_t) aSig0
);
7252 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7253 if ( ( sigMean0
< 0 )
7254 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7255 aSig0
= alternateASig0
;
7256 aSig1
= alternateASig1
;
7258 zSign
= ( (int64_t) aSig0
< 0 );
7259 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7260 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7264 /*----------------------------------------------------------------------------
7265 | Returns the square root of the quadruple-precision floating-point value `a'.
7266 | The operation is performed according to the IEC/IEEE Standard for Binary
7267 | Floating-Point Arithmetic.
7268 *----------------------------------------------------------------------------*/
7270 float128
float128_sqrt(float128 a
, float_status
*status
)
7274 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7275 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7277 aSig1
= extractFloat128Frac1( a
);
7278 aSig0
= extractFloat128Frac0( a
);
7279 aExp
= extractFloat128Exp( a
);
7280 aSign
= extractFloat128Sign( a
);
7281 if ( aExp
== 0x7FFF ) {
7282 if (aSig0
| aSig1
) {
7283 return propagateFloat128NaN(a
, a
, status
);
7285 if ( ! aSign
) return a
;
7289 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7291 float_raise(float_flag_invalid
, status
);
7292 return float128_default_nan(status
);
7295 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7296 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7298 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7299 aSig0
|= UINT64_C(0x0001000000000000);
7300 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7301 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7302 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7303 doubleZSig0
= zSig0
<<1;
7304 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7305 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7306 while ( (int64_t) rem0
< 0 ) {
7309 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7311 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7312 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7313 if ( zSig1
== 0 ) zSig1
= 1;
7314 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7315 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7316 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7317 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7318 while ( (int64_t) rem1
< 0 ) {
7320 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7322 term2
|= doubleZSig0
;
7323 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7325 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7327 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7328 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7332 static inline FloatRelation
7333 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7334 float_status
*status
)
7338 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7339 float_raise(float_flag_invalid
, status
);
7340 return float_relation_unordered
;
7342 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7343 ( extractFloatx80Frac( a
)<<1 ) ) ||
7344 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7345 ( extractFloatx80Frac( b
)<<1 ) )) {
7347 floatx80_is_signaling_nan(a
, status
) ||
7348 floatx80_is_signaling_nan(b
, status
)) {
7349 float_raise(float_flag_invalid
, status
);
7351 return float_relation_unordered
;
7353 aSign
= extractFloatx80Sign( a
);
7354 bSign
= extractFloatx80Sign( b
);
7355 if ( aSign
!= bSign
) {
7357 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7358 ( ( a
.low
| b
.low
) == 0 ) ) {
7360 return float_relation_equal
;
7362 return 1 - (2 * aSign
);
7365 /* Normalize pseudo-denormals before comparison. */
7366 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7369 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7372 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7373 return float_relation_equal
;
7375 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7380 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7382 return floatx80_compare_internal(a
, b
, 0, status
);
7385 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7386 float_status
*status
)
7388 return floatx80_compare_internal(a
, b
, 1, status
);
7391 static inline FloatRelation
7392 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7393 float_status
*status
)
7397 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7398 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7399 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7400 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7402 float128_is_signaling_nan(a
, status
) ||
7403 float128_is_signaling_nan(b
, status
)) {
7404 float_raise(float_flag_invalid
, status
);
7406 return float_relation_unordered
;
7408 aSign
= extractFloat128Sign( a
);
7409 bSign
= extractFloat128Sign( b
);
7410 if ( aSign
!= bSign
) {
7411 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7413 return float_relation_equal
;
7415 return 1 - (2 * aSign
);
7418 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7419 return float_relation_equal
;
7421 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7426 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7428 return float128_compare_internal(a
, b
, 0, status
);
7431 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7432 float_status
*status
)
7434 return float128_compare_internal(a
, b
, 1, status
);
7437 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7443 if (floatx80_invalid_encoding(a
)) {
7444 float_raise(float_flag_invalid
, status
);
7445 return floatx80_default_nan(status
);
7447 aSig
= extractFloatx80Frac( a
);
7448 aExp
= extractFloatx80Exp( a
);
7449 aSign
= extractFloatx80Sign( a
);
7451 if ( aExp
== 0x7FFF ) {
7453 return propagateFloatx80NaN(a
, a
, status
);
7467 } else if (n
< -0x10000) {
7472 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7473 aSign
, aExp
, aSig
, 0, status
);
7476 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7480 uint64_t aSig0
, aSig1
;
7482 aSig1
= extractFloat128Frac1( a
);
7483 aSig0
= extractFloat128Frac0( a
);
7484 aExp
= extractFloat128Exp( a
);
7485 aSign
= extractFloat128Sign( a
);
7486 if ( aExp
== 0x7FFF ) {
7487 if ( aSig0
| aSig1
) {
7488 return propagateFloat128NaN(a
, a
, status
);
7493 aSig0
|= UINT64_C(0x0001000000000000);
7494 } else if (aSig0
== 0 && aSig1
== 0) {
7502 } else if (n
< -0x10000) {
7507 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7512 static void __attribute__((constructor
)) softfloat_init(void)
7514 union_float64 ua
, ub
, uc
, ur
;
7516 if (QEMU_NO_HARDFLOAT
) {
7520 * Test that the host's FMA is not obviously broken. For example,
7521 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7522 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7524 ua
.s
= 0x0020000000000001ULL
;
7525 ub
.s
= 0x3ca0000000000000ULL
;
7526 uc
.s
= 0x0020000000000000ULL
;
7527 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7528 if (ur
.s
!= 0x0020000000000001ULL
) {
7529 force_soft_fma
= true;