4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT 63
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
540 /* Structure holding all of the relevant parameters for a format.
541 * exp_size: the size of the exponent field
542 * exp_bias: the offset applied to the exponent field
543 * exp_max: the maximum normalised exponent
544 * frac_size: the size of the fraction field
545 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546 * The following are computed based the size of fraction
547 * frac_lsb: least significant bit of fraction
548 * frac_lsbm1: the bit below the least significant bit (for rounding)
549 * round_mask/roundeven_mask: masks used for rounding
550 * The following optional modifiers are available:
551 * arm_althp: handle ARM Alternative Half Precision
562 uint64_t roundeven_mask
;
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F) \
569 .exp_bias = ((1 << E) - 1) >> 1, \
570 .exp_max = (1 << E) - 1, \
572 .frac_shift = (-F - 1) & 63, \
573 .frac_lsb = 1ull << ((-F - 1) & 63), \
574 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \
575 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \
576 .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
578 static const FloatFmt float16_params
= {
582 static const FloatFmt float16_params_ahp
= {
587 static const FloatFmt bfloat16_params
= {
591 static const FloatFmt float32_params
= {
595 static const FloatFmt float64_params
= {
599 static const FloatFmt float128_params
= {
600 FLOAT_PARAMS(15, 112)
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
606 const int f_size
= fmt
->frac_size
;
607 const int e_size
= fmt
->exp_size
;
609 *r
= (FloatParts64
) {
610 .cls
= float_class_unclassified
,
611 .sign
= extract64(raw
, f_size
+ e_size
, 1),
612 .exp
= extract64(raw
, f_size
, e_size
),
613 .frac
= extract64(raw
, 0, f_size
)
617 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
619 unpack_raw64(p
, &float16_params
, f
);
622 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
624 unpack_raw64(p
, &bfloat16_params
, f
);
627 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
629 unpack_raw64(p
, &float32_params
, f
);
632 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
634 unpack_raw64(p
, &float64_params
, f
);
637 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
639 const int f_size
= float128_params
.frac_size
- 64;
640 const int e_size
= float128_params
.exp_size
;
642 *p
= (FloatParts128
) {
643 .cls
= float_class_unclassified
,
644 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
645 .exp
= extract64(f
.high
, f_size
, e_size
),
646 .frac_hi
= extract64(f
.high
, 0, f_size
),
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
654 const int f_size
= fmt
->frac_size
;
655 const int e_size
= fmt
->exp_size
;
658 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
659 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
660 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
664 static inline float16
float16_pack_raw(const FloatParts64
*p
)
666 return make_float16(pack_raw64(p
, &float16_params
));
669 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
671 return pack_raw64(p
, &bfloat16_params
);
674 static inline float32
float32_pack_raw(const FloatParts64
*p
)
676 return make_float32(pack_raw64(p
, &float32_params
));
679 static inline float64
float64_pack_raw(const FloatParts64
*p
)
681 return make_float64(pack_raw64(p
, &float64_params
));
684 static float128
float128_pack_raw(const FloatParts128
*p
)
686 const int f_size
= float128_params
.frac_size
- 64;
687 const int e_size
= float128_params
.exp_size
;
690 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
691 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
692 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
693 return make_float128(hi
, p
->frac_lo
);
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
706 #define PARTS_GENERIC_64_128(NAME, P) \
707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
712 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
713 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
717 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
719 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
724 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
725 FloatParts64
*c
, float_status
*s
,
726 int ab_mask
, int abc_mask
);
727 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
731 int ab_mask
, int abc_mask
);
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
736 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
737 const FloatFmt
*fmt
);
738 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
739 const FloatFmt
*fmt
);
741 #define parts_canonicalize(A, S, F) \
742 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
744 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
745 const FloatFmt
*fmt
);
746 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
747 const FloatFmt
*fmt
);
749 #define parts_uncanon(A, S, F) \
750 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
752 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
753 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
755 #define parts_add_normal(A, B) \
756 PARTS_GENERIC_64_128(add_normal, A)(A, B)
758 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
759 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
761 #define parts_sub_normal(A, B) \
762 PARTS_GENERIC_64_128(sub_normal, A)(A, B)
764 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
765 float_status
*s
, bool subtract
);
766 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
767 float_status
*s
, bool subtract
);
769 #define parts_addsub(A, B, S, Z) \
770 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
776 #define FRAC_GENERIC_64_128(NAME, P) \
777 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
779 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
781 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
784 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
787 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
788 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
794 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
796 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
799 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
801 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
802 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
805 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
807 static void frac64_allones(FloatParts64
*a
)
812 static void frac128_allones(FloatParts128
*a
)
814 a
->frac_hi
= a
->frac_lo
= -1;
817 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
819 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
821 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
824 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
826 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
828 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
833 return ta
< tb
? -1 : 1;
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
838 static void frac64_clear(FloatParts64
*a
)
843 static void frac128_clear(FloatParts128
*a
)
845 a
->frac_hi
= a
->frac_lo
= 0;
848 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
850 static bool frac64_eqz(FloatParts64
*a
)
855 static bool frac128_eqz(FloatParts128
*a
)
857 return (a
->frac_hi
| a
->frac_lo
) == 0;
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
862 static void frac64_neg(FloatParts64
*a
)
867 static void frac128_neg(FloatParts128
*a
)
870 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
871 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
874 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A)
876 static int frac64_normalize(FloatParts64
*a
)
879 int shift
= clz64(a
->frac
);
886 static int frac128_normalize(FloatParts128
*a
)
889 int shl
= clz64(a
->frac_hi
);
892 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
893 a
->frac_lo
= (a
->frac_lo
<< shl
);
896 } else if (a
->frac_lo
) {
897 int shl
= clz64(a
->frac_lo
);
898 a
->frac_hi
= (a
->frac_lo
<< shl
);
905 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
907 static void frac64_shl(FloatParts64
*a
, int c
)
912 static void frac128_shl(FloatParts128
*a
, int c
)
914 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
917 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
919 static void frac64_shr(FloatParts64
*a
, int c
)
924 static void frac128_shr(FloatParts128
*a
, int c
)
926 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
929 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
931 static void frac64_shrjam(FloatParts64
*a
, int c
)
933 shift64RightJamming(a
->frac
, c
, &a
->frac
);
936 static void frac128_shrjam(FloatParts128
*a
, int c
)
938 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
941 #define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C)
943 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
945 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
948 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
951 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
952 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
956 #define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B)
958 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
959 #define FloatPartsN glue(FloatParts,N)
963 #include "softfloat-parts-addsub.c.inc"
964 #include "softfloat-parts.c.inc"
969 #include "softfloat-parts-addsub.c.inc"
970 #include "softfloat-parts.c.inc"
977 * Pack/unpack routines with a specific FloatFmt.
980 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
981 float_status
*s
, const FloatFmt
*params
)
983 float16_unpack_raw(p
, f
);
984 parts_canonicalize(p
, s
, params
);
987 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
990 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
993 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
996 bfloat16_unpack_raw(p
, f
);
997 parts_canonicalize(p
, s
, &bfloat16_params
);
1000 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1002 const FloatFmt
*params
)
1004 parts_uncanon(p
, s
, params
);
1005 return float16_pack_raw(p
);
1008 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1011 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1014 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1017 parts_uncanon(p
, s
, &bfloat16_params
);
1018 return bfloat16_pack_raw(p
);
1021 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1024 float32_unpack_raw(p
, f
);
1025 parts_canonicalize(p
, s
, &float32_params
);
1028 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1031 parts_uncanon(p
, s
, &float32_params
);
1032 return float32_pack_raw(p
);
1035 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1038 float64_unpack_raw(p
, f
);
1039 parts_canonicalize(p
, s
, &float64_params
);
1042 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1045 parts_uncanon(p
, s
, &float64_params
);
1046 return float64_pack_raw(p
);
1049 static void float128_unpack_canonical(FloatParts128
*p
, float128 f
,
1052 float128_unpack_raw(p
, f
);
1053 parts_canonicalize(p
, s
, &float128_params
);
1056 static float128
float128_round_pack_canonical(FloatParts128
*p
,
1059 parts_uncanon(p
, s
, &float128_params
);
1060 return float128_pack_raw(p
);
1064 * Addition and subtraction
1067 static float16 QEMU_FLATTEN
1068 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1070 FloatParts64 pa
, pb
, *pr
;
1072 float16_unpack_canonical(&pa
, a
, status
);
1073 float16_unpack_canonical(&pb
, b
, status
);
1074 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1076 return float16_round_pack_canonical(pr
, status
);
1079 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1081 return float16_addsub(a
, b
, status
, false);
1084 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1086 return float16_addsub(a
, b
, status
, true);
1089 static float32 QEMU_SOFTFLOAT_ATTR
1090 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1092 FloatParts64 pa
, pb
, *pr
;
1094 float32_unpack_canonical(&pa
, a
, status
);
1095 float32_unpack_canonical(&pb
, b
, status
);
1096 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1098 return float32_round_pack_canonical(pr
, status
);
1101 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1103 return soft_f32_addsub(a
, b
, status
, false);
1106 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1108 return soft_f32_addsub(a
, b
, status
, true);
1111 static float64 QEMU_SOFTFLOAT_ATTR
1112 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1114 FloatParts64 pa
, pb
, *pr
;
1116 float64_unpack_canonical(&pa
, a
, status
);
1117 float64_unpack_canonical(&pb
, b
, status
);
1118 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1120 return float64_round_pack_canonical(pr
, status
);
1123 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1125 return soft_f64_addsub(a
, b
, status
, false);
1128 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1130 return soft_f64_addsub(a
, b
, status
, true);
/* Host-FPU primitives used on the hardfloat fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1153 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1155 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1156 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1158 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1161 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1163 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1164 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1166 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1170 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1171 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1173 return float32_gen2(a
, b
, s
, hard
, soft
,
1174 f32_is_zon2
, f32_addsubmul_post
);
1177 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1178 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1180 return float64_gen2(a
, b
, s
, hard
, soft
,
1181 f64_is_zon2
, f64_addsubmul_post
);
1184 float32 QEMU_FLATTEN
1185 float32_add(float32 a
, float32 b
, float_status
*s
)
1187 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1190 float32 QEMU_FLATTEN
1191 float32_sub(float32 a
, float32 b
, float_status
*s
)
1193 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1196 float64 QEMU_FLATTEN
1197 float64_add(float64 a
, float64 b
, float_status
*s
)
1199 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1202 float64 QEMU_FLATTEN
1203 float64_sub(float64 a
, float64 b
, float_status
*s
)
1205 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1208 static bfloat16 QEMU_FLATTEN
1209 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1211 FloatParts64 pa
, pb
, *pr
;
1213 bfloat16_unpack_canonical(&pa
, a
, status
);
1214 bfloat16_unpack_canonical(&pb
, b
, status
);
1215 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1217 return bfloat16_round_pack_canonical(pr
, status
);
1220 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1222 return bfloat16_addsub(a
, b
, status
, false);
1225 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1227 return bfloat16_addsub(a
, b
, status
, true);
1230 static float128 QEMU_FLATTEN
1231 float128_addsub(float128 a
, float128 b
, float_status
*status
, bool subtract
)
1233 FloatParts128 pa
, pb
, *pr
;
1235 float128_unpack_canonical(&pa
, a
, status
);
1236 float128_unpack_canonical(&pb
, b
, status
);
1237 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1239 return float128_round_pack_canonical(pr
, status
);
1242 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
1244 return float128_addsub(a
, b
, status
, false);
1247 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
1249 return float128_addsub(a
, b
, status
, true);
1253 * Returns the result of multiplying the floating-point values `a' and
1254 * `b'. The operation is performed according to the IEC/IEEE Standard
1255 * for Binary Floating-Point Arithmetic.
1258 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1260 bool sign
= a
.sign
^ b
.sign
;
1262 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1264 int exp
= a
.exp
+ b
.exp
;
1266 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1267 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1280 /* handle all the NaN cases */
1281 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1282 return *parts_pick_nan(&a
, &b
, s
);
1284 /* Inf * Zero == NaN */
1285 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1286 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1287 float_raise(float_flag_invalid
, s
);
1288 parts_default_nan(&a
, s
);
1291 /* Multiply by 0 or Inf */
1292 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1296 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1300 g_assert_not_reached();
1303 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1305 FloatParts64 pa
, pb
, pr
;
1307 float16_unpack_canonical(&pa
, a
, status
);
1308 float16_unpack_canonical(&pb
, b
, status
);
1309 pr
= mul_floats(pa
, pb
, status
);
1311 return float16_round_pack_canonical(&pr
, status
);
1314 static float32 QEMU_SOFTFLOAT_ATTR
1315 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1317 FloatParts64 pa
, pb
, pr
;
1319 float32_unpack_canonical(&pa
, a
, status
);
1320 float32_unpack_canonical(&pb
, b
, status
);
1321 pr
= mul_floats(pa
, pb
, status
);
1323 return float32_round_pack_canonical(&pr
, status
);
1326 static float64 QEMU_SOFTFLOAT_ATTR
1327 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1329 FloatParts64 pa
, pb
, pr
;
1331 float64_unpack_canonical(&pa
, a
, status
);
1332 float64_unpack_canonical(&pb
, b
, status
);
1333 pr
= mul_floats(pa
, pb
, status
);
1335 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU multiply primitives for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1348 float32 QEMU_FLATTEN
1349 float32_mul(float32 a
, float32 b
, float_status
*s
)
1351 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1352 f32_is_zon2
, f32_addsubmul_post
);
1355 float64 QEMU_FLATTEN
1356 float64_mul(float64 a
, float64 b
, float_status
*s
)
1358 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1359 f64_is_zon2
, f64_addsubmul_post
);
1363 * Returns the result of multiplying the bfloat16
1364 * values `a' and `b'.
1367 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1369 FloatParts64 pa
, pb
, pr
;
1371 bfloat16_unpack_canonical(&pa
, a
, status
);
1372 bfloat16_unpack_canonical(&pb
, b
, status
);
1373 pr
= mul_floats(pa
, pb
, status
);
1375 return bfloat16_round_pack_canonical(&pr
, status
);
1379 * Returns the result of multiplying the floating-point values `a' and
1380 * `b' then adding 'c', with no intermediate rounding step after the
1381 * multiplication. The operation is performed according to the
1382 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1383 * The flags argument allows the caller to select negation of the
1384 * addend, the intermediate product, or the final result. (The
1385 * difference between this and having the caller do a separate
1386 * negation is that negating externally will flip the sign bit on
1390 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1391 int flags
, float_status
*s
)
1393 bool inf_zero
, p_sign
;
1394 bool sign_flip
= flags
& float_muladd_negate_result
;
1398 int ab_mask
, abc_mask
;
1400 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1401 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1402 inf_zero
= ab_mask
== float_cmask_infzero
;
1404 /* It is implementation-defined whether the cases of (0,inf,qnan)
1405 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1406 * they return if they do), so we have to hand this information
1407 * off to the target-specific pick-a-NaN routine.
1409 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1410 return *parts_pick_nan_muladd(&a
, &b
, &c
, s
, ab_mask
, abc_mask
);
1414 float_raise(float_flag_invalid
, s
);
1415 parts_default_nan(&a
, s
);
1419 if (flags
& float_muladd_negate_c
) {
1423 p_sign
= a
.sign
^ b
.sign
;
1425 if (flags
& float_muladd_negate_product
) {
1429 if (ab_mask
& float_cmask_inf
) {
1430 p_class
= float_class_inf
;
1431 } else if (ab_mask
& float_cmask_zero
) {
1432 p_class
= float_class_zero
;
1434 p_class
= float_class_normal
;
1437 if (c
.cls
== float_class_inf
) {
1438 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1439 float_raise(float_flag_invalid
, s
);
1440 parts_default_nan(&c
, s
);
1442 c
.sign
^= sign_flip
;
1447 if (p_class
== float_class_inf
) {
1448 a
.cls
= float_class_inf
;
1449 a
.sign
= p_sign
^ sign_flip
;
1453 if (p_class
== float_class_zero
) {
1454 if (c
.cls
== float_class_zero
) {
1455 if (p_sign
!= c
.sign
) {
1456 p_sign
= s
->float_rounding_mode
== float_round_down
;
1459 } else if (flags
& float_muladd_halve_result
) {
1462 c
.sign
^= sign_flip
;
1466 /* a & b should be normals now... */
1467 assert(a
.cls
== float_class_normal
&&
1468 b
.cls
== float_class_normal
);
1470 p_exp
= a
.exp
+ b
.exp
;
1472 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1474 /* Renormalize to the msb. */
1475 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1478 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1482 if (c
.cls
!= float_class_zero
) {
1483 int exp_diff
= p_exp
- c
.exp
;
1484 if (p_sign
== c
.sign
) {
1486 if (exp_diff
<= 0) {
1487 shift64RightJamming(hi
, -exp_diff
, &hi
);
1489 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1490 shift64RightJamming(hi
, 1, &hi
);
1491 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1495 uint64_t c_hi
, c_lo
, over
;
1496 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1497 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1499 shift64RightJamming(hi
, 1, &hi
);
1500 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1506 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1508 if (exp_diff
<= 0) {
1509 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1512 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1513 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1515 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1520 shift128RightJamming(c_hi
, c_lo
,
1523 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1526 if (hi
== 0 && lo
== 0) {
1527 a
.cls
= float_class_zero
;
1528 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1529 a
.sign
^= sign_flip
;
1536 shift
= clz64(lo
) + 64;
1538 /* Normalizing to a binary point of 124 is the
1539 correct adjust for the exponent. However since we're
1540 shifting, we might as well put the binary point back
1541 at 63 where we really want it. Therefore shift as
1542 if we're leaving 1 bit at the top of the word, but
1543 adjust the exponent as if we're leaving 3 bits. */
1544 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1551 if (flags
& float_muladd_halve_result
) {
1555 /* finally prepare our result */
1556 a
.cls
= float_class_normal
;
1557 a
.sign
= p_sign
^ sign_flip
;
1564 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1565 int flags
, float_status
*status
)
1567 FloatParts64 pa
, pb
, pc
, pr
;
1569 float16_unpack_canonical(&pa
, a
, status
);
1570 float16_unpack_canonical(&pb
, b
, status
);
1571 float16_unpack_canonical(&pc
, c
, status
);
1572 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1574 return float16_round_pack_canonical(&pr
, status
);
1577 static float32 QEMU_SOFTFLOAT_ATTR
1578 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1579 float_status
*status
)
1581 FloatParts64 pa
, pb
, pc
, pr
;
1583 float32_unpack_canonical(&pa
, a
, status
);
1584 float32_unpack_canonical(&pb
, b
, status
);
1585 float32_unpack_canonical(&pc
, c
, status
);
1586 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1588 return float32_round_pack_canonical(&pr
, status
);
1591 static float64 QEMU_SOFTFLOAT_ATTR
1592 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1593 float_status
*status
)
1595 FloatParts64 pa
, pb
, pc
, pr
;
1597 float64_unpack_canonical(&pa
, a
, status
);
1598 float64_unpack_canonical(&pb
, b
, status
);
1599 float64_unpack_canonical(&pc
, c
, status
);
1600 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1602 return float64_round_pack_canonical(&pr
, status
);
/* When set, the muladd fast paths below skip the host-FPU route and
 * always fall back to the softfloat implementation. */
static bool force_soft_fma;
1607 float32 QEMU_FLATTEN
1608 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1610 union_float32 ua
, ub
, uc
, ur
;
1616 if (unlikely(!can_use_fpu(s
))) {
1619 if (unlikely(flags
& float_muladd_halve_result
)) {
1623 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1624 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1628 if (unlikely(force_soft_fma
)) {
1633 * When (a || b) == 0, there's no need to check for under/over flow,
1634 * since we know the addend is (normal || 0) and the product is 0.
1636 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1640 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1641 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1642 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1644 if (flags
& float_muladd_negate_c
) {
1649 union_float32 ua_orig
= ua
;
1650 union_float32 uc_orig
= uc
;
1652 if (flags
& float_muladd_negate_product
) {
1655 if (flags
& float_muladd_negate_c
) {
1659 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1661 if (unlikely(f32_is_inf(ur
))) {
1662 float_raise(float_flag_overflow
, s
);
1663 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1669 if (flags
& float_muladd_negate_result
) {
1670 return float32_chs(ur
.s
);
1675 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1678 float64 QEMU_FLATTEN
1679 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1681 union_float64 ua
, ub
, uc
, ur
;
1687 if (unlikely(!can_use_fpu(s
))) {
1690 if (unlikely(flags
& float_muladd_halve_result
)) {
1694 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1695 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1699 if (unlikely(force_soft_fma
)) {
1704 * When (a || b) == 0, there's no need to check for under/over flow,
1705 * since we know the addend is (normal || 0) and the product is 0.
1707 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1711 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1712 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1713 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1715 if (flags
& float_muladd_negate_c
) {
1720 union_float64 ua_orig
= ua
;
1721 union_float64 uc_orig
= uc
;
1723 if (flags
& float_muladd_negate_product
) {
1726 if (flags
& float_muladd_negate_c
) {
1730 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1732 if (unlikely(f64_is_inf(ur
))) {
1733 float_raise(float_flag_overflow
, s
);
1734 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1740 if (flags
& float_muladd_negate_result
) {
1741 return float64_chs(ur
.s
);
1746 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1750 * Returns the result of multiplying the bfloat16 values `a'
1751 * and `b' then adding 'c', with no intermediate rounding step after the
1755 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1756 int flags
, float_status
*status
)
1758 FloatParts64 pa
, pb
, pc
, pr
;
1760 bfloat16_unpack_canonical(&pa
, a
, status
);
1761 bfloat16_unpack_canonical(&pb
, b
, status
);
1762 bfloat16_unpack_canonical(&pc
, c
, status
);
1763 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1765 return bfloat16_round_pack_canonical(&pr
, status
);
1769 * Returns the result of dividing the floating-point value `a' by the
1770 * corresponding value `b'. The operation is performed according to
1771 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1774 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1776 bool sign
= a
.sign
^ b
.sign
;
1778 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1779 uint64_t n0
, n1
, q
, r
;
1780 int exp
= a
.exp
- b
.exp
;
1783 * We want a 2*N / N-bit division to produce exactly an N-bit
1784 * result, so that we do not lose any precision and so that we
1785 * do not have to renormalize afterward. If A.frac < B.frac,
1786 * then division would produce an (N-1)-bit result; shift A left
1787 * by one to produce the an N-bit result, and decrement the
1788 * exponent to match.
1790 * The udiv_qrnnd algorithm that we're using requires normalization,
1791 * i.e. the msb of the denominator must be set, which is already true.
1793 if (a
.frac
< b
.frac
) {
1795 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1797 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1799 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1801 /* Set lsb if there is a remainder, to set inexact. */
1802 a
.frac
= q
| (r
!= 0);
1807 /* handle all the NaN cases */
1808 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1809 return *parts_pick_nan(&a
, &b
, s
);
1811 /* 0/0 or Inf/Inf */
1814 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1815 float_raise(float_flag_invalid
, s
);
1816 parts_default_nan(&a
, s
);
1819 /* Inf / x or 0 / x */
1820 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1825 if (b
.cls
== float_class_zero
) {
1826 float_raise(float_flag_divbyzero
, s
);
1827 a
.cls
= float_class_inf
;
1832 if (b
.cls
== float_class_inf
) {
1833 a
.cls
= float_class_zero
;
1837 g_assert_not_reached();
1840 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1842 FloatParts64 pa
, pb
, pr
;
1844 float16_unpack_canonical(&pa
, a
, status
);
1845 float16_unpack_canonical(&pb
, b
, status
);
1846 pr
= div_floats(pa
, pb
, status
);
1848 return float16_round_pack_canonical(&pr
, status
);
1851 static float32 QEMU_SOFTFLOAT_ATTR
1852 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1854 FloatParts64 pa
, pb
, pr
;
1856 float32_unpack_canonical(&pa
, a
, status
);
1857 float32_unpack_canonical(&pb
, b
, status
);
1858 pr
= div_floats(pa
, pb
, status
);
1860 return float32_round_pack_canonical(&pr
, status
);
1863 static float64 QEMU_SOFTFLOAT_ATTR
1864 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1866 FloatParts64 pa
, pb
, pr
;
1868 float64_unpack_canonical(&pa
, a
, status
);
1869 float64_unpack_canonical(&pb
, b
, status
);
1870 pr
= div_floats(pa
, pb
, status
);
1872 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU divide primitives for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1885 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1887 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1888 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1889 fpclassify(b
.h
) == FP_NORMAL
;
1891 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1894 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1896 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1897 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1898 fpclassify(b
.h
) == FP_NORMAL
;
1900 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1903 static bool f32_div_post(union_float32 a
, union_float32 b
)
1905 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1906 return fpclassify(a
.h
) != FP_ZERO
;
1908 return !float32_is_zero(a
.s
);
1911 static bool f64_div_post(union_float64 a
, union_float64 b
)
1913 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1914 return fpclassify(a
.h
) != FP_ZERO
;
1916 return !float64_is_zero(a
.s
);
1919 float32 QEMU_FLATTEN
1920 float32_div(float32 a
, float32 b
, float_status
*s
)
1922 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1923 f32_div_pre
, f32_div_post
);
1926 float64 QEMU_FLATTEN
1927 float64_div(float64 a
, float64 b
, float_status
*s
)
1929 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1930 f64_div_pre
, f64_div_post
);
1934 * Returns the result of dividing the bfloat16
1935 * value `a' by the corresponding value `b'.
1938 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1940 FloatParts64 pa
, pb
, pr
;
1942 bfloat16_unpack_canonical(&pa
, a
, status
);
1943 bfloat16_unpack_canonical(&pb
, b
, status
);
1944 pr
= div_floats(pa
, pb
, status
);
1946 return bfloat16_round_pack_canonical(&pr
, status
);
1950 * Float to Float conversions
1952 * Returns the result of converting one float format to another. The
1953 * conversion is performed according to the IEC/IEEE Standard for
1954 * Binary Floating-Point Arithmetic.
1956 * The float_to_float helper only needs to take care of raising
1957 * invalid exceptions and handling the conversion on NaNs.
1960 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1963 if (dstf
->arm_althp
) {
1965 case float_class_qnan
:
1966 case float_class_snan
:
1967 /* There is no NaN in the destination format. Raise Invalid
1968 * and return a zero with the sign of the input NaN.
1970 float_raise(float_flag_invalid
, s
);
1971 a
.cls
= float_class_zero
;
1976 case float_class_inf
:
1977 /* There is no Inf in the destination format. Raise Invalid
1978 * and return the maximum normal with the correct sign.
1980 float_raise(float_flag_invalid
, s
);
1981 a
.cls
= float_class_normal
;
1982 a
.exp
= dstf
->exp_max
;
1983 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1989 } else if (is_nan(a
.cls
)) {
1990 parts_return_nan(&a
, s
);
1995 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1997 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1998 FloatParts64 pa
, pr
;
2000 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2001 pr
= float_to_float(pa
, &float32_params
, s
);
2002 return float32_round_pack_canonical(&pr
, s
);
2005 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
2007 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2008 FloatParts64 pa
, pr
;
2010 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
2011 pr
= float_to_float(pa
, &float64_params
, s
);
2012 return float64_round_pack_canonical(&pr
, s
);
2015 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
2017 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2018 FloatParts64 pa
, pr
;
2020 float32_unpack_canonical(&pa
, a
, s
);
2021 pr
= float_to_float(pa
, fmt16
, s
);
2022 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2025 static float64 QEMU_SOFTFLOAT_ATTR
2026 soft_float32_to_float64(float32 a
, float_status
*s
)
2028 FloatParts64 pa
, pr
;
2030 float32_unpack_canonical(&pa
, a
, s
);
2031 pr
= float_to_float(pa
, &float64_params
, s
);
2032 return float64_round_pack_canonical(&pr
, s
);
2035 float64
float32_to_float64(float32 a
, float_status
*s
)
2037 if (likely(float32_is_normal(a
))) {
2038 /* Widening conversion can never produce inexact results. */
2044 } else if (float32_is_zero(a
)) {
2045 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2047 return soft_float32_to_float64(a
, s
);
2051 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2053 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2054 FloatParts64 pa
, pr
;
2056 float64_unpack_canonical(&pa
, a
, s
);
2057 pr
= float_to_float(pa
, fmt16
, s
);
2058 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2061 float32
float64_to_float32(float64 a
, float_status
*s
)
2063 FloatParts64 pa
, pr
;
2065 float64_unpack_canonical(&pa
, a
, s
);
2066 pr
= float_to_float(pa
, &float32_params
, s
);
2067 return float32_round_pack_canonical(&pr
, s
);
2070 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2072 FloatParts64 pa
, pr
;
2074 bfloat16_unpack_canonical(&pa
, a
, s
);
2075 pr
= float_to_float(pa
, &float32_params
, s
);
2076 return float32_round_pack_canonical(&pr
, s
);
2079 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2081 FloatParts64 pa
, pr
;
2083 bfloat16_unpack_canonical(&pa
, a
, s
);
2084 pr
= float_to_float(pa
, &float64_params
, s
);
2085 return float64_round_pack_canonical(&pr
, s
);
2088 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2090 FloatParts64 pa
, pr
;
2092 float32_unpack_canonical(&pa
, a
, s
);
2093 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2094 return bfloat16_round_pack_canonical(&pr
, s
);
2097 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2099 FloatParts64 pa
, pr
;
2101 float64_unpack_canonical(&pa
, a
, s
);
2102 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2103 return bfloat16_round_pack_canonical(&pr
, s
);
2107 * Rounds the floating-point value `a' to an integer, and returns the
2108 * result as a floating-point value. The operation is performed
2109 * according to the IEC/IEEE Standard for Binary Floating-Point
2113 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2114 int scale
, float_status
*s
)
2117 case float_class_qnan
:
2118 case float_class_snan
:
2119 parts_return_nan(&a
, s
);
2122 case float_class_zero
:
2123 case float_class_inf
:
2124 /* already "integral" */
2127 case float_class_normal
:
2128 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2131 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2132 /* already integral */
2137 /* all fractional */
2138 float_raise(float_flag_inexact
, s
);
2140 case float_round_nearest_even
:
2141 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2143 case float_round_ties_away
:
2144 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2146 case float_round_to_zero
:
2149 case float_round_up
:
2152 case float_round_down
:
2155 case float_round_to_odd
:
2159 g_assert_not_reached();
2163 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2166 a
.cls
= float_class_zero
;
2169 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2170 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2171 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2172 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2176 case float_round_nearest_even
:
2177 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2179 case float_round_ties_away
:
2182 case float_round_to_zero
:
2185 case float_round_up
:
2186 inc
= a
.sign
? 0 : rnd_mask
;
2188 case float_round_down
:
2189 inc
= a
.sign
? rnd_mask
: 0;
2191 case float_round_to_odd
:
2192 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2195 g_assert_not_reached();
2198 if (a
.frac
& rnd_mask
) {
2199 float_raise(float_flag_inexact
, s
);
2200 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2202 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2205 a
.frac
&= ~rnd_mask
;
2210 g_assert_not_reached();
2215 float16
float16_round_to_int(float16 a
, float_status
*s
)
2217 FloatParts64 pa
, pr
;
2219 float16_unpack_canonical(&pa
, a
, s
);
2220 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2221 return float16_round_pack_canonical(&pr
, s
);
2224 float32
float32_round_to_int(float32 a
, float_status
*s
)
2226 FloatParts64 pa
, pr
;
2228 float32_unpack_canonical(&pa
, a
, s
);
2229 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2230 return float32_round_pack_canonical(&pr
, s
);
2233 float64
float64_round_to_int(float64 a
, float_status
*s
)
2235 FloatParts64 pa
, pr
;
2237 float64_unpack_canonical(&pa
, a
, s
);
2238 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2239 return float64_round_pack_canonical(&pr
, s
);
2243 * Rounds the bfloat16 value `a' to an integer, and returns the
2244 * result as a bfloat16 value.
2247 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2249 FloatParts64 pa
, pr
;
2251 bfloat16_unpack_canonical(&pa
, a
, s
);
2252 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2253 return bfloat16_round_pack_canonical(&pr
, s
);
2257 * Returns the result of converting the floating-point value `a' to
2258 * the two's complement integer format. The conversion is performed
2259 * according to the IEC/IEEE Standard for Binary Floating-Point
2260 * Arithmetic---which means in particular that the conversion is
2261 * rounded according to the current rounding mode. If `a' is a NaN,
2262 * the largest positive integer is returned. Otherwise, if the
2263 * conversion overflows, the largest integer with the same sign as `a'
2267 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2268 int scale
, int64_t min
, int64_t max
,
2272 int orig_flags
= get_float_exception_flags(s
);
2273 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2276 case float_class_snan
:
2277 case float_class_qnan
:
2278 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2280 case float_class_inf
:
2281 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2282 return p
.sign
? min
: max
;
2283 case float_class_zero
:
2285 case float_class_normal
:
2286 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2287 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2292 if (r
<= -(uint64_t) min
) {
2295 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2302 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2307 g_assert_not_reached();
2311 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2316 float16_unpack_canonical(&p
, a
, s
);
2317 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2320 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2325 float16_unpack_canonical(&p
, a
, s
);
2326 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2329 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2334 float16_unpack_canonical(&p
, a
, s
);
2335 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2338 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2343 float16_unpack_canonical(&p
, a
, s
);
2344 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2347 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2352 float32_unpack_canonical(&p
, a
, s
);
2353 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2356 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2361 float32_unpack_canonical(&p
, a
, s
);
2362 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2365 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2370 float32_unpack_canonical(&p
, a
, s
);
2371 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2374 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2379 float64_unpack_canonical(&p
, a
, s
);
2380 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2383 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2388 float64_unpack_canonical(&p
, a
, s
);
2389 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2392 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2397 float64_unpack_canonical(&p
, a
, s
);
2398 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2401 int8_t float16_to_int8(float16 a
, float_status
*s
)
2403 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2406 int16_t float16_to_int16(float16 a
, float_status
*s
)
2408 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2411 int32_t float16_to_int32(float16 a
, float_status
*s
)
2413 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2416 int64_t float16_to_int64(float16 a
, float_status
*s
)
2418 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2421 int16_t float32_to_int16(float32 a
, float_status
*s
)
2423 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2426 int32_t float32_to_int32(float32 a
, float_status
*s
)
2428 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2431 int64_t float32_to_int64(float32 a
, float_status
*s
)
2433 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2436 int16_t float64_to_int16(float64 a
, float_status
*s
)
2438 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2441 int32_t float64_to_int32(float64 a
, float_status
*s
)
2443 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2446 int64_t float64_to_int64(float64 a
, float_status
*s
)
2448 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2451 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2453 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2456 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2458 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2461 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2463 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2466 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2468 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2471 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2473 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2476 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2478 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2481 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2483 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2486 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2488 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2491 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2493 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2497 * Returns the result of converting the floating-point value `a' to
2498 * the two's complement integer format.
2501 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2506 bfloat16_unpack_canonical(&p
, a
, s
);
2507 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2510 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2515 bfloat16_unpack_canonical(&p
, a
, s
);
2516 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2519 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2524 bfloat16_unpack_canonical(&p
, a
, s
);
2525 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2528 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2530 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2533 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2535 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2538 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2540 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2543 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2545 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2548 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2550 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2553 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2555 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /*
     * Snapshot the flags so round_to_int()'s inexact can be combined
     * with (rather than clobbered by) an invalid raised below.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN: invalid, saturate to the type maximum. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        /* -inf -> 0, +inf -> max; either way the conversion is invalid. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /*
         * A negative value that survived rounding cannot be represented
         * unsigned: invalid, result 0. (Negatives that rounded to zero
         * were handled by the float_class_zero case above.)
         */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude too large for even a 64-bit fraction shift. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
2616 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2621 float16_unpack_canonical(&p
, a
, s
);
2622 return round_to_uint_and_pack(p
, rmode
, scale
, UINT8_MAX
, s
);
2625 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2630 float16_unpack_canonical(&p
, a
, s
);
2631 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2634 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2639 float16_unpack_canonical(&p
, a
, s
);
2640 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2643 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2648 float16_unpack_canonical(&p
, a
, s
);
2649 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2652 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2657 float32_unpack_canonical(&p
, a
, s
);
2658 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2661 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2666 float32_unpack_canonical(&p
, a
, s
);
2667 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2670 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2675 float32_unpack_canonical(&p
, a
, s
);
2676 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2679 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2684 float64_unpack_canonical(&p
, a
, s
);
2685 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2688 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2693 float64_unpack_canonical(&p
, a
, s
);
2694 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2697 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2702 float64_unpack_canonical(&p
, a
, s
);
2703 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2706 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2708 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2711 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2713 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2716 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2718 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2721 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2723 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2726 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2728 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2731 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2733 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2736 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2738 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2741 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2743 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2746 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2748 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2751 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2753 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2756 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2758 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2761 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2763 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2766 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2768 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2771 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2773 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2776 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2778 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2781 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2783 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2786 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2788 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2791 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2793 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2796 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2798 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2802 * Returns the result of converting the bfloat16 value `a' to
2803 * the unsigned integer format.
2806 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2807 int scale
, float_status
*s
)
2811 bfloat16_unpack_canonical(&p
, a
, s
);
2812 return round_to_uint_and_pack(p
, rmode
, scale
, UINT16_MAX
, s
);
2815 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2816 int scale
, float_status
*s
)
2820 bfloat16_unpack_canonical(&p
, a
, s
);
2821 return round_to_uint_and_pack(p
, rmode
, scale
, UINT32_MAX
, s
);
2824 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2825 int scale
, float_status
*s
)
2829 bfloat16_unpack_canonical(&p
, a
, s
);
2830 return round_to_uint_and_pack(p
, rmode
, scale
, UINT64_MAX
, s
);
2833 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2835 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2838 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2840 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2843 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2845 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2848 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2850 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2853 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2855 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2858 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2860 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Work on the magnitude; unsigned negation is well defined. */
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        /* Normalize so the MSB becomes the implicit integer bit. */
        shift = clz64(f);
        /*
         * The largest float type (even though not supported by
         * FloatParts64) is float128 with a 15-bit exponent; clamping
         * the scale to 17 bits still allows rounding to infinity
         * without overflowing the int32_t exponent field.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
2896 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2898 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2899 return float16_round_pack_canonical(&pa
, status
);
2902 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2904 return int64_to_float16_scalbn(a
, scale
, status
);
2907 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2909 return int64_to_float16_scalbn(a
, scale
, status
);
2912 float16
int64_to_float16(int64_t a
, float_status
*status
)
2914 return int64_to_float16_scalbn(a
, 0, status
);
2917 float16
int32_to_float16(int32_t a
, float_status
*status
)
2919 return int64_to_float16_scalbn(a
, 0, status
);
2922 float16
int16_to_float16(int16_t a
, float_status
*status
)
2924 return int64_to_float16_scalbn(a
, 0, status
);
2927 float16
int8_to_float16(int8_t a
, float_status
*status
)
2929 return int64_to_float16_scalbn(a
, 0, status
);
2932 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2934 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2935 return float32_round_pack_canonical(&pa
, status
);
2938 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2940 return int64_to_float32_scalbn(a
, scale
, status
);
2943 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2945 return int64_to_float32_scalbn(a
, scale
, status
);
2948 float32
int64_to_float32(int64_t a
, float_status
*status
)
2950 return int64_to_float32_scalbn(a
, 0, status
);
2953 float32
int32_to_float32(int32_t a
, float_status
*status
)
2955 return int64_to_float32_scalbn(a
, 0, status
);
2958 float32
int16_to_float32(int16_t a
, float_status
*status
)
2960 return int64_to_float32_scalbn(a
, 0, status
);
2963 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2965 FloatParts64 pa
= int_to_float(a
, scale
, status
);
2966 return float64_round_pack_canonical(&pa
, status
);
2969 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2971 return int64_to_float64_scalbn(a
, scale
, status
);
2974 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2976 return int64_to_float64_scalbn(a
, scale
, status
);
2979 float64
int64_to_float64(int64_t a
, float_status
*status
)
2981 return int64_to_float64_scalbn(a
, 0, status
);
2984 float64
int32_to_float64(int32_t a
, float_status
*status
)
2986 return int64_to_float64_scalbn(a
, 0, status
);
2989 float64
int16_to_float64(int16_t a
, float_status
*status
)
2991 return int64_to_float64_scalbn(a
, 0, status
);
2995 * Returns the result of converting the two's complement integer `a'
2996 * to the bfloat16 format.
2999 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
3001 FloatParts64 pa
= int_to_float(a
, scale
, status
);
3002 return bfloat16_round_pack_canonical(&pa
, status
);
3005 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
3007 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3010 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
3012 return int64_to_bfloat16_scalbn(a
, scale
, status
);
3015 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
3017 return int64_to_bfloat16_scalbn(a
, 0, status
);
3020 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
3022 return int64_to_bfloat16_scalbn(a
, 0, status
);
3025 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
3027 return int64_to_bfloat16_scalbn(a
, 0, status
);
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale; see the matching comment in int_to_float(). */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        /* Normalize so the MSB becomes the implicit integer bit. */
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
3056 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3058 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3059 return float16_round_pack_canonical(&pa
, status
);
3062 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3064 return uint64_to_float16_scalbn(a
, scale
, status
);
3067 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3069 return uint64_to_float16_scalbn(a
, scale
, status
);
3072 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
3074 return uint64_to_float16_scalbn(a
, 0, status
);
3077 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
3079 return uint64_to_float16_scalbn(a
, 0, status
);
3082 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
3084 return uint64_to_float16_scalbn(a
, 0, status
);
3087 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
3089 return uint64_to_float16_scalbn(a
, 0, status
);
3092 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
3094 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3095 return float32_round_pack_canonical(&pa
, status
);
3098 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
3100 return uint64_to_float32_scalbn(a
, scale
, status
);
3103 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
3105 return uint64_to_float32_scalbn(a
, scale
, status
);
3108 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
3110 return uint64_to_float32_scalbn(a
, 0, status
);
3113 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3115 return uint64_to_float32_scalbn(a
, 0, status
);
3118 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3120 return uint64_to_float32_scalbn(a
, 0, status
);
3123 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3125 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3126 return float64_round_pack_canonical(&pa
, status
);
3129 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3131 return uint64_to_float64_scalbn(a
, scale
, status
);
3134 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3136 return uint64_to_float64_scalbn(a
, scale
, status
);
3139 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3141 return uint64_to_float64_scalbn(a
, 0, status
);
3144 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3146 return uint64_to_float64_scalbn(a
, 0, status
);
3149 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3151 return uint64_to_float64_scalbn(a
, 0, status
);
3155 * Returns the result of converting the unsigned integer `a' to the
3159 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3161 FloatParts64 pa
= uint_to_float(a
, scale
, status
);
3162 return bfloat16_round_pack_canonical(&pa
, status
);
3165 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3167 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3170 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3172 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3175 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3177 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3180 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3182 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3185 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3187 return uint64_to_bfloat16_scalbn(a
, 0, status
);
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and minNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* minnum/maxnum: a quiet NaN loses to a number. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        /*
         * Collapse the class into a comparable exponent so that
         * zero < any normal < infinity, then compare (exp, frac).
         */
        int a_exp, b_exp;

        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison ignores the signs unless equal. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* For negatives the magnitude order is reversed. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: this also orders -0 below +0. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/*
 * Instantiate the public min/max entry points for float16/32/64.
 * Each generated function unpacks both operands, applies
 * minmax_floats() with the given flavor flags, and repacks.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,     \
                                     float_status *s)                  \
{                                                                      \
    FloatParts64 pa, pb, pr;                                           \
    float ## sz ## _unpack_canonical(&pa, a, s);                       \
    float ## sz ## _unpack_canonical(&pb, b, s);                       \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                \
    return float ## sz ## _round_pack_canonical(&pr, s);               \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
/* Same expansion as MINMAX above, specialized for bfloat16. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)    \
{                                                                      \
    FloatParts64 pa, pb, pr;                                           \
    bfloat16_unpack_canonical(&pa, a, s);                              \
    bfloat16_unpack_canonical(&pb, b, s);                              \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                \
    return bfloat16_round_pack_canonical(&pr, s);                      \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
/* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /*
         * A quiet compare raises invalid only for signaling NaNs;
         * a signaling compare raises it for any NaN operand.
         */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal: differing signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare magnitudes, reversing the order if negative. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ?
                float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}
/*
 * Instantiate the softfloat comparison helpers. `attr' selects the
 * function attribute (flatten vs the soft-float noinline attribute).
 */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                              \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)     \
{                                                                      \
    FloatParts64 pa, pb;                                               \
    float ## sz ## _unpack_canonical(&pa, a, s);                       \
    float ## sz ## _unpack_canonical(&pb, b, s);                       \
    return compare_floats(pa, pb, is_quiet, s);                        \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
3404 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3406 return soft_f16_compare(a
, b
, false, s
);
3409 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3411 return soft_f16_compare(a
, b
, true, s
);
/*
 * float32 compare with a host-FPU fast path: the ISO C comparison
 * macros (isgreater/isless/...) never raise exceptions on quiet NaNs,
 * so ordered results can be returned directly; only the unordered
 * case falls back to softfloat to get the flag handling right.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
3443 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3445 return f32_compare(a
, b
, false, s
);
3448 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3450 return f32_compare(a
, b
, true, s
);
/*
 * float64 compare with a host-FPU fast path; mirrors f32_compare().
 * Only the unordered case drops into softfloat for flag handling.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
3482 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3484 return f64_compare(a
, b
, false, s
);
3487 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3489 return f64_compare(a
, b
, true, s
);
3492 static FloatRelation QEMU_FLATTEN
3493 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3495 FloatParts64 pa
, pb
;
3497 bfloat16_unpack_canonical(&pa
, a
, s
);
3498 bfloat16_unpack_canonical(&pb
, b
, s
);
3499 return compare_floats(pa
, pb
, is_quiet
, s
);
3502 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3504 return soft_bf16_compare(a
, b
, false, s
);
3507 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3509 return soft_bf16_compare(a
, b
, true, s
);
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        /* Quieten/propagate NaN input; may raise invalid for SNaN. */
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
3530 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3532 FloatParts64 pa
, pr
;
3534 float16_unpack_canonical(&pa
, a
, status
);
3535 pr
= scalbn_decomposed(pa
, n
, status
);
3536 return float16_round_pack_canonical(&pr
, status
);
3539 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3541 FloatParts64 pa
, pr
;
3543 float32_unpack_canonical(&pa
, a
, status
);
3544 pr
= scalbn_decomposed(pa
, n
, status
);
3545 return float32_round_pack_canonical(&pr
, status
);
3548 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3550 FloatParts64 pa
, pr
;
3552 float64_unpack_canonical(&pa
, a
, status
);
3553 pr
= scalbn_decomposed(pa
, n
, status
);
3554 return float64_round_pack_canonical(&pr
, status
);
3557 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3559 FloatParts64 pa
, pr
;
3561 bfloat16_unpack_canonical(&pa
, a
, status
);
3562 pr
= scalbn_decomposed(pa
, n
, status
);
3563 return bfloat16_round_pack_canonical(&pr
, status
);
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative non-zero: invalid, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
3638 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3640 FloatParts64 pa
, pr
;
3642 float16_unpack_canonical(&pa
, a
, status
);
3643 pr
= sqrt_float(pa
, status
, &float16_params
);
3644 return float16_round_pack_canonical(&pr
, status
);
3647 static float32 QEMU_SOFTFLOAT_ATTR
3648 soft_f32_sqrt(float32 a
, float_status
*status
)
3650 FloatParts64 pa
, pr
;
3652 float32_unpack_canonical(&pa
, a
, status
);
3653 pr
= sqrt_float(pa
, status
, &float32_params
);
3654 return float32_round_pack_canonical(&pr
, status
);
3657 static float64 QEMU_SOFTFLOAT_ATTR
3658 soft_f64_sqrt(float64 a
, float_status
*status
)
3660 FloatParts64 pa
, pr
;
3662 float64_unpack_canonical(&pa
, a
, status
);
3663 pr
= sqrt_float(pa
, status
, &float64_params
);
3664 return float64_round_pack_canonical(&pr
, status
);
3667 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3669 union_float32 ua
, ur
;
3672 if (unlikely(!can_use_fpu(s
))) {
3676 float32_input_flush1(&ua
.s
, s
);
3677 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3678 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3679 fpclassify(ua
.h
) == FP_ZERO
) ||
3683 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3684 float32_is_neg(ua
.s
))) {
3691 return soft_f32_sqrt(ua
.s
, s
);
3694 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3696 union_float64 ua
, ur
;
3699 if (unlikely(!can_use_fpu(s
))) {
3703 float64_input_flush1(&ua
.s
, s
);
3704 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3705 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3706 fpclassify(ua
.h
) == FP_ZERO
) ||
3710 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3711 float64_is_neg(ua
.s
))) {
3718 return soft_f64_sqrt(ua
.s
, s
);
3721 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3723 FloatParts64 pa
, pr
;
3725 bfloat16_unpack_canonical(&pa
, a
, status
);
3726 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3727 return bfloat16_round_pack_canonical(&pr
, status
);
3730 /*----------------------------------------------------------------------------
3731 | The pattern for a default generated NaN.
3732 *----------------------------------------------------------------------------*/
3734 float16
float16_default_nan(float_status
*status
)
3738 parts_default_nan(&p
, status
);
3739 p
.frac
>>= float16_params
.frac_shift
;
3740 return float16_pack_raw(&p
);
3743 float32
float32_default_nan(float_status
*status
)
3747 parts_default_nan(&p
, status
);
3748 p
.frac
>>= float32_params
.frac_shift
;
3749 return float32_pack_raw(&p
);
3752 float64
float64_default_nan(float_status
*status
)
3756 parts_default_nan(&p
, status
);
3757 p
.frac
>>= float64_params
.frac_shift
;
3758 return float64_pack_raw(&p
);
3761 float128
float128_default_nan(float_status
*status
)
3765 parts_default_nan(&p
, status
);
3766 frac_shr(&p
, float128_params
.frac_shift
);
3767 return float128_pack_raw(&p
);
3770 bfloat16
bfloat16_default_nan(float_status
*status
)
3774 parts_default_nan(&p
, status
);
3775 p
.frac
>>= bfloat16_params
.frac_shift
;
3776 return bfloat16_pack_raw(&p
);
3779 /*----------------------------------------------------------------------------
3780 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3781 *----------------------------------------------------------------------------*/
3783 float16
float16_silence_nan(float16 a
, float_status
*status
)
3787 float16_unpack_raw(&p
, a
);
3788 p
.frac
<<= float16_params
.frac_shift
;
3789 parts_silence_nan(&p
, status
);
3790 p
.frac
>>= float16_params
.frac_shift
;
3791 return float16_pack_raw(&p
);
3794 float32
float32_silence_nan(float32 a
, float_status
*status
)
3798 float32_unpack_raw(&p
, a
);
3799 p
.frac
<<= float32_params
.frac_shift
;
3800 parts_silence_nan(&p
, status
);
3801 p
.frac
>>= float32_params
.frac_shift
;
3802 return float32_pack_raw(&p
);
3805 float64
float64_silence_nan(float64 a
, float_status
*status
)
3809 float64_unpack_raw(&p
, a
);
3810 p
.frac
<<= float64_params
.frac_shift
;
3811 parts_silence_nan(&p
, status
);
3812 p
.frac
>>= float64_params
.frac_shift
;
3813 return float64_pack_raw(&p
);
3816 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3820 bfloat16_unpack_raw(&p
, a
);
3821 p
.frac
<<= bfloat16_params
.frac_shift
;
3822 parts_silence_nan(&p
, status
);
3823 p
.frac
>>= bfloat16_params
.frac_shift
;
3824 return bfloat16_pack_raw(&p
);
3827 float128
float128_silence_nan(float128 a
, float_status
*status
)
3831 float128_unpack_raw(&p
, a
);
3832 frac_shl(&p
, float128_params
.frac_shift
);
3833 parts_silence_nan(&p
, status
);
3834 frac_shr(&p
, float128_params
.frac_shift
);
3835 return float128_pack_raw(&p
);
3838 /*----------------------------------------------------------------------------
3839 | If `a' is denormal and we are in flush-to-zero mode then set the
3840 | input-denormal exception and return zero. Otherwise just return the value.
3841 *----------------------------------------------------------------------------*/
3843 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3845 if (p
.exp
== 0 && p
.frac
!= 0) {
3846 float_raise(float_flag_input_denormal
, status
);
3853 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3855 if (status
->flush_inputs_to_zero
) {
3858 float16_unpack_raw(&p
, a
);
3859 if (parts_squash_denormal(p
, status
)) {
3860 return float16_set_sign(float16_zero
, p
.sign
);
3866 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3868 if (status
->flush_inputs_to_zero
) {
3871 float32_unpack_raw(&p
, a
);
3872 if (parts_squash_denormal(p
, status
)) {
3873 return float32_set_sign(float32_zero
, p
.sign
);
3879 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3881 if (status
->flush_inputs_to_zero
) {
3884 float64_unpack_raw(&p
, a
);
3885 if (parts_squash_denormal(p
, status
)) {
3886 return float64_set_sign(float64_zero
, p
.sign
);
3892 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3894 if (status
->flush_inputs_to_zero
) {
3897 bfloat16_unpack_raw(&p
, a
);
3898 if (parts_squash_denormal(p
, status
)) {
3899 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3905 /*----------------------------------------------------------------------------
3906 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3907 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3908 | input. If `zSign' is 1, the input is negated before being converted to an
3909 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3910 | is simply rounded to an integer, with the inexact exception raised if the
3911 | input cannot be represented exactly as an integer. However, if the fixed-
3912 | point input is too large, the invalid exception is raised and the largest
3913 | positive or negative integer is returned.
3914 *----------------------------------------------------------------------------*/
3916 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3917 float_status
*status
)
3919 int8_t roundingMode
;
3920 bool roundNearestEven
;
3921 int8_t roundIncrement
, roundBits
;
3924 roundingMode
= status
->float_rounding_mode
;
3925 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3926 switch (roundingMode
) {
3927 case float_round_nearest_even
:
3928 case float_round_ties_away
:
3929 roundIncrement
= 0x40;
3931 case float_round_to_zero
:
3934 case float_round_up
:
3935 roundIncrement
= zSign
? 0 : 0x7f;
3937 case float_round_down
:
3938 roundIncrement
= zSign
? 0x7f : 0;
3940 case float_round_to_odd
:
3941 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3946 roundBits
= absZ
& 0x7F;
3947 absZ
= ( absZ
+ roundIncrement
)>>7;
3948 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3952 if ( zSign
) z
= - z
;
3953 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3954 float_raise(float_flag_invalid
, status
);
3955 return zSign
? INT32_MIN
: INT32_MAX
;
3958 float_raise(float_flag_inexact
, status
);
3964 /*----------------------------------------------------------------------------
3965 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3966 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3967 | and returns the properly rounded 64-bit integer corresponding to the input.
3968 | If `zSign' is 1, the input is negated before being converted to an integer.
3969 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3970 | the inexact exception raised if the input cannot be represented exactly as
3971 | an integer. However, if the fixed-point input is too large, the invalid
3972 | exception is raised and the largest positive or negative integer is
3974 *----------------------------------------------------------------------------*/
3976 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3977 float_status
*status
)
3979 int8_t roundingMode
;
3980 bool roundNearestEven
, increment
;
3983 roundingMode
= status
->float_rounding_mode
;
3984 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3985 switch (roundingMode
) {
3986 case float_round_nearest_even
:
3987 case float_round_ties_away
:
3988 increment
= ((int64_t) absZ1
< 0);
3990 case float_round_to_zero
:
3993 case float_round_up
:
3994 increment
= !zSign
&& absZ1
;
3996 case float_round_down
:
3997 increment
= zSign
&& absZ1
;
3999 case float_round_to_odd
:
4000 increment
= !(absZ0
& 1) && absZ1
;
4007 if ( absZ0
== 0 ) goto overflow
;
4008 if (!(absZ1
<< 1) && roundNearestEven
) {
4013 if ( zSign
) z
= - z
;
4014 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
4016 float_raise(float_flag_invalid
, status
);
4017 return zSign
? INT64_MIN
: INT64_MAX
;
4020 float_raise(float_flag_inexact
, status
);
4026 /*----------------------------------------------------------------------------
4027 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4028 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4029 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4030 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
4031 | with the inexact exception raised if the input cannot be represented exactly
4032 | as an integer. However, if the fixed-point input is too large, the invalid
4033 | exception is raised and the largest unsigned integer is returned.
4034 *----------------------------------------------------------------------------*/
4036 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
4037 uint64_t absZ1
, float_status
*status
)
4039 int8_t roundingMode
;
4040 bool roundNearestEven
, increment
;
4042 roundingMode
= status
->float_rounding_mode
;
4043 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
4044 switch (roundingMode
) {
4045 case float_round_nearest_even
:
4046 case float_round_ties_away
:
4047 increment
= ((int64_t)absZ1
< 0);
4049 case float_round_to_zero
:
4052 case float_round_up
:
4053 increment
= !zSign
&& absZ1
;
4055 case float_round_down
:
4056 increment
= zSign
&& absZ1
;
4058 case float_round_to_odd
:
4059 increment
= !(absZ0
& 1) && absZ1
;
4067 float_raise(float_flag_invalid
, status
);
4070 if (!(absZ1
<< 1) && roundNearestEven
) {
4075 if (zSign
&& absZ0
) {
4076 float_raise(float_flag_invalid
, status
);
4081 float_raise(float_flag_inexact
, status
);
4086 /*----------------------------------------------------------------------------
4087 | Normalizes the subnormal single-precision floating-point value represented
4088 | by the denormalized significand `aSig'. The normalized exponent and
4089 | significand are stored at the locations pointed to by `zExpPtr' and
4090 | `zSigPtr', respectively.
4091 *----------------------------------------------------------------------------*/
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 23 (clz32 - 8 skips the 8 bits
     * above the float32 significand field). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4104 /*----------------------------------------------------------------------------
4105 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4106 | and significand `zSig', and returns the proper single-precision floating-
4107 | point value corresponding to the abstract input. Ordinarily, the abstract
4108 | value is simply rounded and packed into the single-precision format, with
4109 | the inexact exception raised if the abstract input cannot be represented
4110 | exactly. However, if the abstract value is too large, the overflow and
4111 | inexact exceptions are raised and an infinity or maximal finite value is
4112 | returned. If the abstract value is too small, the input value is rounded to
4113 | a subnormal number, and the underflow and inexact exceptions are raised if
4114 | the abstract input cannot be represented exactly as a subnormal single-
4115 | precision floating-point number.
4116 | The input significand `zSig' has its binary point between bits 30
4117 | and 29, which is 7 bits to the left of the usual location. This shifted
4118 | significand must be normalized or smaller. If `zSig' is not normalized,
4119 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4120 | and it must not require rounding. In the usual case that `zSig' is
4121 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4122 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4123 | Binary Floating-Point Arithmetic.
4124 *----------------------------------------------------------------------------*/
4126 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4127 float_status
*status
)
4129 int8_t roundingMode
;
4130 bool roundNearestEven
;
4131 int8_t roundIncrement
, roundBits
;
4134 roundingMode
= status
->float_rounding_mode
;
4135 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4136 switch (roundingMode
) {
4137 case float_round_nearest_even
:
4138 case float_round_ties_away
:
4139 roundIncrement
= 0x40;
4141 case float_round_to_zero
:
4144 case float_round_up
:
4145 roundIncrement
= zSign
? 0 : 0x7f;
4147 case float_round_down
:
4148 roundIncrement
= zSign
? 0x7f : 0;
4150 case float_round_to_odd
:
4151 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4157 roundBits
= zSig
& 0x7F;
4158 if ( 0xFD <= (uint16_t) zExp
) {
4159 if ( ( 0xFD < zExp
)
4160 || ( ( zExp
== 0xFD )
4161 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4163 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4164 roundIncrement
!= 0;
4165 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4166 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4169 if (status
->flush_to_zero
) {
4170 float_raise(float_flag_output_denormal
, status
);
4171 return packFloat32(zSign
, 0, 0);
4173 isTiny
= status
->tininess_before_rounding
4175 || (zSig
+ roundIncrement
< 0x80000000);
4176 shift32RightJamming( zSig
, - zExp
, &zSig
);
4178 roundBits
= zSig
& 0x7F;
4179 if (isTiny
&& roundBits
) {
4180 float_raise(float_flag_underflow
, status
);
4182 if (roundingMode
== float_round_to_odd
) {
4184 * For round-to-odd case, the roundIncrement depends on
4185 * zSig which just changed.
4187 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4192 float_raise(float_flag_inexact
, status
);
4194 zSig
= ( zSig
+ roundIncrement
)>>7;
4195 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4198 if ( zSig
== 0 ) zExp
= 0;
4199 return packFloat32( zSign
, zExp
, zSig
);
4203 /*----------------------------------------------------------------------------
4204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4205 | and significand `zSig', and returns the proper single-precision floating-
4206 | point value corresponding to the abstract input. This routine is just like
4207 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4208 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4209 | floating-point exponent.
4210 *----------------------------------------------------------------------------*/
4213 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4214 float_status
*status
)
4218 shiftCount
= clz32(zSig
) - 1;
4219 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4224 /*----------------------------------------------------------------------------
4225 | Normalizes the subnormal double-precision floating-point value represented
4226 | by the denormalized significand `aSig'. The normalized exponent and
4227 | significand are stored at the locations pointed to by `zExpPtr' and
4228 | `zSigPtr', respectively.
4229 *----------------------------------------------------------------------------*/
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 52 (clz64 - 11 skips the 11 bits
     * above the float64 significand field). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4242 /*----------------------------------------------------------------------------
4243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4244 | double-precision floating-point value, returning the result. After being
4245 | shifted into the proper positions, the three fields are simply added
4246 | together to form the result. This means that any integer portion of `zSig'
4247 | will be added into the exponent. Since a properly normalized significand
4248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4249 | than the desired result exponent whenever `zSig' is a complete, normalized
4251 *----------------------------------------------------------------------------*/
4253 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4256 return make_float64(
4257 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and significand `zSig', and returns the proper double-precision floating-
4264 | point value corresponding to the abstract input. Ordinarily, the abstract
4265 | value is simply rounded and packed into the double-precision format, with
4266 | the inexact exception raised if the abstract input cannot be represented
4267 | exactly. However, if the abstract value is too large, the overflow and
4268 | inexact exceptions are raised and an infinity or maximal finite value is
4269 | returned. If the abstract value is too small, the input value is rounded to
4270 | a subnormal number, and the underflow and inexact exceptions are raised if
4271 | the abstract input cannot be represented exactly as a subnormal double-
4272 | precision floating-point number.
4273 | The input significand `zSig' has its binary point between bits 62
4274 | and 61, which is 10 bits to the left of the usual location. This shifted
4275 | significand must be normalized or smaller. If `zSig' is not normalized,
4276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4277 | and it must not require rounding. In the usual case that `zSig' is
4278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4279 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4280 | Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4283 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4284 float_status
*status
)
4286 int8_t roundingMode
;
4287 bool roundNearestEven
;
4288 int roundIncrement
, roundBits
;
4291 roundingMode
= status
->float_rounding_mode
;
4292 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4293 switch (roundingMode
) {
4294 case float_round_nearest_even
:
4295 case float_round_ties_away
:
4296 roundIncrement
= 0x200;
4298 case float_round_to_zero
:
4301 case float_round_up
:
4302 roundIncrement
= zSign
? 0 : 0x3ff;
4304 case float_round_down
:
4305 roundIncrement
= zSign
? 0x3ff : 0;
4307 case float_round_to_odd
:
4308 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4313 roundBits
= zSig
& 0x3FF;
4314 if ( 0x7FD <= (uint16_t) zExp
) {
4315 if ( ( 0x7FD < zExp
)
4316 || ( ( zExp
== 0x7FD )
4317 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4319 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4320 roundIncrement
!= 0;
4321 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4322 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4325 if (status
->flush_to_zero
) {
4326 float_raise(float_flag_output_denormal
, status
);
4327 return packFloat64(zSign
, 0, 0);
4329 isTiny
= status
->tininess_before_rounding
4331 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4332 shift64RightJamming( zSig
, - zExp
, &zSig
);
4334 roundBits
= zSig
& 0x3FF;
4335 if (isTiny
&& roundBits
) {
4336 float_raise(float_flag_underflow
, status
);
4338 if (roundingMode
== float_round_to_odd
) {
4340 * For round-to-odd case, the roundIncrement depends on
4341 * zSig which just changed.
4343 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4348 float_raise(float_flag_inexact
, status
);
4350 zSig
= ( zSig
+ roundIncrement
)>>10;
4351 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4354 if ( zSig
== 0 ) zExp
= 0;
4355 return packFloat64( zSign
, zExp
, zSig
);
4359 /*----------------------------------------------------------------------------
4360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4361 | and significand `zSig', and returns the proper double-precision floating-
4362 | point value corresponding to the abstract input. This routine is just like
4363 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4364 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4365 | floating-point exponent.
4366 *----------------------------------------------------------------------------*/
4369 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4370 float_status
*status
)
4374 shiftCount
= clz64(zSig
) - 1;
4375 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4380 /*----------------------------------------------------------------------------
4381 | Normalizes the subnormal extended double-precision floating-point value
4382 | represented by the denormalized significand `aSig'. The normalized exponent
4383 | and significand are stored at the locations pointed to by `zExpPtr' and
4384 | `zSigPtr', respectively.
4385 *----------------------------------------------------------------------------*/
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* floatx80 has an explicit integer bit, so the leading 1 goes all
     * the way up to bit 63. */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
4397 /*----------------------------------------------------------------------------
4398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4399 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4400 | and returns the proper extended double-precision floating-point value
4401 | corresponding to the abstract input. Ordinarily, the abstract value is
4402 | rounded and packed into the extended double-precision format, with the
4403 | inexact exception raised if the abstract input cannot be represented
4404 | exactly. However, if the abstract value is too large, the overflow and
4405 | inexact exceptions are raised and an infinity or maximal finite value is
4406 | returned. If the abstract value is too small, the input value is rounded to
4407 | a subnormal number, and the underflow and inexact exceptions are raised if
4408 | the abstract input cannot be represented exactly as a subnormal extended
4409 | double-precision floating-point number.
4410 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4411 | number of bits as single or double precision, respectively. Otherwise, the
4412 | result is rounded to the full precision of the extended double-precision
4414 | The input significand must be normalized or smaller. If the input
4415 | significand is not normalized, `zExp' must be 0; in that case, the result
4416 | returned is a subnormal number, and it must not require rounding. The
4417 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4418 | Floating-Point Arithmetic.
4419 *----------------------------------------------------------------------------*/
4421 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4422 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4423 float_status
*status
)
4425 int8_t roundingMode
;
4426 bool roundNearestEven
, increment
, isTiny
;
4427 int64_t roundIncrement
, roundMask
, roundBits
;
4429 roundingMode
= status
->float_rounding_mode
;
4430 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4431 if ( roundingPrecision
== 80 ) goto precision80
;
4432 if ( roundingPrecision
== 64 ) {
4433 roundIncrement
= UINT64_C(0x0000000000000400);
4434 roundMask
= UINT64_C(0x00000000000007FF);
4436 else if ( roundingPrecision
== 32 ) {
4437 roundIncrement
= UINT64_C(0x0000008000000000);
4438 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4443 zSig0
|= ( zSig1
!= 0 );
4444 switch (roundingMode
) {
4445 case float_round_nearest_even
:
4446 case float_round_ties_away
:
4448 case float_round_to_zero
:
4451 case float_round_up
:
4452 roundIncrement
= zSign
? 0 : roundMask
;
4454 case float_round_down
:
4455 roundIncrement
= zSign
? roundMask
: 0;
4460 roundBits
= zSig0
& roundMask
;
4461 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4462 if ( ( 0x7FFE < zExp
)
4463 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4468 if (status
->flush_to_zero
) {
4469 float_raise(float_flag_output_denormal
, status
);
4470 return packFloatx80(zSign
, 0, 0);
4472 isTiny
= status
->tininess_before_rounding
4474 || (zSig0
<= zSig0
+ roundIncrement
);
4475 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4477 roundBits
= zSig0
& roundMask
;
4478 if (isTiny
&& roundBits
) {
4479 float_raise(float_flag_underflow
, status
);
4482 float_raise(float_flag_inexact
, status
);
4484 zSig0
+= roundIncrement
;
4485 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4486 roundIncrement
= roundMask
+ 1;
4487 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4488 roundMask
|= roundIncrement
;
4490 zSig0
&= ~ roundMask
;
4491 return packFloatx80( zSign
, zExp
, zSig0
);
4495 float_raise(float_flag_inexact
, status
);
4497 zSig0
+= roundIncrement
;
4498 if ( zSig0
< roundIncrement
) {
4500 zSig0
= UINT64_C(0x8000000000000000);
4502 roundIncrement
= roundMask
+ 1;
4503 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4504 roundMask
|= roundIncrement
;
4506 zSig0
&= ~ roundMask
;
4507 if ( zSig0
== 0 ) zExp
= 0;
4508 return packFloatx80( zSign
, zExp
, zSig0
);
4510 switch (roundingMode
) {
4511 case float_round_nearest_even
:
4512 case float_round_ties_away
:
4513 increment
= ((int64_t)zSig1
< 0);
4515 case float_round_to_zero
:
4518 case float_round_up
:
4519 increment
= !zSign
&& zSig1
;
4521 case float_round_down
:
4522 increment
= zSign
&& zSig1
;
4527 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4528 if ( ( 0x7FFE < zExp
)
4529 || ( ( zExp
== 0x7FFE )
4530 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4536 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4537 if ( ( roundingMode
== float_round_to_zero
)
4538 || ( zSign
&& ( roundingMode
== float_round_up
) )
4539 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4541 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4543 return packFloatx80(zSign
,
4544 floatx80_infinity_high
,
4545 floatx80_infinity_low
);
4548 isTiny
= status
->tininess_before_rounding
4551 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4552 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4554 if (isTiny
&& zSig1
) {
4555 float_raise(float_flag_underflow
, status
);
4558 float_raise(float_flag_inexact
, status
);
4560 switch (roundingMode
) {
4561 case float_round_nearest_even
:
4562 case float_round_ties_away
:
4563 increment
= ((int64_t)zSig1
< 0);
4565 case float_round_to_zero
:
4568 case float_round_up
:
4569 increment
= !zSign
&& zSig1
;
4571 case float_round_down
:
4572 increment
= zSign
&& zSig1
;
4579 if (!(zSig1
<< 1) && roundNearestEven
) {
4582 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4584 return packFloatx80( zSign
, zExp
, zSig0
);
4588 float_raise(float_flag_inexact
, status
);
4594 zSig0
= UINT64_C(0x8000000000000000);
4597 if (!(zSig1
<< 1) && roundNearestEven
) {
4603 if ( zSig0
== 0 ) zExp
= 0;
4605 return packFloatx80( zSign
, zExp
, zSig0
);
4609 /*----------------------------------------------------------------------------
4610 | Takes an abstract floating-point value having sign `zSign', exponent
4611 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4612 | and returns the proper extended double-precision floating-point value
4613 | corresponding to the abstract input. This routine is just like
4614 | `roundAndPackFloatx80' except that the input significand does not have to be
4616 *----------------------------------------------------------------------------*/
4618 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4619 bool zSign
, int32_t zExp
,
4620 uint64_t zSig0
, uint64_t zSig1
,
4621 float_status
*status
)
4630 shiftCount
= clz64(zSig0
);
4631 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4633 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4634 zSig0
, zSig1
, status
);
4638 /*----------------------------------------------------------------------------
4639 | Returns the least-significant 64 fraction bits of the quadruple-precision
4640 | floating-point value `a'.
4641 *----------------------------------------------------------------------------*/
4643 static inline uint64_t extractFloat128Frac1( float128 a
)
4650 /*----------------------------------------------------------------------------
4651 | Returns the most-significant 48 fraction bits of the quadruple-precision
4652 | floating-point value `a'.
4653 *----------------------------------------------------------------------------*/
4655 static inline uint64_t extractFloat128Frac0( float128 a
)
4658 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4662 /*----------------------------------------------------------------------------
4663 | Returns the exponent bits of the quadruple-precision floating-point value
4665 *----------------------------------------------------------------------------*/
4667 static inline int32_t extractFloat128Exp( float128 a
)
4670 return ( a
.high
>>48 ) & 0x7FFF;
4674 /*----------------------------------------------------------------------------
4675 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4676 *----------------------------------------------------------------------------*/
4678 static inline bool extractFloat128Sign(float128 a
)
4680 return a
.high
>> 63;
4683 /*----------------------------------------------------------------------------
4684 | Normalizes the subnormal quadruple-precision floating-point value
4685 | represented by the denormalized significand formed by the concatenation of
4686 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4687 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4688 | significand are stored at the location pointed to by `zSig0Ptr', and the
4689 | least significant 64 bits of the normalized significand are stored at the
4690 | location pointed to by `zSig1Ptr'.
4691 *----------------------------------------------------------------------------*/
static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Leading 1 is somewhere in the low word.  clz64 - 15 places
         * it at bit 48 of the (shifted-up) high word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
4724 /*----------------------------------------------------------------------------
4725 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4726 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4727 | floating-point value, returning the result. After being shifted into the
4728 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4729 | added together to form the most significant 32 bits of the result. This
4730 | means that any integer portion of `zSig0' will be added into the exponent.
4731 | Since a properly normalized significand will have an integer portion equal
4732 | to 1, the `zExp' input should be 1 less than the desired result exponent
4733 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4735 *----------------------------------------------------------------------------*/
4737 static inline float128
4738 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4743 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4747 /*----------------------------------------------------------------------------
4748 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4749 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4750 | and `zSig2', and returns the proper quadruple-precision floating-point value
4751 | corresponding to the abstract input. Ordinarily, the abstract value is
4752 | simply rounded and packed into the quadruple-precision format, with the
4753 | inexact exception raised if the abstract input cannot be represented
4754 | exactly. However, if the abstract value is too large, the overflow and
4755 | inexact exceptions are raised and an infinity or maximal finite value is
4756 | returned. If the abstract value is too small, the input value is rounded to
4757 | a subnormal number, and the underflow and inexact exceptions are raised if
4758 | the abstract input cannot be represented exactly as a subnormal quadruple-
4759 | precision floating-point number.
4760 | The input significand must be normalized or smaller. If the input
4761 | significand is not normalized, `zExp' must be 0; in that case, the result
4762 | returned is a subnormal number, and it must not require rounding. In the
4763 | usual case that the input significand is normalized, `zExp' must be 1 less
4764 | than the ``true'' floating-point exponent. The handling of underflow and
4765 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4766 *----------------------------------------------------------------------------*/
4768 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4769 uint64_t zSig0
, uint64_t zSig1
,
4770 uint64_t zSig2
, float_status
*status
)
4772 int8_t roundingMode
;
4773 bool roundNearestEven
, increment
, isTiny
;
4775 roundingMode
= status
->float_rounding_mode
;
4776 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4777 switch (roundingMode
) {
4778 case float_round_nearest_even
:
4779 case float_round_ties_away
:
4780 increment
= ((int64_t)zSig2
< 0);
4782 case float_round_to_zero
:
4785 case float_round_up
:
4786 increment
= !zSign
&& zSig2
;
4788 case float_round_down
:
4789 increment
= zSign
&& zSig2
;
4791 case float_round_to_odd
:
4792 increment
= !(zSig1
& 0x1) && zSig2
;
4797 if ( 0x7FFD <= (uint32_t) zExp
) {
4798 if ( ( 0x7FFD < zExp
)
4799 || ( ( zExp
== 0x7FFD )
4801 UINT64_C(0x0001FFFFFFFFFFFF),
4802 UINT64_C(0xFFFFFFFFFFFFFFFF),
4809 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4810 if ( ( roundingMode
== float_round_to_zero
)
4811 || ( zSign
&& ( roundingMode
== float_round_up
) )
4812 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4813 || (roundingMode
== float_round_to_odd
)
4819 UINT64_C(0x0000FFFFFFFFFFFF),
4820 UINT64_C(0xFFFFFFFFFFFFFFFF)
4823 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4826 if (status
->flush_to_zero
) {
4827 float_raise(float_flag_output_denormal
, status
);
4828 return packFloat128(zSign
, 0, 0, 0);
4830 isTiny
= status
->tininess_before_rounding
4833 || lt128(zSig0
, zSig1
,
4834 UINT64_C(0x0001FFFFFFFFFFFF),
4835 UINT64_C(0xFFFFFFFFFFFFFFFF));
4836 shift128ExtraRightJamming(
4837 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4839 if (isTiny
&& zSig2
) {
4840 float_raise(float_flag_underflow
, status
);
4842 switch (roundingMode
) {
4843 case float_round_nearest_even
:
4844 case float_round_ties_away
:
4845 increment
= ((int64_t)zSig2
< 0);
4847 case float_round_to_zero
:
4850 case float_round_up
:
4851 increment
= !zSign
&& zSig2
;
4853 case float_round_down
:
4854 increment
= zSign
&& zSig2
;
4856 case float_round_to_odd
:
4857 increment
= !(zSig1
& 0x1) && zSig2
;
4865 float_raise(float_flag_inexact
, status
);
4868 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4869 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4874 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4876 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4880 /*----------------------------------------------------------------------------
4881 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4882 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4883 | returns the proper quadruple-precision floating-point value corresponding
4884 | to the abstract input. This routine is just like `roundAndPackFloat128'
4885 | except that the input significand has fewer bits and does not have to be
4886 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4888 *----------------------------------------------------------------------------*/
4890 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4891 uint64_t zSig0
, uint64_t zSig1
,
4892 float_status
*status
)
4902 shiftCount
= clz64(zSig0
) - 15;
4903 if ( 0 <= shiftCount
) {
4905 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4908 shift128ExtraRightJamming(
4909 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4912 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of converting the 32-bit two's complement integer `a'
4919 | to the extended double-precision floating-point format. The conversion
4920 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922 *----------------------------------------------------------------------------*/
4924 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4931 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4933 absA
= zSign
? - a
: a
;
4934 shiftCount
= clz32(absA
) + 32;
4936 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the 32-bit two's complement integer `a' to
4942 | the quadruple-precision floating-point format. The conversion is performed
4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4946 float128
int32_to_float128(int32_t a
, float_status
*status
)
4953 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4955 absA
= zSign
? - a
: a
;
4956 shiftCount
= clz32(absA
) + 17;
4958 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4962 /*----------------------------------------------------------------------------
4963 | Returns the result of converting the 64-bit two's complement integer `a'
4964 | to the extended double-precision floating-point format. The conversion
4965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4967 *----------------------------------------------------------------------------*/
4969 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4975 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4977 absA
= zSign
? - a
: a
;
4978 shiftCount
= clz64(absA
);
4979 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the 64-bit two's complement integer `a' to
4985 | the quadruple-precision floating-point format. The conversion is performed
4986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4987 *----------------------------------------------------------------------------*/
4989 float128
int64_to_float128(int64_t a
, float_status
*status
)
4995 uint64_t zSig0
, zSig1
;
4997 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4999 absA
= zSign
? - a
: a
;
5000 shiftCount
= clz64(absA
) + 49;
5001 zExp
= 0x406E - shiftCount
;
5002 if ( 64 <= shiftCount
) {
5011 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
5012 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
5016 /*----------------------------------------------------------------------------
5017 | Returns the result of converting the 64-bit unsigned integer `a'
5018 | to the quadruple-precision floating-point format. The conversion is performed
5019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 *----------------------------------------------------------------------------*/
5022 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
5025 return float128_zero
;
5027 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the single-precision floating-point value
5032 | `a' to the extended double-precision floating-point format. The conversion
5033 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5035 *----------------------------------------------------------------------------*/
5037 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5043 a
= float32_squash_input_denormal(a
, status
);
5044 aSig
= extractFloat32Frac( a
);
5045 aExp
= extractFloat32Exp( a
);
5046 aSign
= extractFloat32Sign( a
);
5047 if ( aExp
== 0xFF ) {
5049 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5051 return floatx80_silence_nan(res
, status
);
5053 return packFloatx80(aSign
,
5054 floatx80_infinity_high
,
5055 floatx80_infinity_low
);
5058 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5059 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5062 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5066 /*----------------------------------------------------------------------------
5067 | Returns the result of converting the single-precision floating-point value
5068 | `a' to the quadruple-precision floating-point format.  The conversion is
5069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5071 *----------------------------------------------------------------------------*/
5073 float128
float32_to_float128(float32 a
, float_status
*status
)
5079 a
= float32_squash_input_denormal(a
, status
);
5080 aSig
= extractFloat32Frac( a
);
5081 aExp
= extractFloat32Exp( a
);
5082 aSign
= extractFloat32Sign( a
);
5083 if ( aExp
== 0xFF ) {
5085 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5087 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5090 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5091 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5094 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5098 /*----------------------------------------------------------------------------
5099 | Returns the remainder of the single-precision floating-point value `a'
5100 | with respect to the corresponding value `b'. The operation is performed
5101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 *----------------------------------------------------------------------------*/
5104 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5107 int aExp
, bExp
, expDiff
;
5108 uint32_t aSig
, bSig
;
5110 uint64_t aSig64
, bSig64
, q64
;
5111 uint32_t alternateASig
;
5113 a
= float32_squash_input_denormal(a
, status
);
5114 b
= float32_squash_input_denormal(b
, status
);
5116 aSig
= extractFloat32Frac( a
);
5117 aExp
= extractFloat32Exp( a
);
5118 aSign
= extractFloat32Sign( a
);
5119 bSig
= extractFloat32Frac( b
);
5120 bExp
= extractFloat32Exp( b
);
5121 if ( aExp
== 0xFF ) {
5122 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5123 return propagateFloat32NaN(a
, b
, status
);
5125 float_raise(float_flag_invalid
, status
);
5126 return float32_default_nan(status
);
5128 if ( bExp
== 0xFF ) {
5130 return propagateFloat32NaN(a
, b
, status
);
5136 float_raise(float_flag_invalid
, status
);
5137 return float32_default_nan(status
);
5139 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5142 if ( aSig
== 0 ) return a
;
5143 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5145 expDiff
= aExp
- bExp
;
5148 if ( expDiff
< 32 ) {
5151 if ( expDiff
< 0 ) {
5152 if ( expDiff
< -1 ) return a
;
5155 q
= ( bSig
<= aSig
);
5156 if ( q
) aSig
-= bSig
;
5157 if ( 0 < expDiff
) {
5158 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5161 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5169 if ( bSig
<= aSig
) aSig
-= bSig
;
5170 aSig64
= ( (uint64_t) aSig
)<<40;
5171 bSig64
= ( (uint64_t) bSig
)<<40;
5173 while ( 0 < expDiff
) {
5174 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5175 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5176 aSig64
= - ( ( bSig
* q64
)<<38 );
5180 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5181 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5182 q
= q64
>>( 64 - expDiff
);
5184 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5187 alternateASig
= aSig
;
5190 } while ( 0 <= (int32_t) aSig
);
5191 sigMean
= aSig
+ alternateASig
;
5192 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5193 aSig
= alternateASig
;
5195 zSign
= ( (int32_t) aSig
< 0 );
5196 if ( zSign
) aSig
= - aSig
;
5197 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5202 /*----------------------------------------------------------------------------
5203 | Returns the binary exponential of the single-precision floating-point value
5204 | `a'. The operation is performed according to the IEC/IEEE Standard for
5205 | Binary Floating-Point Arithmetic.
5207 | Uses the following identities:
5209 | 1. -------------------------------------------------------------------------
5213 | 2. -------------------------------------------------------------------------
5216 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5218 *----------------------------------------------------------------------------*/
5220 static const float64 float32_exp2_coefficients
[15] =
5222 const_float64( 0x3ff0000000000000ll
), /* 1 */
5223 const_float64( 0x3fe0000000000000ll
), /* 2 */
5224 const_float64( 0x3fc5555555555555ll
), /* 3 */
5225 const_float64( 0x3fa5555555555555ll
), /* 4 */
5226 const_float64( 0x3f81111111111111ll
), /* 5 */
5227 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5228 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5229 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5230 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5231 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5232 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5233 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5234 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5235 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5236 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5239 float32
float32_exp2(float32 a
, float_status
*status
)
5246 a
= float32_squash_input_denormal(a
, status
);
5248 aSig
= extractFloat32Frac( a
);
5249 aExp
= extractFloat32Exp( a
);
5250 aSign
= extractFloat32Sign( a
);
5252 if ( aExp
== 0xFF) {
5254 return propagateFloat32NaN(a
, float32_zero
, status
);
5256 return (aSign
) ? float32_zero
: a
;
5259 if (aSig
== 0) return float32_one
;
5262 float_raise(float_flag_inexact
, status
);
5264 /* ******************************* */
5265 /* using float64 for approximation */
5266 /* ******************************* */
5267 x
= float32_to_float64(a
, status
);
5268 x
= float64_mul(x
, float64_ln2
, status
);
5272 for (i
= 0 ; i
< 15 ; i
++) {
5275 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5276 r
= float64_add(r
, f
, status
);
5278 xn
= float64_mul(xn
, x
, status
);
5281 return float64_to_float32(r
, status
);
5284 /*----------------------------------------------------------------------------
5285 | Returns the binary log of the single-precision floating-point value `a'.
5286 | The operation is performed according to the IEC/IEEE Standard for Binary
5287 | Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
5289 float32
float32_log2(float32 a
, float_status
*status
)
5293 uint32_t aSig
, zSig
, i
;
5295 a
= float32_squash_input_denormal(a
, status
);
5296 aSig
= extractFloat32Frac( a
);
5297 aExp
= extractFloat32Exp( a
);
5298 aSign
= extractFloat32Sign( a
);
5301 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5302 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5305 float_raise(float_flag_invalid
, status
);
5306 return float32_default_nan(status
);
5308 if ( aExp
== 0xFF ) {
5310 return propagateFloat32NaN(a
, float32_zero
, status
);
5320 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5321 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5322 if ( aSig
& 0x01000000 ) {
5331 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5334 /*----------------------------------------------------------------------------
5335 | Returns the result of converting the double-precision floating-point value
5336 | `a' to the extended double-precision floating-point format. The conversion
5337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5339 *----------------------------------------------------------------------------*/
5341 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5347 a
= float64_squash_input_denormal(a
, status
);
5348 aSig
= extractFloat64Frac( a
);
5349 aExp
= extractFloat64Exp( a
);
5350 aSign
= extractFloat64Sign( a
);
5351 if ( aExp
== 0x7FF ) {
5353 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5355 return floatx80_silence_nan(res
, status
);
5357 return packFloatx80(aSign
,
5358 floatx80_infinity_high
,
5359 floatx80_infinity_low
);
5362 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5363 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5367 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5371 /*----------------------------------------------------------------------------
5372 | Returns the result of converting the double-precision floating-point value
5373 | `a' to the quadruple-precision floating-point format. The conversion is
5374 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5376 *----------------------------------------------------------------------------*/
5378 float128
float64_to_float128(float64 a
, float_status
*status
)
5382 uint64_t aSig
, zSig0
, zSig1
;
5384 a
= float64_squash_input_denormal(a
, status
);
5385 aSig
= extractFloat64Frac( a
);
5386 aExp
= extractFloat64Exp( a
);
5387 aSign
= extractFloat64Sign( a
);
5388 if ( aExp
== 0x7FF ) {
5390 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5392 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5395 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5396 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5399 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5400 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5405 /*----------------------------------------------------------------------------
5406 | Returns the remainder of the double-precision floating-point value `a'
5407 | with respect to the corresponding value `b'. The operation is performed
5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5411 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5414 int aExp
, bExp
, expDiff
;
5415 uint64_t aSig
, bSig
;
5416 uint64_t q
, alternateASig
;
5419 a
= float64_squash_input_denormal(a
, status
);
5420 b
= float64_squash_input_denormal(b
, status
);
5421 aSig
= extractFloat64Frac( a
);
5422 aExp
= extractFloat64Exp( a
);
5423 aSign
= extractFloat64Sign( a
);
5424 bSig
= extractFloat64Frac( b
);
5425 bExp
= extractFloat64Exp( b
);
5426 if ( aExp
== 0x7FF ) {
5427 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5428 return propagateFloat64NaN(a
, b
, status
);
5430 float_raise(float_flag_invalid
, status
);
5431 return float64_default_nan(status
);
5433 if ( bExp
== 0x7FF ) {
5435 return propagateFloat64NaN(a
, b
, status
);
5441 float_raise(float_flag_invalid
, status
);
5442 return float64_default_nan(status
);
5444 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5447 if ( aSig
== 0 ) return a
;
5448 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5450 expDiff
= aExp
- bExp
;
5451 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5452 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5453 if ( expDiff
< 0 ) {
5454 if ( expDiff
< -1 ) return a
;
5457 q
= ( bSig
<= aSig
);
5458 if ( q
) aSig
-= bSig
;
5460 while ( 0 < expDiff
) {
5461 q
= estimateDiv128To64( aSig
, 0, bSig
);
5462 q
= ( 2 < q
) ? q
- 2 : 0;
5463 aSig
= - ( ( bSig
>>2 ) * q
);
5467 if ( 0 < expDiff
) {
5468 q
= estimateDiv128To64( aSig
, 0, bSig
);
5469 q
= ( 2 < q
) ? q
- 2 : 0;
5472 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5479 alternateASig
= aSig
;
5482 } while ( 0 <= (int64_t) aSig
);
5483 sigMean
= aSig
+ alternateASig
;
5484 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5485 aSig
= alternateASig
;
5487 zSign
= ( (int64_t) aSig
< 0 );
5488 if ( zSign
) aSig
= - aSig
;
5489 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5493 /*----------------------------------------------------------------------------
5494 | Returns the binary log of the double-precision floating-point value `a'.
5495 | The operation is performed according to the IEC/IEEE Standard for Binary
5496 | Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498 float64
float64_log2(float64 a
, float_status
*status
)
5502 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5503 a
= float64_squash_input_denormal(a
, status
);
5505 aSig
= extractFloat64Frac( a
);
5506 aExp
= extractFloat64Exp( a
);
5507 aSign
= extractFloat64Sign( a
);
5510 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5511 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5514 float_raise(float_flag_invalid
, status
);
5515 return float64_default_nan(status
);
5517 if ( aExp
== 0x7FF ) {
5519 return propagateFloat64NaN(a
, float64_zero
, status
);
5525 aSig
|= UINT64_C(0x0010000000000000);
5527 zSig
= (uint64_t)aExp
<< 52;
5528 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5529 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5530 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5531 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5539 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5542 /*----------------------------------------------------------------------------
5543 | Returns the result of converting the extended double-precision floating-
5544 | point value `a' to the 32-bit two's complement integer format. The
5545 | conversion is performed according to the IEC/IEEE Standard for Binary
5546 | Floating-Point Arithmetic---which means in particular that the conversion
5547 | is rounded according to the current rounding mode. If `a' is a NaN, the
5548 | largest positive integer is returned. Otherwise, if the conversion
5549 | overflows, the largest integer with the same sign as `a' is returned.
5550 *----------------------------------------------------------------------------*/
5552 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5555 int32_t aExp
, shiftCount
;
5558 if (floatx80_invalid_encoding(a
)) {
5559 float_raise(float_flag_invalid
, status
);
5562 aSig
= extractFloatx80Frac( a
);
5563 aExp
= extractFloatx80Exp( a
);
5564 aSign
= extractFloatx80Sign( a
);
5565 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5566 shiftCount
= 0x4037 - aExp
;
5567 if ( shiftCount
<= 0 ) shiftCount
= 1;
5568 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5569 return roundAndPackInt32(aSign
, aSig
, status
);
5573 /*----------------------------------------------------------------------------
5574 | Returns the result of converting the extended double-precision floating-
5575 | point value `a' to the 32-bit two's complement integer format. The
5576 | conversion is performed according to the IEC/IEEE Standard for Binary
5577 | Floating-Point Arithmetic, except that the conversion is always rounded
5578 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5579 | Otherwise, if the conversion overflows, the largest integer with the same
5580 | sign as `a' is returned.
5581 *----------------------------------------------------------------------------*/
5583 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5586 int32_t aExp
, shiftCount
;
5587 uint64_t aSig
, savedASig
;
5590 if (floatx80_invalid_encoding(a
)) {
5591 float_raise(float_flag_invalid
, status
);
5594 aSig
= extractFloatx80Frac( a
);
5595 aExp
= extractFloatx80Exp( a
);
5596 aSign
= extractFloatx80Sign( a
);
5597 if ( 0x401E < aExp
) {
5598 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5601 else if ( aExp
< 0x3FFF ) {
5603 float_raise(float_flag_inexact
, status
);
5607 shiftCount
= 0x403E - aExp
;
5609 aSig
>>= shiftCount
;
5611 if ( aSign
) z
= - z
;
5612 if ( ( z
< 0 ) ^ aSign
) {
5614 float_raise(float_flag_invalid
, status
);
5615 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5617 if ( ( aSig
<<shiftCount
) != savedASig
) {
5618 float_raise(float_flag_inexact
, status
);
5624 /*----------------------------------------------------------------------------
5625 | Returns the result of converting the extended double-precision floating-
5626 | point value `a' to the 64-bit two's complement integer format. The
5627 | conversion is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic---which means in particular that the conversion
5629 | is rounded according to the current rounding mode. If `a' is a NaN,
5630 | the largest positive integer is returned. Otherwise, if the conversion
5631 | overflows, the largest integer with the same sign as `a' is returned.
5632 *----------------------------------------------------------------------------*/
5634 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5637 int32_t aExp
, shiftCount
;
5638 uint64_t aSig
, aSigExtra
;
5640 if (floatx80_invalid_encoding(a
)) {
5641 float_raise(float_flag_invalid
, status
);
5644 aSig
= extractFloatx80Frac( a
);
5645 aExp
= extractFloatx80Exp( a
);
5646 aSign
= extractFloatx80Sign( a
);
5647 shiftCount
= 0x403E - aExp
;
5648 if ( shiftCount
<= 0 ) {
5650 float_raise(float_flag_invalid
, status
);
5651 if (!aSign
|| floatx80_is_any_nan(a
)) {
5659 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5661 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5665 /*----------------------------------------------------------------------------
5666 | Returns the result of converting the extended double-precision floating-
5667 | point value `a' to the 64-bit two's complement integer format. The
5668 | conversion is performed according to the IEC/IEEE Standard for Binary
5669 | Floating-Point Arithmetic, except that the conversion is always rounded
5670 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5671 | Otherwise, if the conversion overflows, the largest integer with the same
5672 | sign as `a' is returned.
5673 *----------------------------------------------------------------------------*/
5675 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5678 int32_t aExp
, shiftCount
;
5682 if (floatx80_invalid_encoding(a
)) {
5683 float_raise(float_flag_invalid
, status
);
5686 aSig
= extractFloatx80Frac( a
);
5687 aExp
= extractFloatx80Exp( a
);
5688 aSign
= extractFloatx80Sign( a
);
5689 shiftCount
= aExp
- 0x403E;
5690 if ( 0 <= shiftCount
) {
5691 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5692 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5693 float_raise(float_flag_invalid
, status
);
5694 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5700 else if ( aExp
< 0x3FFF ) {
5702 float_raise(float_flag_inexact
, status
);
5706 z
= aSig
>>( - shiftCount
);
5707 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5708 float_raise(float_flag_inexact
, status
);
5710 if ( aSign
) z
= - z
;
5715 /*----------------------------------------------------------------------------
5716 | Returns the result of converting the extended double-precision floating-
5717 | point value `a' to the single-precision floating-point format. The
5718 | conversion is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5728 if (floatx80_invalid_encoding(a
)) {
5729 float_raise(float_flag_invalid
, status
);
5730 return float32_default_nan(status
);
5732 aSig
= extractFloatx80Frac( a
);
5733 aExp
= extractFloatx80Exp( a
);
5734 aSign
= extractFloatx80Sign( a
);
5735 if ( aExp
== 0x7FFF ) {
5736 if ( (uint64_t) ( aSig
<<1 ) ) {
5737 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5739 return float32_silence_nan(res
, status
);
5741 return packFloat32( aSign
, 0xFF, 0 );
5743 shift64RightJamming( aSig
, 33, &aSig
);
5744 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5745 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5749 /*----------------------------------------------------------------------------
5750 | Returns the result of converting the extended double-precision floating-
5751 | point value `a' to the double-precision floating-point format. The
5752 | conversion is performed according to the IEC/IEEE Standard for Binary
5753 | Floating-Point Arithmetic.
5754 *----------------------------------------------------------------------------*/
5756 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5760 uint64_t aSig
, zSig
;
5762 if (floatx80_invalid_encoding(a
)) {
5763 float_raise(float_flag_invalid
, status
);
5764 return float64_default_nan(status
);
5766 aSig
= extractFloatx80Frac( a
);
5767 aExp
= extractFloatx80Exp( a
);
5768 aSign
= extractFloatx80Sign( a
);
5769 if ( aExp
== 0x7FFF ) {
5770 if ( (uint64_t) ( aSig
<<1 ) ) {
5771 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5773 return float64_silence_nan(res
, status
);
5775 return packFloat64( aSign
, 0x7FF, 0 );
5777 shift64RightJamming( aSig
, 1, &zSig
);
5778 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5779 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5783 /*----------------------------------------------------------------------------
5784 | Returns the result of converting the extended double-precision floating-
5785 | point value `a' to the quadruple-precision floating-point format. The
5786 | conversion is performed according to the IEC/IEEE Standard for Binary
5787 | Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5790 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5794 uint64_t aSig
, zSig0
, zSig1
;
5796 if (floatx80_invalid_encoding(a
)) {
5797 float_raise(float_flag_invalid
, status
);
5798 return float128_default_nan(status
);
5800 aSig
= extractFloatx80Frac( a
);
5801 aExp
= extractFloatx80Exp( a
);
5802 aSign
= extractFloatx80Sign( a
);
5803 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5804 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5806 return float128_silence_nan(res
, status
);
5808 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5809 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5813 /*----------------------------------------------------------------------------
5814 | Rounds the extended double-precision floating-point value `a'
5815 | to the precision provided by floatx80_rounding_precision and returns the
5816 | result as an extended double-precision floating-point value.
5817 | The operation is performed according to the IEC/IEEE Standard for Binary
5818 | Floating-Point Arithmetic.
5819 *----------------------------------------------------------------------------*/
5821 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5823 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5824 extractFloatx80Sign(a
),
5825 extractFloatx80Exp(a
),
5826 extractFloatx80Frac(a
), 0, status
);
5829 /*----------------------------------------------------------------------------
5830 | Rounds the extended double-precision floating-point value `a' to an integer,
5831 | and returns the result as an extended quadruple-precision floating-point
5832 | value. The operation is performed according to the IEC/IEEE Standard for
5833 | Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5836 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5840 uint64_t lastBitMask
, roundBitsMask
;
5843 if (floatx80_invalid_encoding(a
)) {
5844 float_raise(float_flag_invalid
, status
);
5845 return floatx80_default_nan(status
);
5847 aExp
= extractFloatx80Exp( a
);
5848 if ( 0x403E <= aExp
) {
5849 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5850 return propagateFloatx80NaN(a
, a
, status
);
5854 if ( aExp
< 0x3FFF ) {
5856 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5859 float_raise(float_flag_inexact
, status
);
5860 aSign
= extractFloatx80Sign( a
);
5861 switch (status
->float_rounding_mode
) {
5862 case float_round_nearest_even
:
5863 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5866 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5869 case float_round_ties_away
:
5870 if (aExp
== 0x3FFE) {
5871 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5874 case float_round_down
:
5877 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5878 : packFloatx80( 0, 0, 0 );
5879 case float_round_up
:
5881 aSign
? packFloatx80( 1, 0, 0 )
5882 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5884 case float_round_to_zero
:
5887 g_assert_not_reached();
5889 return packFloatx80( aSign
, 0, 0 );
5892 lastBitMask
<<= 0x403E - aExp
;
5893 roundBitsMask
= lastBitMask
- 1;
5895 switch (status
->float_rounding_mode
) {
5896 case float_round_nearest_even
:
5897 z
.low
+= lastBitMask
>>1;
5898 if ((z
.low
& roundBitsMask
) == 0) {
5899 z
.low
&= ~lastBitMask
;
5902 case float_round_ties_away
:
5903 z
.low
+= lastBitMask
>> 1;
5905 case float_round_to_zero
:
5907 case float_round_up
:
5908 if (!extractFloatx80Sign(z
)) {
5909 z
.low
+= roundBitsMask
;
5912 case float_round_down
:
5913 if (extractFloatx80Sign(z
)) {
5914 z
.low
+= roundBitsMask
;
5920 z
.low
&= ~ roundBitsMask
;
5923 z
.low
= UINT64_C(0x8000000000000000);
5925 if (z
.low
!= a
.low
) {
5926 float_raise(float_flag_inexact
, status
);
5932 /*----------------------------------------------------------------------------
5933 | Returns the result of adding the absolute values of the extended double-
5934 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5935 | negated before being returned. `zSign' is ignored if the result is a NaN.
5936 | The addition is performed according to the IEC/IEEE Standard for Binary
5937 | Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5940 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5941 float_status
*status
)
5943 int32_t aExp
, bExp
, zExp
;
5944 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5947 aSig
= extractFloatx80Frac( a
);
5948 aExp
= extractFloatx80Exp( a
);
5949 bSig
= extractFloatx80Frac( b
);
5950 bExp
= extractFloatx80Exp( b
);
5951 expDiff
= aExp
- bExp
;
5952 if ( 0 < expDiff
) {
5953 if ( aExp
== 0x7FFF ) {
5954 if ((uint64_t)(aSig
<< 1)) {
5955 return propagateFloatx80NaN(a
, b
, status
);
5959 if ( bExp
== 0 ) --expDiff
;
5960 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5963 else if ( expDiff
< 0 ) {
5964 if ( bExp
== 0x7FFF ) {
5965 if ((uint64_t)(bSig
<< 1)) {
5966 return propagateFloatx80NaN(a
, b
, status
);
5968 return packFloatx80(zSign
,
5969 floatx80_infinity_high
,
5970 floatx80_infinity_low
);
5972 if ( aExp
== 0 ) ++expDiff
;
5973 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5977 if ( aExp
== 0x7FFF ) {
5978 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5979 return propagateFloatx80NaN(a
, b
, status
);
5984 zSig0
= aSig
+ bSig
;
5986 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5987 /* At least one of the values is a pseudo-denormal,
5988 * and there is a carry out of the result. */
5993 return packFloatx80(zSign
, 0, 0);
5995 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
6001 zSig0
= aSig
+ bSig
;
6002 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
6004 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6005 zSig0
|= UINT64_C(0x8000000000000000);
6008 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6009 zSign
, zExp
, zSig0
, zSig1
, status
);
6012 /*----------------------------------------------------------------------------
6013 | Returns the result of subtracting the absolute values of the extended
6014 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
6015 | difference is negated before being returned. `zSign' is ignored if the
6016 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6017 | Standard for Binary Floating-Point Arithmetic.
6018 *----------------------------------------------------------------------------*/
6020 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
6021 float_status
*status
)
6023 int32_t aExp
, bExp
, zExp
;
6024 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6027 aSig
= extractFloatx80Frac( a
);
6028 aExp
= extractFloatx80Exp( a
);
6029 bSig
= extractFloatx80Frac( b
);
6030 bExp
= extractFloatx80Exp( b
);
6031 expDiff
= aExp
- bExp
;
6032 if ( 0 < expDiff
) goto aExpBigger
;
6033 if ( expDiff
< 0 ) goto bExpBigger
;
6034 if ( aExp
== 0x7FFF ) {
6035 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6036 return propagateFloatx80NaN(a
, b
, status
);
6038 float_raise(float_flag_invalid
, status
);
6039 return floatx80_default_nan(status
);
6046 if ( bSig
< aSig
) goto aBigger
;
6047 if ( aSig
< bSig
) goto bBigger
;
6048 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6050 if ( bExp
== 0x7FFF ) {
6051 if ((uint64_t)(bSig
<< 1)) {
6052 return propagateFloatx80NaN(a
, b
, status
);
6054 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6055 floatx80_infinity_low
);
6057 if ( aExp
== 0 ) ++expDiff
;
6058 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6060 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6063 goto normalizeRoundAndPack
;
6065 if ( aExp
== 0x7FFF ) {
6066 if ((uint64_t)(aSig
<< 1)) {
6067 return propagateFloatx80NaN(a
, b
, status
);
6071 if ( bExp
== 0 ) --expDiff
;
6072 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6074 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6076 normalizeRoundAndPack
:
6077 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6078 zSign
, zExp
, zSig0
, zSig1
, status
);
6081 /*----------------------------------------------------------------------------
6082 | Returns the result of adding the extended double-precision floating-point
6083 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6084 | Standard for Binary Floating-Point Arithmetic.
6085 *----------------------------------------------------------------------------*/
6087 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6091 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6092 float_raise(float_flag_invalid
, status
);
6093 return floatx80_default_nan(status
);
6095 aSign
= extractFloatx80Sign( a
);
6096 bSign
= extractFloatx80Sign( b
);
6097 if ( aSign
== bSign
) {
6098 return addFloatx80Sigs(a
, b
, aSign
, status
);
6101 return subFloatx80Sigs(a
, b
, aSign
, status
);
6106 /*----------------------------------------------------------------------------
6107 | Returns the result of subtracting the extended double-precision floating-
6108 | point values `a' and `b'. The operation is performed according to the
6109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6110 *----------------------------------------------------------------------------*/
6112 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6116 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6117 float_raise(float_flag_invalid
, status
);
6118 return floatx80_default_nan(status
);
6120 aSign
= extractFloatx80Sign( a
);
6121 bSign
= extractFloatx80Sign( b
);
6122 if ( aSign
== bSign
) {
6123 return subFloatx80Sigs(a
, b
, aSign
, status
);
6126 return addFloatx80Sigs(a
, b
, aSign
, status
);
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of multiplying the extended double-precision floating-
6133 | point values `a' and `b'. The operation is performed according to the
6134 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6135 *----------------------------------------------------------------------------*/
6137 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6139 bool aSign
, bSign
, zSign
;
6140 int32_t aExp
, bExp
, zExp
;
6141 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6143 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6144 float_raise(float_flag_invalid
, status
);
6145 return floatx80_default_nan(status
);
6147 aSig
= extractFloatx80Frac( a
);
6148 aExp
= extractFloatx80Exp( a
);
6149 aSign
= extractFloatx80Sign( a
);
6150 bSig
= extractFloatx80Frac( b
);
6151 bExp
= extractFloatx80Exp( b
);
6152 bSign
= extractFloatx80Sign( b
);
6153 zSign
= aSign
^ bSign
;
6154 if ( aExp
== 0x7FFF ) {
6155 if ( (uint64_t) ( aSig
<<1 )
6156 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6157 return propagateFloatx80NaN(a
, b
, status
);
6159 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6160 return packFloatx80(zSign
, floatx80_infinity_high
,
6161 floatx80_infinity_low
);
6163 if ( bExp
== 0x7FFF ) {
6164 if ((uint64_t)(bSig
<< 1)) {
6165 return propagateFloatx80NaN(a
, b
, status
);
6167 if ( ( aExp
| aSig
) == 0 ) {
6169 float_raise(float_flag_invalid
, status
);
6170 return floatx80_default_nan(status
);
6172 return packFloatx80(zSign
, floatx80_infinity_high
,
6173 floatx80_infinity_low
);
6176 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6177 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6180 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6181 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6183 zExp
= aExp
+ bExp
- 0x3FFE;
6184 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6185 if ( 0 < (int64_t) zSig0
) {
6186 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6189 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6190 zSign
, zExp
, zSig0
, zSig1
, status
);
6193 /*----------------------------------------------------------------------------
6194 | Returns the result of dividing the extended double-precision floating-point
6195 | value `a' by the corresponding value `b'. The operation is performed
6196 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6197 *----------------------------------------------------------------------------*/
6199 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6201 bool aSign
, bSign
, zSign
;
6202 int32_t aExp
, bExp
, zExp
;
6203 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6204 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6206 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6207 float_raise(float_flag_invalid
, status
);
6208 return floatx80_default_nan(status
);
6210 aSig
= extractFloatx80Frac( a
);
6211 aExp
= extractFloatx80Exp( a
);
6212 aSign
= extractFloatx80Sign( a
);
6213 bSig
= extractFloatx80Frac( b
);
6214 bExp
= extractFloatx80Exp( b
);
6215 bSign
= extractFloatx80Sign( b
);
6216 zSign
= aSign
^ bSign
;
6217 if ( aExp
== 0x7FFF ) {
6218 if ((uint64_t)(aSig
<< 1)) {
6219 return propagateFloatx80NaN(a
, b
, status
);
6221 if ( bExp
== 0x7FFF ) {
6222 if ((uint64_t)(bSig
<< 1)) {
6223 return propagateFloatx80NaN(a
, b
, status
);
6227 return packFloatx80(zSign
, floatx80_infinity_high
,
6228 floatx80_infinity_low
);
6230 if ( bExp
== 0x7FFF ) {
6231 if ((uint64_t)(bSig
<< 1)) {
6232 return propagateFloatx80NaN(a
, b
, status
);
6234 return packFloatx80( zSign
, 0, 0 );
6238 if ( ( aExp
| aSig
) == 0 ) {
6240 float_raise(float_flag_invalid
, status
);
6241 return floatx80_default_nan(status
);
6243 float_raise(float_flag_divbyzero
, status
);
6244 return packFloatx80(zSign
, floatx80_infinity_high
,
6245 floatx80_infinity_low
);
6247 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6250 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6251 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6253 zExp
= aExp
- bExp
+ 0x3FFE;
6255 if ( bSig
<= aSig
) {
6256 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6259 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6260 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6261 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6262 while ( (int64_t) rem0
< 0 ) {
6264 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6266 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6267 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6268 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6269 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6270 while ( (int64_t) rem1
< 0 ) {
6272 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6274 zSig1
|= ( ( rem1
| rem2
) != 0 );
6276 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6277 zSign
, zExp
, zSig0
, zSig1
, status
);
6280 /*----------------------------------------------------------------------------
6281 | Returns the remainder of the extended double-precision floating-point value
6282 | `a' with respect to the corresponding value `b'. The operation is performed
6283 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6284 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6285 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6286 | the absolute value of the integer quotient.
6287 *----------------------------------------------------------------------------*/
6289 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6290 float_status
*status
)
6293 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6294 uint64_t aSig0
, aSig1
, bSig
;
6295 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6298 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6299 float_raise(float_flag_invalid
, status
);
6300 return floatx80_default_nan(status
);
6302 aSig0
= extractFloatx80Frac( a
);
6303 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6304 aSign
= extractFloatx80Sign( a
);
6305 bSig
= extractFloatx80Frac( b
);
6306 bExp
= extractFloatx80Exp( b
);
6307 if ( aExp
== 0x7FFF ) {
6308 if ( (uint64_t) ( aSig0
<<1 )
6309 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6310 return propagateFloatx80NaN(a
, b
, status
);
6314 if ( bExp
== 0x7FFF ) {
6315 if ((uint64_t)(bSig
<< 1)) {
6316 return propagateFloatx80NaN(a
, b
, status
);
6318 if (aExp
== 0 && aSig0
>> 63) {
6320 * Pseudo-denormal argument must be returned in normalized
6323 return packFloatx80(aSign
, 1, aSig0
);
6330 float_raise(float_flag_invalid
, status
);
6331 return floatx80_default_nan(status
);
6333 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6336 if ( aSig0
== 0 ) return a
;
6337 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6340 expDiff
= aExp
- bExp
;
6342 if ( expDiff
< 0 ) {
6343 if ( mod
|| expDiff
< -1 ) {
6344 if (aExp
== 1 && aExpOrig
== 0) {
6346 * Pseudo-denormal argument must be returned in
6349 return packFloatx80(aSign
, aExp
, aSig0
);
6353 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6356 *quotient
= q
= ( bSig
<= aSig0
);
6357 if ( q
) aSig0
-= bSig
;
6359 while ( 0 < expDiff
) {
6360 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6361 q
= ( 2 < q
) ? q
- 2 : 0;
6362 mul64To128( bSig
, q
, &term0
, &term1
);
6363 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6364 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6370 if ( 0 < expDiff
) {
6371 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6372 q
= ( 2 < q
) ? q
- 2 : 0;
6374 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6375 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6376 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6377 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6379 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6382 *quotient
<<= expDiff
;
6393 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6394 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6395 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6398 aSig0
= alternateASig0
;
6399 aSig1
= alternateASig1
;
6405 normalizeRoundAndPackFloatx80(
6406 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6410 /*----------------------------------------------------------------------------
6411 | Returns the remainder of the extended double-precision floating-point value
6412 | `a' with respect to the corresponding value `b'. The operation is performed
6413 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6414 *----------------------------------------------------------------------------*/
6416 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6419 return floatx80_modrem(a
, b
, false, "ient
, status
);
6422 /*----------------------------------------------------------------------------
6423 | Returns the remainder of the extended double-precision floating-point value
6424 | `a' with respect to the corresponding value `b', with the quotient truncated
6426 *----------------------------------------------------------------------------*/
6428 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6431 return floatx80_modrem(a
, b
, true, "ient
, status
);
6434 /*----------------------------------------------------------------------------
6435 | Returns the square root of the extended double-precision floating-point
6436 | value `a'. The operation is performed according to the IEC/IEEE Standard
6437 | for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6440 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6444 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6445 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6447 if (floatx80_invalid_encoding(a
)) {
6448 float_raise(float_flag_invalid
, status
);
6449 return floatx80_default_nan(status
);
6451 aSig0
= extractFloatx80Frac( a
);
6452 aExp
= extractFloatx80Exp( a
);
6453 aSign
= extractFloatx80Sign( a
);
6454 if ( aExp
== 0x7FFF ) {
6455 if ((uint64_t)(aSig0
<< 1)) {
6456 return propagateFloatx80NaN(a
, a
, status
);
6458 if ( ! aSign
) return a
;
6462 if ( ( aExp
| aSig0
) == 0 ) return a
;
6464 float_raise(float_flag_invalid
, status
);
6465 return floatx80_default_nan(status
);
6468 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6469 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6471 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6472 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6473 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6474 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6475 doubleZSig0
= zSig0
<<1;
6476 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6477 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6478 while ( (int64_t) rem0
< 0 ) {
6481 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6483 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6484 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6485 if ( zSig1
== 0 ) zSig1
= 1;
6486 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6487 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6488 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6489 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6490 while ( (int64_t) rem1
< 0 ) {
6492 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6494 term2
|= doubleZSig0
;
6495 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6497 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6499 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6500 zSig0
|= doubleZSig0
;
6501 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6502 0, zExp
, zSig0
, zSig1
, status
);
6505 /*----------------------------------------------------------------------------
6506 | Returns the result of converting the quadruple-precision floating-point
6507 | value `a' to the 32-bit two's complement integer format. The conversion
6508 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6509 | Arithmetic---which means in particular that the conversion is rounded
6510 | according to the current rounding mode. If `a' is a NaN, the largest
6511 | positive integer is returned. Otherwise, if the conversion overflows, the
6512 | largest integer with the same sign as `a' is returned.
6513 *----------------------------------------------------------------------------*/
6515 int32_t float128_to_int32(float128 a
, float_status
*status
)
6518 int32_t aExp
, shiftCount
;
6519 uint64_t aSig0
, aSig1
;
6521 aSig1
= extractFloat128Frac1( a
);
6522 aSig0
= extractFloat128Frac0( a
);
6523 aExp
= extractFloat128Exp( a
);
6524 aSign
= extractFloat128Sign( a
);
6525 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6526 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6527 aSig0
|= ( aSig1
!= 0 );
6528 shiftCount
= 0x4028 - aExp
;
6529 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6530 return roundAndPackInt32(aSign
, aSig0
, status
);
6534 /*----------------------------------------------------------------------------
6535 | Returns the result of converting the quadruple-precision floating-point
6536 | value `a' to the 32-bit two's complement integer format. The conversion
6537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6538 | Arithmetic, except that the conversion is always rounded toward zero. If
6539 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6540 | conversion overflows, the largest integer with the same sign as `a' is
6542 *----------------------------------------------------------------------------*/
6544 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6547 int32_t aExp
, shiftCount
;
6548 uint64_t aSig0
, aSig1
, savedASig
;
6551 aSig1
= extractFloat128Frac1( a
);
6552 aSig0
= extractFloat128Frac0( a
);
6553 aExp
= extractFloat128Exp( a
);
6554 aSign
= extractFloat128Sign( a
);
6555 aSig0
|= ( aSig1
!= 0 );
6556 if ( 0x401E < aExp
) {
6557 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6560 else if ( aExp
< 0x3FFF ) {
6561 if (aExp
|| aSig0
) {
6562 float_raise(float_flag_inexact
, status
);
6566 aSig0
|= UINT64_C(0x0001000000000000);
6567 shiftCount
= 0x402F - aExp
;
6569 aSig0
>>= shiftCount
;
6571 if ( aSign
) z
= - z
;
6572 if ( ( z
< 0 ) ^ aSign
) {
6574 float_raise(float_flag_invalid
, status
);
6575 return aSign
? INT32_MIN
: INT32_MAX
;
6577 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6578 float_raise(float_flag_inexact
, status
);
6584 /*----------------------------------------------------------------------------
6585 | Returns the result of converting the quadruple-precision floating-point
6586 | value `a' to the 64-bit two's complement integer format. The conversion
6587 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6588 | Arithmetic---which means in particular that the conversion is rounded
6589 | according to the current rounding mode. If `a' is a NaN, the largest
6590 | positive integer is returned. Otherwise, if the conversion overflows, the
6591 | largest integer with the same sign as `a' is returned.
6592 *----------------------------------------------------------------------------*/
6594 int64_t float128_to_int64(float128 a
, float_status
*status
)
6597 int32_t aExp
, shiftCount
;
6598 uint64_t aSig0
, aSig1
;
6600 aSig1
= extractFloat128Frac1( a
);
6601 aSig0
= extractFloat128Frac0( a
);
6602 aExp
= extractFloat128Exp( a
);
6603 aSign
= extractFloat128Sign( a
);
6604 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6605 shiftCount
= 0x402F - aExp
;
6606 if ( shiftCount
<= 0 ) {
6607 if ( 0x403E < aExp
) {
6608 float_raise(float_flag_invalid
, status
);
6610 || ( ( aExp
== 0x7FFF )
6611 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6618 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6621 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6623 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6627 /*----------------------------------------------------------------------------
6628 | Returns the result of converting the quadruple-precision floating-point
6629 | value `a' to the 64-bit two's complement integer format. The conversion
6630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6631 | Arithmetic, except that the conversion is always rounded toward zero.
6632 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6633 | the conversion overflows, the largest integer with the same sign as `a' is
6635 *----------------------------------------------------------------------------*/
6637 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6640 int32_t aExp
, shiftCount
;
6641 uint64_t aSig0
, aSig1
;
6644 aSig1
= extractFloat128Frac1( a
);
6645 aSig0
= extractFloat128Frac0( a
);
6646 aExp
= extractFloat128Exp( a
);
6647 aSign
= extractFloat128Sign( a
);
6648 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6649 shiftCount
= aExp
- 0x402F;
6650 if ( 0 < shiftCount
) {
6651 if ( 0x403E <= aExp
) {
6652 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6653 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6654 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6656 float_raise(float_flag_inexact
, status
);
6660 float_raise(float_flag_invalid
, status
);
6661 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6667 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6668 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6669 float_raise(float_flag_inexact
, status
);
6673 if ( aExp
< 0x3FFF ) {
6674 if ( aExp
| aSig0
| aSig1
) {
6675 float_raise(float_flag_inexact
, status
);
6679 z
= aSig0
>>( - shiftCount
);
6681 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6682 float_raise(float_flag_inexact
, status
);
6685 if ( aSign
) z
= - z
;
6690 /*----------------------------------------------------------------------------
6691 | Returns the result of converting the quadruple-precision floating-point value
6692 | `a' to the 64-bit unsigned integer format. The conversion is
6693 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6694 | Arithmetic---which means in particular that the conversion is rounded
6695 | according to the current rounding mode. If `a' is a NaN, the largest
6696 | positive integer is returned. If the conversion overflows, the
6697 | largest unsigned integer is returned. If 'a' is negative, the value is
6698 | rounded and zero is returned; negative values that do not round to zero
6699 | will raise the inexact exception.
6700 *----------------------------------------------------------------------------*/
6702 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6707 uint64_t aSig0
, aSig1
;
6709 aSig0
= extractFloat128Frac0(a
);
6710 aSig1
= extractFloat128Frac1(a
);
6711 aExp
= extractFloat128Exp(a
);
6712 aSign
= extractFloat128Sign(a
);
6713 if (aSign
&& (aExp
> 0x3FFE)) {
6714 float_raise(float_flag_invalid
, status
);
6715 if (float128_is_any_nan(a
)) {
6722 aSig0
|= UINT64_C(0x0001000000000000);
6724 shiftCount
= 0x402F - aExp
;
6725 if (shiftCount
<= 0) {
6726 if (0x403E < aExp
) {
6727 float_raise(float_flag_invalid
, status
);
6730 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6732 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6734 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6737 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6740 signed char current_rounding_mode
= status
->float_rounding_mode
;
6742 set_float_rounding_mode(float_round_to_zero
, status
);
6743 v
= float128_to_uint64(a
, status
);
6744 set_float_rounding_mode(current_rounding_mode
, status
);
6749 /*----------------------------------------------------------------------------
6750 | Returns the result of converting the quadruple-precision floating-point
6751 | value `a' to the 32-bit unsigned integer format. The conversion
6752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6753 | Arithmetic except that the conversion is always rounded toward zero.
6754 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6755 | if the conversion overflows, the largest unsigned integer is returned.
6756 | If 'a' is negative, the value is rounded and zero is returned; negative
6757 | values that do not round to zero will raise the inexact exception.
6758 *----------------------------------------------------------------------------*/
6760 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6764 int old_exc_flags
= get_float_exception_flags(status
);
6766 v
= float128_to_uint64_round_to_zero(a
, status
);
6767 if (v
> 0xffffffff) {
6772 set_float_exception_flags(old_exc_flags
, status
);
6773 float_raise(float_flag_invalid
, status
);
6777 /*----------------------------------------------------------------------------
6778 | Returns the result of converting the quadruple-precision floating-point value
6779 | `a' to the 32-bit unsigned integer format. The conversion is
6780 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6781 | Arithmetic---which means in particular that the conversion is rounded
6782 | according to the current rounding mode. If `a' is a NaN, the largest
6783 | positive integer is returned. If the conversion overflows, the
6784 | largest unsigned integer is returned. If 'a' is negative, the value is
6785 | rounded and zero is returned; negative values that do not round to zero
6786 | will raise the inexact exception.
6787 *----------------------------------------------------------------------------*/
6789 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6793 int old_exc_flags
= get_float_exception_flags(status
);
6795 v
= float128_to_uint64(a
, status
);
6796 if (v
> 0xffffffff) {
6801 set_float_exception_flags(old_exc_flags
, status
);
6802 float_raise(float_flag_invalid
, status
);
6806 /*----------------------------------------------------------------------------
6807 | Returns the result of converting the quadruple-precision floating-point
6808 | value `a' to the single-precision floating-point format. The conversion
6809 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6811 *----------------------------------------------------------------------------*/
6813 float32
float128_to_float32(float128 a
, float_status
*status
)
6817 uint64_t aSig0
, aSig1
;
6820 aSig1
= extractFloat128Frac1( a
);
6821 aSig0
= extractFloat128Frac0( a
);
6822 aExp
= extractFloat128Exp( a
);
6823 aSign
= extractFloat128Sign( a
);
6824 if ( aExp
== 0x7FFF ) {
6825 if ( aSig0
| aSig1
) {
6826 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6828 return packFloat32( aSign
, 0xFF, 0 );
6830 aSig0
|= ( aSig1
!= 0 );
6831 shift64RightJamming( aSig0
, 18, &aSig0
);
6833 if ( aExp
|| zSig
) {
6837 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
6848 float64
float128_to_float64(float128 a
, float_status
*status
)
6852 uint64_t aSig0
, aSig1
;
6854 aSig1
= extractFloat128Frac1( a
);
6855 aSig0
= extractFloat128Frac0( a
);
6856 aExp
= extractFloat128Exp( a
);
6857 aSign
= extractFloat128Sign( a
);
6858 if ( aExp
== 0x7FFF ) {
6859 if ( aSig0
| aSig1
) {
6860 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6862 return packFloat64( aSign
, 0x7FF, 0 );
6864 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6865 aSig0
|= ( aSig1
!= 0 );
6866 if ( aExp
|| aSig0
) {
6867 aSig0
|= UINT64_C(0x4000000000000000);
6870 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6874 /*----------------------------------------------------------------------------
6875 | Returns the result of converting the quadruple-precision floating-point
6876 | value `a' to the extended double-precision floating-point format. The
6877 | conversion is performed according to the IEC/IEEE Standard for Binary
6878 | Floating-Point Arithmetic.
6879 *----------------------------------------------------------------------------*/
6881 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6885 uint64_t aSig0
, aSig1
;
6887 aSig1
= extractFloat128Frac1( a
);
6888 aSig0
= extractFloat128Frac0( a
);
6889 aExp
= extractFloat128Exp( a
);
6890 aSign
= extractFloat128Sign( a
);
6891 if ( aExp
== 0x7FFF ) {
6892 if ( aSig0
| aSig1
) {
6893 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6895 return floatx80_silence_nan(res
, status
);
6897 return packFloatx80(aSign
, floatx80_infinity_high
,
6898 floatx80_infinity_low
);
6901 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6902 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6905 aSig0
|= UINT64_C(0x0001000000000000);
6907 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6908 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6912 /*----------------------------------------------------------------------------
6913 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6914 | returns the result as a quadruple-precision floating-point value. The
6915 | operation is performed according to the IEC/IEEE Standard for Binary
6916 | Floating-Point Arithmetic.
6917 *----------------------------------------------------------------------------*/
6919 float128
float128_round_to_int(float128 a
, float_status
*status
)
6923 uint64_t lastBitMask
, roundBitsMask
;
6926 aExp
= extractFloat128Exp( a
);
6927 if ( 0x402F <= aExp
) {
6928 if ( 0x406F <= aExp
) {
6929 if ( ( aExp
== 0x7FFF )
6930 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6932 return propagateFloat128NaN(a
, a
, status
);
6937 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6938 roundBitsMask
= lastBitMask
- 1;
6940 switch (status
->float_rounding_mode
) {
6941 case float_round_nearest_even
:
6942 if ( lastBitMask
) {
6943 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6944 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6947 if ( (int64_t) z
.low
< 0 ) {
6949 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6953 case float_round_ties_away
:
6955 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6957 if ((int64_t) z
.low
< 0) {
6962 case float_round_to_zero
:
6964 case float_round_up
:
6965 if (!extractFloat128Sign(z
)) {
6966 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6969 case float_round_down
:
6970 if (extractFloat128Sign(z
)) {
6971 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6974 case float_round_to_odd
:
6976 * Note that if lastBitMask == 0, the last bit is the lsb
6977 * of high, and roundBitsMask == -1.
6979 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6980 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6986 z
.low
&= ~ roundBitsMask
;
6989 if ( aExp
< 0x3FFF ) {
6990 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6991 float_raise(float_flag_inexact
, status
);
6992 aSign
= extractFloat128Sign( a
);
6993 switch (status
->float_rounding_mode
) {
6994 case float_round_nearest_even
:
6995 if ( ( aExp
== 0x3FFE )
6996 && ( extractFloat128Frac0( a
)
6997 | extractFloat128Frac1( a
) )
6999 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7002 case float_round_ties_away
:
7003 if (aExp
== 0x3FFE) {
7004 return packFloat128(aSign
, 0x3FFF, 0, 0);
7007 case float_round_down
:
7009 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7010 : packFloat128( 0, 0, 0, 0 );
7011 case float_round_up
:
7013 aSign
? packFloat128( 1, 0, 0, 0 )
7014 : packFloat128( 0, 0x3FFF, 0, 0 );
7016 case float_round_to_odd
:
7017 return packFloat128(aSign
, 0x3FFF, 0, 0);
7019 case float_round_to_zero
:
7022 return packFloat128( aSign
, 0, 0, 0 );
7025 lastBitMask
<<= 0x402F - aExp
;
7026 roundBitsMask
= lastBitMask
- 1;
7029 switch (status
->float_rounding_mode
) {
7030 case float_round_nearest_even
:
7031 z
.high
+= lastBitMask
>>1;
7032 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7033 z
.high
&= ~ lastBitMask
;
7036 case float_round_ties_away
:
7037 z
.high
+= lastBitMask
>>1;
7039 case float_round_to_zero
:
7041 case float_round_up
:
7042 if (!extractFloat128Sign(z
)) {
7043 z
.high
|= ( a
.low
!= 0 );
7044 z
.high
+= roundBitsMask
;
7047 case float_round_down
:
7048 if (extractFloat128Sign(z
)) {
7049 z
.high
|= (a
.low
!= 0);
7050 z
.high
+= roundBitsMask
;
7053 case float_round_to_odd
:
7054 if ((z
.high
& lastBitMask
) == 0) {
7055 z
.high
|= (a
.low
!= 0);
7056 z
.high
+= roundBitsMask
;
7062 z
.high
&= ~ roundBitsMask
;
7064 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7065 float_raise(float_flag_inexact
, status
);
7071 /*----------------------------------------------------------------------------
7072 | Returns the result of multiplying the quadruple-precision floating-point
7073 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7074 | Standard for Binary Floating-Point Arithmetic.
7075 *----------------------------------------------------------------------------*/
7077 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7079 bool aSign
, bSign
, zSign
;
7080 int32_t aExp
, bExp
, zExp
;
7081 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7083 aSig1
= extractFloat128Frac1( a
);
7084 aSig0
= extractFloat128Frac0( a
);
7085 aExp
= extractFloat128Exp( a
);
7086 aSign
= extractFloat128Sign( a
);
7087 bSig1
= extractFloat128Frac1( b
);
7088 bSig0
= extractFloat128Frac0( b
);
7089 bExp
= extractFloat128Exp( b
);
7090 bSign
= extractFloat128Sign( b
);
7091 zSign
= aSign
^ bSign
;
7092 if ( aExp
== 0x7FFF ) {
7093 if ( ( aSig0
| aSig1
)
7094 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7095 return propagateFloat128NaN(a
, b
, status
);
7097 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7098 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7100 if ( bExp
== 0x7FFF ) {
7101 if (bSig0
| bSig1
) {
7102 return propagateFloat128NaN(a
, b
, status
);
7104 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7106 float_raise(float_flag_invalid
, status
);
7107 return float128_default_nan(status
);
7109 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7112 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7113 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7116 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7117 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7119 zExp
= aExp
+ bExp
- 0x4000;
7120 aSig0
|= UINT64_C(0x0001000000000000);
7121 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7122 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7123 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7124 zSig2
|= ( zSig3
!= 0 );
7125 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7126 shift128ExtraRightJamming(
7127 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7130 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of dividing the quadruple-precision floating-point value
7136 | `a' by the corresponding value `b'. The operation is performed according to
7137 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7140 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7142 bool aSign
, bSign
, zSign
;
7143 int32_t aExp
, bExp
, zExp
;
7144 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7145 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7147 aSig1
= extractFloat128Frac1( a
);
7148 aSig0
= extractFloat128Frac0( a
);
7149 aExp
= extractFloat128Exp( a
);
7150 aSign
= extractFloat128Sign( a
);
7151 bSig1
= extractFloat128Frac1( b
);
7152 bSig0
= extractFloat128Frac0( b
);
7153 bExp
= extractFloat128Exp( b
);
7154 bSign
= extractFloat128Sign( b
);
7155 zSign
= aSign
^ bSign
;
7156 if ( aExp
== 0x7FFF ) {
7157 if (aSig0
| aSig1
) {
7158 return propagateFloat128NaN(a
, b
, status
);
7160 if ( bExp
== 0x7FFF ) {
7161 if (bSig0
| bSig1
) {
7162 return propagateFloat128NaN(a
, b
, status
);
7166 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7168 if ( bExp
== 0x7FFF ) {
7169 if (bSig0
| bSig1
) {
7170 return propagateFloat128NaN(a
, b
, status
);
7172 return packFloat128( zSign
, 0, 0, 0 );
7175 if ( ( bSig0
| bSig1
) == 0 ) {
7176 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7178 float_raise(float_flag_invalid
, status
);
7179 return float128_default_nan(status
);
7181 float_raise(float_flag_divbyzero
, status
);
7182 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7184 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7187 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7188 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7190 zExp
= aExp
- bExp
+ 0x3FFD;
7192 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7194 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7195 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7196 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7199 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7200 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7201 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7202 while ( (int64_t) rem0
< 0 ) {
7204 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7206 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7207 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7208 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7209 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7210 while ( (int64_t) rem1
< 0 ) {
7212 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7214 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7216 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7217 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7221 /*----------------------------------------------------------------------------
7222 | Returns the remainder of the quadruple-precision floating-point value `a'
7223 | with respect to the corresponding value `b'. The operation is performed
7224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7225 *----------------------------------------------------------------------------*/
7227 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7230 int32_t aExp
, bExp
, expDiff
;
7231 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7232 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7235 aSig1
= extractFloat128Frac1( a
);
7236 aSig0
= extractFloat128Frac0( a
);
7237 aExp
= extractFloat128Exp( a
);
7238 aSign
= extractFloat128Sign( a
);
7239 bSig1
= extractFloat128Frac1( b
);
7240 bSig0
= extractFloat128Frac0( b
);
7241 bExp
= extractFloat128Exp( b
);
7242 if ( aExp
== 0x7FFF ) {
7243 if ( ( aSig0
| aSig1
)
7244 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7245 return propagateFloat128NaN(a
, b
, status
);
7249 if ( bExp
== 0x7FFF ) {
7250 if (bSig0
| bSig1
) {
7251 return propagateFloat128NaN(a
, b
, status
);
7256 if ( ( bSig0
| bSig1
) == 0 ) {
7258 float_raise(float_flag_invalid
, status
);
7259 return float128_default_nan(status
);
7261 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7264 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7265 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7267 expDiff
= aExp
- bExp
;
7268 if ( expDiff
< -1 ) return a
;
7270 aSig0
| UINT64_C(0x0001000000000000),
7272 15 - ( expDiff
< 0 ),
7277 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7278 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7279 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7281 while ( 0 < expDiff
) {
7282 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7283 q
= ( 4 < q
) ? q
- 4 : 0;
7284 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7285 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7286 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7287 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7290 if ( -64 < expDiff
) {
7291 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7292 q
= ( 4 < q
) ? q
- 4 : 0;
7294 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7296 if ( expDiff
< 0 ) {
7297 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7300 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7302 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7303 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7306 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7307 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7310 alternateASig0
= aSig0
;
7311 alternateASig1
= aSig1
;
7313 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7314 } while ( 0 <= (int64_t) aSig0
);
7316 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7317 if ( ( sigMean0
< 0 )
7318 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7319 aSig0
= alternateASig0
;
7320 aSig1
= alternateASig1
;
7322 zSign
= ( (int64_t) aSig0
< 0 );
7323 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7324 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7328 /*----------------------------------------------------------------------------
7329 | Returns the square root of the quadruple-precision floating-point value `a'.
7330 | The operation is performed according to the IEC/IEEE Standard for Binary
7331 | Floating-Point Arithmetic.
7332 *----------------------------------------------------------------------------*/
7334 float128
float128_sqrt(float128 a
, float_status
*status
)
7338 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7339 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7341 aSig1
= extractFloat128Frac1( a
);
7342 aSig0
= extractFloat128Frac0( a
);
7343 aExp
= extractFloat128Exp( a
);
7344 aSign
= extractFloat128Sign( a
);
7345 if ( aExp
== 0x7FFF ) {
7346 if (aSig0
| aSig1
) {
7347 return propagateFloat128NaN(a
, a
, status
);
7349 if ( ! aSign
) return a
;
7353 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7355 float_raise(float_flag_invalid
, status
);
7356 return float128_default_nan(status
);
7359 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7360 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7362 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7363 aSig0
|= UINT64_C(0x0001000000000000);
7364 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7365 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7366 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7367 doubleZSig0
= zSig0
<<1;
7368 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7369 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7370 while ( (int64_t) rem0
< 0 ) {
7373 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7375 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7376 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7377 if ( zSig1
== 0 ) zSig1
= 1;
7378 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7379 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7380 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7381 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7382 while ( (int64_t) rem1
< 0 ) {
7384 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7386 term2
|= doubleZSig0
;
7387 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7389 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7391 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7392 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7396 static inline FloatRelation
7397 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7398 float_status
*status
)
7402 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7403 float_raise(float_flag_invalid
, status
);
7404 return float_relation_unordered
;
7406 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7407 ( extractFloatx80Frac( a
)<<1 ) ) ||
7408 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7409 ( extractFloatx80Frac( b
)<<1 ) )) {
7411 floatx80_is_signaling_nan(a
, status
) ||
7412 floatx80_is_signaling_nan(b
, status
)) {
7413 float_raise(float_flag_invalid
, status
);
7415 return float_relation_unordered
;
7417 aSign
= extractFloatx80Sign( a
);
7418 bSign
= extractFloatx80Sign( b
);
7419 if ( aSign
!= bSign
) {
7421 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7422 ( ( a
.low
| b
.low
) == 0 ) ) {
7424 return float_relation_equal
;
7426 return 1 - (2 * aSign
);
7429 /* Normalize pseudo-denormals before comparison. */
7430 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7433 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7436 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7437 return float_relation_equal
;
7439 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7444 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7446 return floatx80_compare_internal(a
, b
, 0, status
);
7449 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7450 float_status
*status
)
7452 return floatx80_compare_internal(a
, b
, 1, status
);
7455 static inline FloatRelation
7456 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7457 float_status
*status
)
7461 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7462 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7463 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7464 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7466 float128_is_signaling_nan(a
, status
) ||
7467 float128_is_signaling_nan(b
, status
)) {
7468 float_raise(float_flag_invalid
, status
);
7470 return float_relation_unordered
;
7472 aSign
= extractFloat128Sign( a
);
7473 bSign
= extractFloat128Sign( b
);
7474 if ( aSign
!= bSign
) {
7475 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7477 return float_relation_equal
;
7479 return 1 - (2 * aSign
);
7482 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7483 return float_relation_equal
;
7485 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7490 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7492 return float128_compare_internal(a
, b
, 0, status
);
7495 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7496 float_status
*status
)
7498 return float128_compare_internal(a
, b
, 1, status
);
7501 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7507 if (floatx80_invalid_encoding(a
)) {
7508 float_raise(float_flag_invalid
, status
);
7509 return floatx80_default_nan(status
);
7511 aSig
= extractFloatx80Frac( a
);
7512 aExp
= extractFloatx80Exp( a
);
7513 aSign
= extractFloatx80Sign( a
);
7515 if ( aExp
== 0x7FFF ) {
7517 return propagateFloatx80NaN(a
, a
, status
);
7531 } else if (n
< -0x10000) {
7536 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7537 aSign
, aExp
, aSig
, 0, status
);
7540 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7544 uint64_t aSig0
, aSig1
;
7546 aSig1
= extractFloat128Frac1( a
);
7547 aSig0
= extractFloat128Frac0( a
);
7548 aExp
= extractFloat128Exp( a
);
7549 aSign
= extractFloat128Sign( a
);
7550 if ( aExp
== 0x7FFF ) {
7551 if ( aSig0
| aSig1
) {
7552 return propagateFloat128NaN(a
, a
, status
);
7557 aSig0
|= UINT64_C(0x0001000000000000);
7558 } else if (aSig0
== 0 && aSig1
== 0) {
7566 } else if (n
< -0x10000) {
7571 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7576 static void __attribute__((constructor
)) softfloat_init(void)
7578 union_float64 ua
, ub
, uc
, ur
;
7580 if (QEMU_NO_HARDFLOAT
) {
7584 * Test that the host's FMA is not obviously broken. For example,
7585 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7586 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7588 ua
.s
= 0x0020000000000001ULL
;
7589 ub
.s
= 0x3ca0000000000000ULL
;
7590 uc
.s
= 0x0020000000000000ULL
;
7591 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7592 if (ur
.s
!= 0x0020000000000001ULL
) {
7593 force_soft_fma
= true;