4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
/*
 * Input-flush helper generators.
 * GEN_INPUT_FLUSH__NOCHECK emits a helper that replaces a denormal input
 * with a signed zero and raises float_flag_input_denormal, without first
 * consulting float_status.  GEN_INPUT_FLUSH1/2/3 emit wrappers that skip
 * the flush entirely unless s->flush_inputs_to_zero is set, for 1, 2 and
 * 3 operands respectively.
 * NOTE(review): this extraction has dropped several macro body lines
 * (opening/closing braces and their '\' continuations); restore from
 * upstream before editing the macros themselves.
 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
/*
 * Per-host tuning of the hardfloat fast paths, driven by fp-bench results:
 * the *_USE_FP macros pick fpclassify() vs float32/64_* primitives per
 * operand count/size, QEMU_HARDFLOAT_USE_ISINF picks isinf() vs
 * float{32,64}_is_infinity, and QEMU_NO_HARDFLOAT disables the fast paths
 * entirely (TARGET_PPC clears flags before most FP ops; -ffast-math makes
 * the host inexact flag unreliable).
 * NOTE(review): the '#else'/'#endif' lines of these conditionals were
 * dropped by this extraction -- restore from upstream before editing.
 */
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
/*
 * Return true when the host-FPU fast path may be used: hardfloat must be
 * compiled in, the guest's inexact flag must already be set (so we need not
 * re-detect it -- see the optimization note above), and rounding must be
 * round-to-nearest-even.
 * NOTE(review): the body of the QEMU_NO_HARDFLOAT branch and the brace
 * lines are missing from this extraction -- compare with upstream before
 * editing.
 */
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
/*
 * Function-pointer types used by the float{32,64}_gen2 hardfloat wrappers:
 * *_check_fn are the pre/post operand checks, soft_*_op2_fn the softfloat
 * fallbacks, hard_*_op2_fn the native host-FP operations.
 * NOTE(review): the union_float32/union_float64 typedefs these rely on are
 * missing from this extraction.
 */
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
/*
 * Generic 2-operand float32 hardfloat wrapper.  When can_use_fpu() and the
 * 'pre' check allow it, compute with the host op 'hard'; a host infinity
 * result raises float_flag_overflow, and a tiny (<= FLT_MIN) result that
 * passes 'post' falls back to the softfloat op 'soft' (which also serves
 * as the general fallback).
 * NOTE(review): this extraction is missing several body lines (the
 * ua.s/ub.s operand assignments, the goto/soft labels, braces and the
 * fast-path return) -- restore from upstream before modifying.
 */
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
/*
 * Generic 2-operand float64 hardfloat wrapper; double-precision twin of
 * float32_gen2 above (fabs/DBL_MIN instead of fabsf/FLT_MIN).
 * NOTE(review): same extraction gaps as float32_gen2 (operand assignments,
 * goto labels, braces, fast-path return are missing).
 */
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
/*
 * FloatClass and the float_cmask bit masks built from it.
 * NOTE(review): this extraction dropped several enumerators
 * (float_class_zero/normal/inf/snan), the enum's closing name, and the
 * cmask enum's opening line -- restore from upstream before editing.
 */
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
/*
 * Decomposed-float structures (FloatParts64/128), the DECOMPOSED_* layout
 * constants, the FLOAT_PARAMS field-expander and the per-format FloatFmt
 * tables.
 * NOTE(review): this extraction dropped the struct bodies, the .exp_size
 * and .frac_size lines of FLOAT_PARAMS, and the initializer bodies of all
 * the *_params tables below -- restore from upstream before editing.
 */
503 * Structure holding all of the decomposed parts of a float.
504 * The exponent is unbiased and the fraction is normalized.
506 * The fraction words are stored in big-endian word ordering,
507 * so that truncation from a larger format to a smaller format
508 * can be done simply by ignoring subsequent elements.
516 /* Routines that know the structure may reference the singular name. */
519 * Routines expanded with multiple structures reference "hi" and "lo"
520 * depending on the operation. In FloatParts64, "hi" and "lo" are
521 * both the same word and aliased here.
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT 63
538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
540 /* Structure holding all of the relevant parameters for a format.
541 * exp_size: the size of the exponent field
542 * exp_bias: the offset applied to the exponent field
543 * exp_max: the maximum normalised exponent
544 * frac_size: the size of the fraction field
545 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546 * The following are computed based the size of fraction
547 * frac_lsb: least significant bit of fraction
548 * frac_lsbm1: the bit below the least significant bit (for rounding)
549 * round_mask/roundeven_mask: masks used for rounding
550 * The following optional modifiers are available:
551 * arm_althp: handle ARM Alternative Half Precision
562 uint64_t roundeven_mask
;
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F) \
569 .exp_bias = ((1 << E) - 1) >> 1, \
570 .exp_max = (1 << E) - 1, \
572 .frac_shift = (-F - 1) & 63, \
573 .frac_lsb = 1ull << ((-F - 1) & 63), \
574 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \
575 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \
576 .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
578 static const FloatFmt float16_params
= {
582 static const FloatFmt float16_params_ahp
= {
587 static const FloatFmt bfloat16_params
= {
591 static const FloatFmt float32_params
= {
595 static const FloatFmt float64_params
= {
599 static const FloatFmt float128_params
= {
600 FLOAT_PARAMS(15, 112)
603 /* Unpack a float to parts, but do not canonicalize. */
604 static void unpack_raw64(FloatParts64
*r
, const FloatFmt
*fmt
, uint64_t raw
)
606 const int f_size
= fmt
->frac_size
;
607 const int e_size
= fmt
->exp_size
;
609 *r
= (FloatParts64
) {
610 .cls
= float_class_unclassified
,
611 .sign
= extract64(raw
, f_size
+ e_size
, 1),
612 .exp
= extract64(raw
, f_size
, e_size
),
613 .frac
= extract64(raw
, 0, f_size
)
617 static inline void float16_unpack_raw(FloatParts64
*p
, float16 f
)
619 unpack_raw64(p
, &float16_params
, f
);
622 static inline void bfloat16_unpack_raw(FloatParts64
*p
, bfloat16 f
)
624 unpack_raw64(p
, &bfloat16_params
, f
);
627 static inline void float32_unpack_raw(FloatParts64
*p
, float32 f
)
629 unpack_raw64(p
, &float32_params
, f
);
632 static inline void float64_unpack_raw(FloatParts64
*p
, float64 f
)
634 unpack_raw64(p
, &float64_params
, f
);
/*
 * Unpack a float128 to parts without canonicalizing; sign/exp/frac_hi come
 * from the high word.
 * NOTE(review): the .frac_lo initializer, the closing of the compound
 * literal and the function's braces are missing from this extraction --
 * restore from upstream before editing.
 */
637 static void float128_unpack_raw(FloatParts128
*p
, float128 f
)
639 const int f_size
= float128_params
.frac_size
- 64;
640 const int e_size
= float128_params
.exp_size
;
642 *p
= (FloatParts128
) {
643 .cls
= float_class_unclassified
,
644 .sign
= extract64(f
.high
, f_size
+ e_size
, 1),
645 .exp
= extract64(f
.high
, f_size
, e_size
),
646 .frac_hi
= extract64(f
.high
, 0, f_size
),
651 /* Pack a float from parts, but do not canonicalize. */
652 static uint64_t pack_raw64(const FloatParts64
*p
, const FloatFmt
*fmt
)
654 const int f_size
= fmt
->frac_size
;
655 const int e_size
= fmt
->exp_size
;
658 ret
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
659 ret
= deposit64(ret
, f_size
, e_size
, p
->exp
);
660 ret
= deposit64(ret
, 0, f_size
, p
->frac
);
664 static inline float16
float16_pack_raw(const FloatParts64
*p
)
666 return make_float16(pack_raw64(p
, &float16_params
));
669 static inline bfloat16
bfloat16_pack_raw(const FloatParts64
*p
)
671 return pack_raw64(p
, &bfloat16_params
);
674 static inline float32
float32_pack_raw(const FloatParts64
*p
)
676 return make_float32(pack_raw64(p
, &float32_params
));
679 static inline float64
float64_pack_raw(const FloatParts64
*p
)
681 return make_float64(pack_raw64(p
, &float64_params
));
684 static float128
float128_pack_raw(const FloatParts128
*p
)
686 const int f_size
= float128_params
.frac_size
- 64;
687 const int e_size
= float128_params
.exp_size
;
690 hi
= (uint64_t)p
->sign
<< (f_size
+ e_size
);
691 hi
= deposit64(hi
, f_size
, e_size
, p
->exp
);
692 hi
= deposit64(hi
, 0, f_size
, p
->frac_hi
);
693 return make_float128(hi
, p
->frac_lo
);
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine: (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output. These details are target-
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
/*
 * Dispatch a parts operation to its 64- or 128-bit implementation based
 * on the static type of the FloatParts pointer P.
 */
706 #define PARTS_GENERIC_64_128(NAME, P) \
707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)
712 static void parts64_return_nan(FloatParts64
*a
, float_status
*s
);
713 static void parts128_return_nan(FloatParts128
*a
, float_status
*s
);
715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)
717 static FloatParts64
*parts64_pick_nan(FloatParts64
*a
, FloatParts64
*b
,
719 static FloatParts128
*parts128_pick_nan(FloatParts128
*a
, FloatParts128
*b
,
722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
724 static FloatParts64
*parts64_pick_nan_muladd(FloatParts64
*a
, FloatParts64
*b
,
725 FloatParts64
*c
, float_status
*s
,
726 int ab_mask
, int abc_mask
);
727 static FloatParts128
*parts128_pick_nan_muladd(FloatParts128
*a
,
731 int ab_mask
, int abc_mask
);
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
736 static void parts64_canonicalize(FloatParts64
*p
, float_status
*status
,
737 const FloatFmt
*fmt
);
738 static void parts128_canonicalize(FloatParts128
*p
, float_status
*status
,
739 const FloatFmt
*fmt
);
741 #define parts_canonicalize(A, S, F) \
742 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
744 static void parts64_uncanon(FloatParts64
*p
, float_status
*status
,
745 const FloatFmt
*fmt
);
746 static void parts128_uncanon(FloatParts128
*p
, float_status
*status
,
747 const FloatFmt
*fmt
);
749 #define parts_uncanon(A, S, F) \
750 PARTS_GENERIC_64_128(uncanon, A)(A, S, F)
752 static void parts64_add_normal(FloatParts64
*a
, FloatParts64
*b
);
753 static void parts128_add_normal(FloatParts128
*a
, FloatParts128
*b
);
755 #define parts_add_normal(A, B) \
756 PARTS_GENERIC_64_128(add_normal, A)(A, B)
758 static bool parts64_sub_normal(FloatParts64
*a
, FloatParts64
*b
);
759 static bool parts128_sub_normal(FloatParts128
*a
, FloatParts128
*b
);
761 #define parts_sub_normal(A, B) \
762 PARTS_GENERIC_64_128(sub_normal, A)(A, B)
764 static FloatParts64
*parts64_addsub(FloatParts64
*a
, FloatParts64
*b
,
765 float_status
*s
, bool subtract
);
766 static FloatParts128
*parts128_addsub(FloatParts128
*a
, FloatParts128
*b
,
767 float_status
*s
, bool subtract
);
769 #define parts_addsub(A, B, S, Z) \
770 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)
773 * Helper functions for softfloat-parts.c.inc, per-size operations.
/*
 * Dispatch a fraction operation to frac64_* or frac128_* based on the
 * static type of the FloatParts pointer P.
 */
776 #define FRAC_GENERIC_64_128(NAME, P) \
777 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
779 static bool frac64_add(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
781 return uadd64_overflow(a
->frac
, b
->frac
, &r
->frac
);
784 static bool frac128_add(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
787 r
->frac_lo
= uadd64_carry(a
->frac_lo
, b
->frac_lo
, &c
);
788 r
->frac_hi
= uadd64_carry(a
->frac_hi
, b
->frac_hi
, &c
);
792 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B)
794 static bool frac64_addi(FloatParts64
*r
, FloatParts64
*a
, uint64_t c
)
796 return uadd64_overflow(a
->frac
, c
, &r
->frac
);
799 static bool frac128_addi(FloatParts128
*r
, FloatParts128
*a
, uint64_t c
)
801 c
= uadd64_overflow(a
->frac_lo
, c
, &r
->frac_lo
);
802 return uadd64_overflow(a
->frac_hi
, c
, &r
->frac_hi
);
805 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C)
807 static void frac64_allones(FloatParts64
*a
)
812 static void frac128_allones(FloatParts128
*a
)
814 a
->frac_hi
= a
->frac_lo
= -1;
817 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A)
819 static int frac64_cmp(FloatParts64
*a
, FloatParts64
*b
)
821 return a
->frac
== b
->frac
? 0 : a
->frac
< b
->frac
? -1 : 1;
/*
 * Three-way compare of 128-bit fractions: high words first, low words as
 * tie-break; returns -1, 0 or 1.
 * NOTE(review): the nested 'if (ta == tb)' lines, the 'return 0' branch
 * and the braces are missing from this extraction -- restore from
 * upstream before editing.
 */
824 static int frac128_cmp(FloatParts128
*a
, FloatParts128
*b
)
826 uint64_t ta
= a
->frac_hi
, tb
= b
->frac_hi
;
828 ta
= a
->frac_lo
, tb
= b
->frac_lo
;
833 return ta
< tb
? -1 : 1;
836 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)
838 static void frac64_clear(FloatParts64
*a
)
843 static void frac128_clear(FloatParts128
*a
)
845 a
->frac_hi
= a
->frac_lo
= 0;
848 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A)
850 static bool frac64_eqz(FloatParts64
*a
)
855 static bool frac128_eqz(FloatParts128
*a
)
857 return (a
->frac_hi
| a
->frac_lo
) == 0;
860 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)
862 static void frac64_neg(FloatParts64
*a
)
867 static void frac128_neg(FloatParts128
*a
)
870 a
->frac_lo
= usub64_borrow(0, a
->frac_lo
, &c
);
871 a
->frac_hi
= usub64_borrow(0, a
->frac_hi
, &c
);
874 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A)
/*
 * Shift the fraction left until its most significant bit is set, and
 * return the shift amount (via clz64).
 * NOTE(review): this extraction is missing the shift application and
 * return in frac64_normalize, and much of frac128_normalize's control
 * flow (the shl/shr setup, the all-zero case, returns and braces) --
 * restore from upstream before editing.
 */
876 static int frac64_normalize(FloatParts64
*a
)
879 int shift
= clz64(a
->frac
);
886 static int frac128_normalize(FloatParts128
*a
)
889 int shl
= clz64(a
->frac_hi
);
892 a
->frac_hi
= (a
->frac_hi
<< shl
) | (a
->frac_lo
>> shr
);
893 a
->frac_lo
= (a
->frac_lo
<< shl
);
896 } else if (a
->frac_lo
) {
897 int shl
= clz64(a
->frac_lo
);
898 a
->frac_hi
= (a
->frac_lo
<< shl
);
905 #define frac_normalize(A) FRAC_GENERIC_64_128(normalize, A)(A)
907 static void frac64_shl(FloatParts64
*a
, int c
)
912 static void frac128_shl(FloatParts128
*a
, int c
)
914 shift128Left(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
917 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)
919 static void frac64_shr(FloatParts64
*a
, int c
)
924 static void frac128_shr(FloatParts128
*a
, int c
)
926 shift128Right(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
929 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)
931 static void frac64_shrjam(FloatParts64
*a
, int c
)
933 shift64RightJamming(a
->frac
, c
, &a
->frac
);
936 static void frac128_shrjam(FloatParts128
*a
, int c
)
938 shift128RightJamming(a
->frac_hi
, a
->frac_lo
, c
, &a
->frac_hi
, &a
->frac_lo
);
941 #define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C)
943 static bool frac64_sub(FloatParts64
*r
, FloatParts64
*a
, FloatParts64
*b
)
945 return usub64_overflow(a
->frac
, b
->frac
, &r
->frac
);
948 static bool frac128_sub(FloatParts128
*r
, FloatParts128
*a
, FloatParts128
*b
)
951 r
->frac_lo
= usub64_borrow(a
->frac_lo
, b
->frac_lo
, &c
);
952 r
->frac_hi
= usub64_borrow(a
->frac_hi
, b
->frac_hi
, &c
);
956 #define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B)
/*
 * Name generators for the N-expanded parts code, then two expansions of
 * the shared softfloat-parts templates.
 * NOTE(review): the '#define N 64' / '#undef N' / '#define N 128' lines
 * that select each expansion were dropped by this extraction -- restore
 * from upstream before editing.
 */
958 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
959 #define FloatPartsN glue(FloatParts,N)
963 #include "softfloat-parts-addsub.c.inc"
964 #include "softfloat-parts.c.inc"
969 #include "softfloat-parts-addsub.c.inc"
970 #include "softfloat-parts.c.inc"
977 * Pack/unpack routines with a specific FloatFmt.
980 static void float16a_unpack_canonical(FloatParts64
*p
, float16 f
,
981 float_status
*s
, const FloatFmt
*params
)
983 float16_unpack_raw(p
, f
);
984 parts_canonicalize(p
, s
, params
);
987 static void float16_unpack_canonical(FloatParts64
*p
, float16 f
,
990 float16a_unpack_canonical(p
, f
, s
, &float16_params
);
993 static void bfloat16_unpack_canonical(FloatParts64
*p
, bfloat16 f
,
996 bfloat16_unpack_raw(p
, f
);
997 parts_canonicalize(p
, s
, &bfloat16_params
);
1000 static float16
float16a_round_pack_canonical(FloatParts64
*p
,
1002 const FloatFmt
*params
)
1004 parts_uncanon(p
, s
, params
);
1005 return float16_pack_raw(p
);
1008 static float16
float16_round_pack_canonical(FloatParts64
*p
,
1011 return float16a_round_pack_canonical(p
, s
, &float16_params
);
1014 static bfloat16
bfloat16_round_pack_canonical(FloatParts64
*p
,
1017 parts_uncanon(p
, s
, &bfloat16_params
);
1018 return bfloat16_pack_raw(p
);
1021 static void float32_unpack_canonical(FloatParts64
*p
, float32 f
,
1024 float32_unpack_raw(p
, f
);
1025 parts_canonicalize(p
, s
, &float32_params
);
1028 static float32
float32_round_pack_canonical(FloatParts64
*p
,
1031 parts_uncanon(p
, s
, &float32_params
);
1032 return float32_pack_raw(p
);
1035 static void float64_unpack_canonical(FloatParts64
*p
, float64 f
,
1038 float64_unpack_raw(p
, f
);
1039 parts_canonicalize(p
, s
, &float64_params
);
1042 static float64
float64_round_pack_canonical(FloatParts64
*p
,
1045 parts_uncanon(p
, s
, &float64_params
);
1046 return float64_pack_raw(p
);
1050 * Addition and subtraction
1053 static float16 QEMU_FLATTEN
1054 float16_addsub(float16 a
, float16 b
, float_status
*status
, bool subtract
)
1056 FloatParts64 pa
, pb
, *pr
;
1058 float16_unpack_canonical(&pa
, a
, status
);
1059 float16_unpack_canonical(&pb
, b
, status
);
1060 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1062 return float16_round_pack_canonical(pr
, status
);
1065 float16
float16_add(float16 a
, float16 b
, float_status
*status
)
1067 return float16_addsub(a
, b
, status
, false);
1070 float16
float16_sub(float16 a
, float16 b
, float_status
*status
)
1072 return float16_addsub(a
, b
, status
, true);
1075 static float32 QEMU_SOFTFLOAT_ATTR
1076 soft_f32_addsub(float32 a
, float32 b
, float_status
*status
, bool subtract
)
1078 FloatParts64 pa
, pb
, *pr
;
1080 float32_unpack_canonical(&pa
, a
, status
);
1081 float32_unpack_canonical(&pb
, b
, status
);
1082 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1084 return float32_round_pack_canonical(pr
, status
);
1087 static float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1089 return soft_f32_addsub(a
, b
, status
, false);
1092 static float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1094 return soft_f32_addsub(a
, b
, status
, true);
1097 static float64 QEMU_SOFTFLOAT_ATTR
1098 soft_f64_addsub(float64 a
, float64 b
, float_status
*status
, bool subtract
)
1100 FloatParts64 pa
, pb
, *pr
;
1102 float64_unpack_canonical(&pa
, a
, status
);
1103 float64_unpack_canonical(&pb
, b
, status
);
1104 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1106 return float64_round_pack_canonical(pr
, status
);
1109 static float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1111 return soft_f64_addsub(a
, b
, status
, false);
1114 static float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1116 return soft_f64_addsub(a
, b
, status
, true);
/* Host-FPU fast-path primitives used by the hardfloat dispatchers. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1139 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1141 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1142 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1144 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1147 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1149 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1150 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1152 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1156 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1157 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1159 return float32_gen2(a
, b
, s
, hard
, soft
,
1160 f32_is_zon2
, f32_addsubmul_post
);
1163 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1164 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1166 return float64_gen2(a
, b
, s
, hard
, soft
,
1167 f64_is_zon2
, f64_addsubmul_post
);
1170 float32 QEMU_FLATTEN
1171 float32_add(float32 a
, float32 b
, float_status
*s
)
1173 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1176 float32 QEMU_FLATTEN
1177 float32_sub(float32 a
, float32 b
, float_status
*s
)
1179 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1182 float64 QEMU_FLATTEN
1183 float64_add(float64 a
, float64 b
, float_status
*s
)
1185 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1188 float64 QEMU_FLATTEN
1189 float64_sub(float64 a
, float64 b
, float_status
*s
)
1191 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1194 static bfloat16 QEMU_FLATTEN
1195 bfloat16_addsub(bfloat16 a
, bfloat16 b
, float_status
*status
, bool subtract
)
1197 FloatParts64 pa
, pb
, *pr
;
1199 bfloat16_unpack_canonical(&pa
, a
, status
);
1200 bfloat16_unpack_canonical(&pb
, b
, status
);
1201 pr
= parts_addsub(&pa
, &pb
, status
, subtract
);
1203 return bfloat16_round_pack_canonical(pr
, status
);
1206 bfloat16
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1208 return bfloat16_addsub(a
, b
, status
, false);
1211 bfloat16
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1213 return bfloat16_addsub(a
, b
, status
, true);
1217 * Returns the result of multiplying the floating-point values `a' and
1218 * `b'. The operation is performed according to the IEC/IEEE Standard
1219 * for Binary Floating-Point Arithmetic.
1222 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1224 bool sign
= a
.sign
^ b
.sign
;
1226 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1228 int exp
= a
.exp
+ b
.exp
;
1230 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1231 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1244 /* handle all the NaN cases */
1245 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1246 return *parts_pick_nan(&a
, &b
, s
);
1248 /* Inf * Zero == NaN */
1249 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1250 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1251 float_raise(float_flag_invalid
, s
);
1252 parts_default_nan(&a
, s
);
1255 /* Multiply by 0 or Inf */
1256 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1260 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1264 g_assert_not_reached();
1267 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1269 FloatParts64 pa
, pb
, pr
;
1271 float16_unpack_canonical(&pa
, a
, status
);
1272 float16_unpack_canonical(&pb
, b
, status
);
1273 pr
= mul_floats(pa
, pb
, status
);
1275 return float16_round_pack_canonical(&pr
, status
);
1278 static float32 QEMU_SOFTFLOAT_ATTR
1279 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1281 FloatParts64 pa
, pb
, pr
;
1283 float32_unpack_canonical(&pa
, a
, status
);
1284 float32_unpack_canonical(&pb
, b
, status
);
1285 pr
= mul_floats(pa
, pb
, status
);
1287 return float32_round_pack_canonical(&pr
, status
);
1290 static float64 QEMU_SOFTFLOAT_ATTR
1291 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1293 FloatParts64 pa
, pb
, pr
;
1295 float64_unpack_canonical(&pa
, a
, status
);
1296 float64_unpack_canonical(&pb
, b
, status
);
1297 pr
= mul_floats(pa
, pb
, status
);
1299 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU multiply primitives for the hardfloat fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1312 float32 QEMU_FLATTEN
1313 float32_mul(float32 a
, float32 b
, float_status
*s
)
1315 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1316 f32_is_zon2
, f32_addsubmul_post
);
1319 float64 QEMU_FLATTEN
1320 float64_mul(float64 a
, float64 b
, float_status
*s
)
1322 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1323 f64_is_zon2
, f64_addsubmul_post
);
1327 * Returns the result of multiplying the bfloat16
1328 * values `a' and `b'.
1331 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1333 FloatParts64 pa
, pb
, pr
;
1335 bfloat16_unpack_canonical(&pa
, a
, status
);
1336 bfloat16_unpack_canonical(&pb
, b
, status
);
1337 pr
= mul_floats(pa
, pb
, status
);
1339 return bfloat16_round_pack_canonical(&pr
, status
);
1343 * Returns the result of multiplying the floating-point values `a' and
1344 * `b' then adding 'c', with no intermediate rounding step after the
1345 * multiplication. The operation is performed according to the
1346 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1347 * The flags argument allows the caller to select negation of the
1348 * addend, the intermediate product, or the final result. (The
1349 * difference between this and having the caller do a separate
1350 * negation is that negating externally will flip the sign bit on
1354 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1355 int flags
, float_status
*s
)
1357 bool inf_zero
, p_sign
;
1358 bool sign_flip
= flags
& float_muladd_negate_result
;
1362 int ab_mask
, abc_mask
;
1364 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1365 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1366 inf_zero
= ab_mask
== float_cmask_infzero
;
1368 /* It is implementation-defined whether the cases of (0,inf,qnan)
1369 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1370 * they return if they do), so we have to hand this information
1371 * off to the target-specific pick-a-NaN routine.
1373 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1374 return *parts_pick_nan_muladd(&a
, &b
, &c
, s
, ab_mask
, abc_mask
);
1378 float_raise(float_flag_invalid
, s
);
1379 parts_default_nan(&a
, s
);
1383 if (flags
& float_muladd_negate_c
) {
1387 p_sign
= a
.sign
^ b
.sign
;
1389 if (flags
& float_muladd_negate_product
) {
1393 if (ab_mask
& float_cmask_inf
) {
1394 p_class
= float_class_inf
;
1395 } else if (ab_mask
& float_cmask_zero
) {
1396 p_class
= float_class_zero
;
1398 p_class
= float_class_normal
;
1401 if (c
.cls
== float_class_inf
) {
1402 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1403 float_raise(float_flag_invalid
, s
);
1404 parts_default_nan(&c
, s
);
1406 c
.sign
^= sign_flip
;
1411 if (p_class
== float_class_inf
) {
1412 a
.cls
= float_class_inf
;
1413 a
.sign
= p_sign
^ sign_flip
;
1417 if (p_class
== float_class_zero
) {
1418 if (c
.cls
== float_class_zero
) {
1419 if (p_sign
!= c
.sign
) {
1420 p_sign
= s
->float_rounding_mode
== float_round_down
;
1423 } else if (flags
& float_muladd_halve_result
) {
1426 c
.sign
^= sign_flip
;
1430 /* a & b should be normals now... */
1431 assert(a
.cls
== float_class_normal
&&
1432 b
.cls
== float_class_normal
);
1434 p_exp
= a
.exp
+ b
.exp
;
1436 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1438 /* Renormalize to the msb. */
1439 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1442 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1446 if (c
.cls
!= float_class_zero
) {
1447 int exp_diff
= p_exp
- c
.exp
;
1448 if (p_sign
== c
.sign
) {
1450 if (exp_diff
<= 0) {
1451 shift64RightJamming(hi
, -exp_diff
, &hi
);
1453 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1454 shift64RightJamming(hi
, 1, &hi
);
1455 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1459 uint64_t c_hi
, c_lo
, over
;
1460 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1461 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1463 shift64RightJamming(hi
, 1, &hi
);
1464 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1470 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1472 if (exp_diff
<= 0) {
1473 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1476 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1477 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1479 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1484 shift128RightJamming(c_hi
, c_lo
,
1487 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1490 if (hi
== 0 && lo
== 0) {
1491 a
.cls
= float_class_zero
;
1492 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1493 a
.sign
^= sign_flip
;
1500 shift
= clz64(lo
) + 64;
1502 /* Normalizing to a binary point of 124 is the
1503 correct adjust for the exponent. However since we're
1504 shifting, we might as well put the binary point back
1505 at 63 where we really want it. Therefore shift as
1506 if we're leaving 1 bit at the top of the word, but
1507 adjust the exponent as if we're leaving 3 bits. */
1508 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1515 if (flags
& float_muladd_halve_result
) {
1519 /* finally prepare our result */
1520 a
.cls
= float_class_normal
;
1521 a
.sign
= p_sign
^ sign_flip
;
1528 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1529 int flags
, float_status
*status
)
1531 FloatParts64 pa
, pb
, pc
, pr
;
1533 float16_unpack_canonical(&pa
, a
, status
);
1534 float16_unpack_canonical(&pb
, b
, status
);
1535 float16_unpack_canonical(&pc
, c
, status
);
1536 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1538 return float16_round_pack_canonical(&pr
, status
);
1541 static float32 QEMU_SOFTFLOAT_ATTR
1542 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1543 float_status
*status
)
1545 FloatParts64 pa
, pb
, pc
, pr
;
1547 float32_unpack_canonical(&pa
, a
, status
);
1548 float32_unpack_canonical(&pb
, b
, status
);
1549 float32_unpack_canonical(&pc
, c
, status
);
1550 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1552 return float32_round_pack_canonical(&pr
, status
);
1555 static float64 QEMU_SOFTFLOAT_ATTR
1556 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1557 float_status
*status
)
1559 FloatParts64 pa
, pb
, pc
, pr
;
1561 float64_unpack_canonical(&pa
, a
, status
);
1562 float64_unpack_canonical(&pb
, b
, status
);
1563 float64_unpack_canonical(&pc
, c
, status
);
1564 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1566 return float64_round_pack_canonical(&pr
, status
);
1569 static bool force_soft_fma
;
1571 float32 QEMU_FLATTEN
1572 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1574 union_float32 ua
, ub
, uc
, ur
;
1580 if (unlikely(!can_use_fpu(s
))) {
1583 if (unlikely(flags
& float_muladd_halve_result
)) {
1587 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1588 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1592 if (unlikely(force_soft_fma
)) {
1597 * When (a || b) == 0, there's no need to check for under/over flow,
1598 * since we know the addend is (normal || 0) and the product is 0.
1600 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1604 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1605 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1606 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1608 if (flags
& float_muladd_negate_c
) {
1613 union_float32 ua_orig
= ua
;
1614 union_float32 uc_orig
= uc
;
1616 if (flags
& float_muladd_negate_product
) {
1619 if (flags
& float_muladd_negate_c
) {
1623 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1625 if (unlikely(f32_is_inf(ur
))) {
1626 float_raise(float_flag_overflow
, s
);
1627 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1633 if (flags
& float_muladd_negate_result
) {
1634 return float32_chs(ur
.s
);
1639 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1642 float64 QEMU_FLATTEN
1643 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1645 union_float64 ua
, ub
, uc
, ur
;
1651 if (unlikely(!can_use_fpu(s
))) {
1654 if (unlikely(flags
& float_muladd_halve_result
)) {
1658 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1659 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1663 if (unlikely(force_soft_fma
)) {
1668 * When (a || b) == 0, there's no need to check for under/over flow,
1669 * since we know the addend is (normal || 0) and the product is 0.
1671 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1675 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1676 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1677 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1679 if (flags
& float_muladd_negate_c
) {
1684 union_float64 ua_orig
= ua
;
1685 union_float64 uc_orig
= uc
;
1687 if (flags
& float_muladd_negate_product
) {
1690 if (flags
& float_muladd_negate_c
) {
1694 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1696 if (unlikely(f64_is_inf(ur
))) {
1697 float_raise(float_flag_overflow
, s
);
1698 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1704 if (flags
& float_muladd_negate_result
) {
1705 return float64_chs(ur
.s
);
1710 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1714 * Returns the result of multiplying the bfloat16 values `a'
1715 * and `b' then adding 'c', with no intermediate rounding step after the
1719 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1720 int flags
, float_status
*status
)
1722 FloatParts64 pa
, pb
, pc
, pr
;
1724 bfloat16_unpack_canonical(&pa
, a
, status
);
1725 bfloat16_unpack_canonical(&pb
, b
, status
);
1726 bfloat16_unpack_canonical(&pc
, c
, status
);
1727 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1729 return bfloat16_round_pack_canonical(&pr
, status
);
1733 * Returns the result of dividing the floating-point value `a' by the
1734 * corresponding value `b'. The operation is performed according to
1735 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1738 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1740 bool sign
= a
.sign
^ b
.sign
;
1742 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1743 uint64_t n0
, n1
, q
, r
;
1744 int exp
= a
.exp
- b
.exp
;
1747 * We want a 2*N / N-bit division to produce exactly an N-bit
1748 * result, so that we do not lose any precision and so that we
1749 * do not have to renormalize afterward. If A.frac < B.frac,
1750 * then division would produce an (N-1)-bit result; shift A left
1751 * by one to produce the an N-bit result, and decrement the
1752 * exponent to match.
1754 * The udiv_qrnnd algorithm that we're using requires normalization,
1755 * i.e. the msb of the denominator must be set, which is already true.
1757 if (a
.frac
< b
.frac
) {
1759 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1761 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1763 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1765 /* Set lsb if there is a remainder, to set inexact. */
1766 a
.frac
= q
| (r
!= 0);
1771 /* handle all the NaN cases */
1772 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1773 return *parts_pick_nan(&a
, &b
, s
);
1775 /* 0/0 or Inf/Inf */
1778 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1779 float_raise(float_flag_invalid
, s
);
1780 parts_default_nan(&a
, s
);
1783 /* Inf / x or 0 / x */
1784 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1789 if (b
.cls
== float_class_zero
) {
1790 float_raise(float_flag_divbyzero
, s
);
1791 a
.cls
= float_class_inf
;
1796 if (b
.cls
== float_class_inf
) {
1797 a
.cls
= float_class_zero
;
1801 g_assert_not_reached();
1804 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1806 FloatParts64 pa
, pb
, pr
;
1808 float16_unpack_canonical(&pa
, a
, status
);
1809 float16_unpack_canonical(&pb
, b
, status
);
1810 pr
= div_floats(pa
, pb
, status
);
1812 return float16_round_pack_canonical(&pr
, status
);
1815 static float32 QEMU_SOFTFLOAT_ATTR
1816 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1818 FloatParts64 pa
, pb
, pr
;
1820 float32_unpack_canonical(&pa
, a
, status
);
1821 float32_unpack_canonical(&pb
, b
, status
);
1822 pr
= div_floats(pa
, pb
, status
);
1824 return float32_round_pack_canonical(&pr
, status
);
1827 static float64 QEMU_SOFTFLOAT_ATTR
1828 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1830 FloatParts64 pa
, pb
, pr
;
1832 float64_unpack_canonical(&pa
, a
, status
);
1833 float64_unpack_canonical(&pb
, b
, status
);
1834 pr
= div_floats(pa
, pb
, status
);
1836 return float64_round_pack_canonical(&pr
, status
);
/* Host-FPU divide primitives for the hardfloat fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}
1849 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1851 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1852 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1853 fpclassify(b
.h
) == FP_NORMAL
;
1855 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1858 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1860 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1861 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1862 fpclassify(b
.h
) == FP_NORMAL
;
1864 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1867 static bool f32_div_post(union_float32 a
, union_float32 b
)
1869 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1870 return fpclassify(a
.h
) != FP_ZERO
;
1872 return !float32_is_zero(a
.s
);
1875 static bool f64_div_post(union_float64 a
, union_float64 b
)
1877 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1878 return fpclassify(a
.h
) != FP_ZERO
;
1880 return !float64_is_zero(a
.s
);
1883 float32 QEMU_FLATTEN
1884 float32_div(float32 a
, float32 b
, float_status
*s
)
1886 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1887 f32_div_pre
, f32_div_post
);
1890 float64 QEMU_FLATTEN
1891 float64_div(float64 a
, float64 b
, float_status
*s
)
1893 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1894 f64_div_pre
, f64_div_post
);
1898 * Returns the result of dividing the bfloat16
1899 * value `a' by the corresponding value `b'.
1902 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1904 FloatParts64 pa
, pb
, pr
;
1906 bfloat16_unpack_canonical(&pa
, a
, status
);
1907 bfloat16_unpack_canonical(&pb
, b
, status
);
1908 pr
= div_floats(pa
, pb
, status
);
1910 return bfloat16_round_pack_canonical(&pr
, status
);
1914 * Float to Float conversions
1916 * Returns the result of converting one float format to another. The
1917 * conversion is performed according to the IEC/IEEE Standard for
1918 * Binary Floating-Point Arithmetic.
1920 * The float_to_float helper only needs to take care of raising
1921 * invalid exceptions and handling the conversion on NaNs.
1924 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1927 if (dstf
->arm_althp
) {
1929 case float_class_qnan
:
1930 case float_class_snan
:
1931 /* There is no NaN in the destination format. Raise Invalid
1932 * and return a zero with the sign of the input NaN.
1934 float_raise(float_flag_invalid
, s
);
1935 a
.cls
= float_class_zero
;
1940 case float_class_inf
:
1941 /* There is no Inf in the destination format. Raise Invalid
1942 * and return the maximum normal with the correct sign.
1944 float_raise(float_flag_invalid
, s
);
1945 a
.cls
= float_class_normal
;
1946 a
.exp
= dstf
->exp_max
;
1947 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1953 } else if (is_nan(a
.cls
)) {
1954 parts_return_nan(&a
, s
);
1959 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1961 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1962 FloatParts64 pa
, pr
;
1964 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
1965 pr
= float_to_float(pa
, &float32_params
, s
);
1966 return float32_round_pack_canonical(&pr
, s
);
1969 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1971 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1972 FloatParts64 pa
, pr
;
1974 float16a_unpack_canonical(&pa
, a
, s
, fmt16
);
1975 pr
= float_to_float(pa
, &float64_params
, s
);
1976 return float64_round_pack_canonical(&pr
, s
);
1979 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1981 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1982 FloatParts64 pa
, pr
;
1984 float32_unpack_canonical(&pa
, a
, s
);
1985 pr
= float_to_float(pa
, fmt16
, s
);
1986 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
1989 static float64 QEMU_SOFTFLOAT_ATTR
1990 soft_float32_to_float64(float32 a
, float_status
*s
)
1992 FloatParts64 pa
, pr
;
1994 float32_unpack_canonical(&pa
, a
, s
);
1995 pr
= float_to_float(pa
, &float64_params
, s
);
1996 return float64_round_pack_canonical(&pr
, s
);
1999 float64
float32_to_float64(float32 a
, float_status
*s
)
2001 if (likely(float32_is_normal(a
))) {
2002 /* Widening conversion can never produce inexact results. */
2008 } else if (float32_is_zero(a
)) {
2009 return float64_set_sign(float64_zero
, float32_is_neg(a
));
2011 return soft_float32_to_float64(a
, s
);
2015 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2017 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2018 FloatParts64 pa
, pr
;
2020 float64_unpack_canonical(&pa
, a
, s
);
2021 pr
= float_to_float(pa
, fmt16
, s
);
2022 return float16a_round_pack_canonical(&pr
, s
, fmt16
);
2025 float32
float64_to_float32(float64 a
, float_status
*s
)
2027 FloatParts64 pa
, pr
;
2029 float64_unpack_canonical(&pa
, a
, s
);
2030 pr
= float_to_float(pa
, &float32_params
, s
);
2031 return float32_round_pack_canonical(&pr
, s
);
2034 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2036 FloatParts64 pa
, pr
;
2038 bfloat16_unpack_canonical(&pa
, a
, s
);
2039 pr
= float_to_float(pa
, &float32_params
, s
);
2040 return float32_round_pack_canonical(&pr
, s
);
2043 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2045 FloatParts64 pa
, pr
;
2047 bfloat16_unpack_canonical(&pa
, a
, s
);
2048 pr
= float_to_float(pa
, &float64_params
, s
);
2049 return float64_round_pack_canonical(&pr
, s
);
2052 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2054 FloatParts64 pa
, pr
;
2056 float32_unpack_canonical(&pa
, a
, s
);
2057 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2058 return bfloat16_round_pack_canonical(&pr
, s
);
2061 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2063 FloatParts64 pa
, pr
;
2065 float64_unpack_canonical(&pa
, a
, s
);
2066 pr
= float_to_float(pa
, &bfloat16_params
, s
);
2067 return bfloat16_round_pack_canonical(&pr
, s
);
2071 * Rounds the floating-point value `a' to an integer, and returns the
2072 * result as a floating-point value. The operation is performed
2073 * according to the IEC/IEEE Standard for Binary Floating-Point
2077 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2078 int scale
, float_status
*s
)
2081 case float_class_qnan
:
2082 case float_class_snan
:
2083 parts_return_nan(&a
, s
);
2086 case float_class_zero
:
2087 case float_class_inf
:
2088 /* already "integral" */
2091 case float_class_normal
:
2092 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2095 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2096 /* already integral */
2101 /* all fractional */
2102 float_raise(float_flag_inexact
, s
);
2104 case float_round_nearest_even
:
2105 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2107 case float_round_ties_away
:
2108 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2110 case float_round_to_zero
:
2113 case float_round_up
:
2116 case float_round_down
:
2119 case float_round_to_odd
:
2123 g_assert_not_reached();
2127 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2130 a
.cls
= float_class_zero
;
2133 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2134 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2135 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2136 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2140 case float_round_nearest_even
:
2141 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2143 case float_round_ties_away
:
2146 case float_round_to_zero
:
2149 case float_round_up
:
2150 inc
= a
.sign
? 0 : rnd_mask
;
2152 case float_round_down
:
2153 inc
= a
.sign
? rnd_mask
: 0;
2155 case float_round_to_odd
:
2156 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2159 g_assert_not_reached();
2162 if (a
.frac
& rnd_mask
) {
2163 float_raise(float_flag_inexact
, s
);
2164 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2166 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2169 a
.frac
&= ~rnd_mask
;
2174 g_assert_not_reached();
2179 float16
float16_round_to_int(float16 a
, float_status
*s
)
2181 FloatParts64 pa
, pr
;
2183 float16_unpack_canonical(&pa
, a
, s
);
2184 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2185 return float16_round_pack_canonical(&pr
, s
);
2188 float32
float32_round_to_int(float32 a
, float_status
*s
)
2190 FloatParts64 pa
, pr
;
2192 float32_unpack_canonical(&pa
, a
, s
);
2193 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2194 return float32_round_pack_canonical(&pr
, s
);
2197 float64
float64_round_to_int(float64 a
, float_status
*s
)
2199 FloatParts64 pa
, pr
;
2201 float64_unpack_canonical(&pa
, a
, s
);
2202 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2203 return float64_round_pack_canonical(&pr
, s
);
2207 * Rounds the bfloat16 value `a' to an integer, and returns the
2208 * result as a bfloat16 value.
2211 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2213 FloatParts64 pa
, pr
;
2215 bfloat16_unpack_canonical(&pa
, a
, s
);
2216 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2217 return bfloat16_round_pack_canonical(&pr
, s
);
2221 * Returns the result of converting the floating-point value `a' to
2222 * the two's complement integer format. The conversion is performed
2223 * according to the IEC/IEEE Standard for Binary Floating-Point
2224 * Arithmetic---which means in particular that the conversion is
2225 * rounded according to the current rounding mode. If `a' is a NaN,
2226 * the largest positive integer is returned. Otherwise, if the
2227 * conversion overflows, the largest integer with the same sign as `a'
2231 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2232 int scale
, int64_t min
, int64_t max
,
2236 int orig_flags
= get_float_exception_flags(s
);
2237 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2240 case float_class_snan
:
2241 case float_class_qnan
:
2242 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2244 case float_class_inf
:
2245 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2246 return p
.sign
? min
: max
;
2247 case float_class_zero
:
2249 case float_class_normal
:
2250 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2251 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2256 if (r
<= -(uint64_t) min
) {
2259 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2266 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2271 g_assert_not_reached();
2275 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2280 float16_unpack_canonical(&p
, a
, s
);
2281 return round_to_int_and_pack(p
, rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2284 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2289 float16_unpack_canonical(&p
, a
, s
);
2290 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2293 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2298 float16_unpack_canonical(&p
, a
, s
);
2299 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2302 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2307 float16_unpack_canonical(&p
, a
, s
);
2308 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2311 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2316 float32_unpack_canonical(&p
, a
, s
);
2317 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2320 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2325 float32_unpack_canonical(&p
, a
, s
);
2326 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2329 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2334 float32_unpack_canonical(&p
, a
, s
);
2335 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2338 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2343 float64_unpack_canonical(&p
, a
, s
);
2344 return round_to_int_and_pack(p
, rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2347 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2352 float64_unpack_canonical(&p
, a
, s
);
2353 return round_to_int_and_pack(p
, rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2356 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2361 float64_unpack_canonical(&p
, a
, s
);
2362 return round_to_int_and_pack(p
, rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2365 int8_t float16_to_int8(float16 a
, float_status
*s
)
2367 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2370 int16_t float16_to_int16(float16 a
, float_status
*s
)
2372 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2375 int32_t float16_to_int32(float16 a
, float_status
*s
)
2377 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2380 int64_t float16_to_int64(float16 a
, float_status
*s
)
2382 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2385 int16_t float32_to_int16(float32 a
, float_status
*s
)
2387 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2390 int32_t float32_to_int32(float32 a
, float_status
*s
)
2392 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2395 int64_t float32_to_int64(float32 a
, float_status
*s
)
2397 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2400 int16_t float64_to_int16(float64 a
, float_status
*s
)
2402 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2405 int32_t float64_to_int32(float64 a
, float_status
*s
)
2407 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2410 int64_t float64_to_int64(float64 a
, float_status
*s
)
2412 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Conversions to signed integers with round-to-zero (truncation),
 * ignoring the rounding mode held in the float_status.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2461 * Returns the result of converting the floating-point value `a' to
2462 * the two's complement integer format.
/*
 * bfloat16 -> signed integer conversions.  The _scalbn variants unpack
 * to the canonical FloatParts64 form and round/pack with saturation at
 * the destination type's limits; the plain and _round_to_zero variants
 * delegate with scale 0 and the appropriate rounding mode.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
2523 * Returns the result of converting the floating-point value `a' to
2524 * the unsigned integer format. The conversion is performed according
2525 * to the IEC/IEEE Standard for Binary Floating-Point
2526 * Arithmetic---which means in particular that the conversion is
2527 * rounded according to the current rounding mode. If `a' is a NaN,
2528 * the largest unsigned integer is returned. Otherwise, if the
2529 * conversion overflows, the largest unsigned integer is returned. If
2530 * `a' is negative, the result is rounded and zero is returned;
2531 * values that do not round to zero will raise the inexact exception
/*
 * Round the decomposed value IN to an integer (scaled by 2**scale) and
 * saturate into [0, max].  NaNs and +inf saturate at max (raising
 * invalid); -inf and rounded-negative values yield 0 (raising invalid
 * for nonzero magnitudes).  Any inexact flag raised by the rounding
 * step is suppressed when the result subsequently overflows, hence the
 * save/restore of the original exception flags.
 *
 * NOTE(review): several skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Remember the flags so an overflow can discard a spurious inexact. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative after rounding: invalid, result clamps to 0. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            r = 0;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
/*
 * float16/32/64 -> unsigned integer conversions with explicit rounding
 * mode and exponent scale.  Each unpacks to canonical parts and
 * saturates at the destination type's maximum via
 * round_to_uint_and_pack().
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2670 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2672 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2675 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2677 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2680 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2682 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2685 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2687 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2690 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2692 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2695 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2697 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2700 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2702 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2705 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2707 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2710 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2712 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2715 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2717 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Conversions to unsigned integers with round-to-zero (truncation),
 * ignoring the rounding mode held in the float_status.
 */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2766 * Returns the result of converting the bfloat16 value `a' to
2767 * the unsigned integer format.
/*
 * bfloat16 -> unsigned integer conversions.  Mirrors the float16/32/64
 * families above: _scalbn variants take explicit rounding mode and
 * scale; the rest delegate with scale 0.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
2828 * Integer to float conversions
2830 * Returns the result of converting the two's complement integer `a'
2831 * to the floating-point format. The conversion is performed according
2832 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Decompose a two's-complement integer (scaled by 2**scale) into
 * FloatParts64.  The result is exact; rounding happens later when the
 * parts are packed to a concrete format.  `status' is unused here.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* Negate via unsigned arithmetic; safe even for INT64_MIN. */
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        /* Bound the scale so exp cannot overflow its int32_t backing. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
/*
 * Signed integer -> float16 conversions.  All funnel through the
 * int64 _scalbn variant; narrower inputs are widened implicitly.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}
/*
 * Signed integer -> float32 conversions, funneled through the int64
 * _scalbn variant.
 */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}
/*
 * Signed integer -> float64 conversions, funneled through the int64
 * _scalbn variant.
 */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
2959 * Returns the result of converting the two's complement integer `a'
2960 * to the bfloat16 format.
/*
 * Signed integer -> bfloat16 conversions, funneled through the int64
 * _scalbn variant.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
2995 * Unsigned Integer to float conversions
2997 * Returns the result of converting the unsigned integer `a' to the
2998 * floating-point format. The conversion is performed according to the
2999 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
/*
 * Decompose an unsigned integer (scaled by 2**scale) into FloatParts64.
 * The result is exact; rounding happens when the parts are packed.
 * `status' is unused here.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Bound the scale so exp cannot overflow its int32_t backing. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
/*
 * Unsigned integer -> float16 conversions, funneled through the uint64
 * _scalbn variant.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}
/*
 * Unsigned integer -> float32 conversions, funneled through the uint64
 * _scalbn variant.
 */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
/*
 * Unsigned integer -> float64 conversions, funneled through the uint64
 * _scalbn variant.
 */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}
3119 * Returns the result of converting the unsigned integer `a' to the
3119 * bfloat16 format.
/*
 * Unsigned integer -> bfloat16 conversions, funneled through the uint64
 * _scalbn variant.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
3155 /* min() and max() functions. These can't be implemented as
3156 * 'compare and pick one input' because that would mishandle
3157 * NaNs and +0 vs -0.
3159 * minnum() and maxnum() functions. These are similar to the min()
3160 * and max() functions but if one of the arguments is a QNaN and
3161 * the other is numerical then the numerical argument is returned.
3162 * SNaNs will get quietened before being returned.
3163 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3164 * and maxNum() operations. min() and max() are the typical min/max
3165 * semantics provided by many CPUs which predate that specification.
3167 * minnummag() and maxnummag() functions correspond to minNumMag()
3168 * and maxNumMag() from the IEEE-754 2008.
/*
 * Common core for min/max/minnum/maxnum/minnummag/maxnummag.
 * ismin selects min vs max; ieee selects IEEE-754-2008 minNum/maxNum
 * quiet-NaN handling (a single QNaN loses to a number); ismag compares
 * by magnitude first, falling back to the signed ordering on ties.
 *
 * NOTE(review): the exponent-classification switches were lost in
 * extraction and have been reconstructed; verify against the canonical
 * file.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map zero/inf onto exponent extremes so one comparison works
         * across classes. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison first when requested and unequal. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Differing signs: the sign alone decides (handles +0/-0). */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/*
 * Expand a min/max entry point for a given float width.  The generated
 * function unpacks both operands, delegates to minmax_floats() and
 * repacks the result.  Fix: the third flag parameter was misspelled
 * `isiee'; renamed to `isieee' (internal to the macro, no caller impact).
 */
#define MINMAX(sz, name, ismin, isieee, ismag)                          \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}
/* Instantiate min/max variants for float16, float32 and float64:
 * (name, ismin, ieee, ismag). */
MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)
/*
 * Expand a bfloat16 min/max entry point; same shape as MINMAX above.
 * Fix: the second flag parameter was misspelled `isiee'; renamed to
 * `isieee' (internal to the macro, no caller impact).
 */
#define BF16_MINMAX(name, ismin, isieee, ismag)                         \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}
/* Instantiate min/max variants for bfloat16: (name, ismin, ieee, ismag). */
BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)
3294 /* Floating point compare */
/*
 * Totally order two decomposed values, returning a FloatRelation.
 * Unordered (NaN) results raise invalid unless is_quiet and both NaNs
 * are quiet.  Zeros compare equal regardless of sign.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* A signaling NaN always raises invalid, even for quiet compares. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            /* Both negative: larger magnitude is the smaller value. */
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ?
                float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}
/*
 * Expand a soft compare helper for a given float width: unpack both
 * operands and delegate to compare_floats().
 */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                               \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

/* Public float16 compare entry points (signaling and quiet). */
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
/*
 * float32 compare with a hardfloat fast path: use the host FPU's
 * ordered comparison macros for the common cases and fall back to the
 * softfloat implementation for unordered operands (to set flags
 * correctly) or when hardfloat is disabled.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
/*
 * float64 compare with a hardfloat fast path; mirrors f32_compare().
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
/*
 * bfloat16 compare: unpack both operands and delegate to
 * compare_floats().  No hardfloat fast path for this format.
 */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
3476 /* Multiply A by 2 raised to the power N. */
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        /* Quiet a signaling NaN (and possibly substitute the default NaN). */
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}
/*
 * Public scalbn entry points: unpack, scale the exponent via
 * scalbn_decomposed(), and round/pack back to the target format.
 */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}
3533 * The old softfloat code did an approximation step before zeroing in
3534 * on the final result. However for simpleness we just compute the
3535 * square root by iterating down from the implicit bit to enough extra
3536 * bits to ensure we get a correctly rounded result.
3538 * This does mean however the calculation is slower than before,
3539 * especially for 64 bit floats.
/*
 * Square root of a decomposed value.  Special cases: NaN propagates
 * (quietened), sqrt(+-0) = +-0, sqrt(negative) raises invalid and
 * returns the default NaN, sqrt(+inf) = +inf.  For normals the root is
 * computed bit by bit down to enough extra bits for correct rounding.
 *
 * NOTE(review): several skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
/*
 * Format-specific sqrt wrappers: unpack, run the decomposed sqrt with
 * the format's parameters, and round/pack the result.
 */

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}
/*
 * float32 sqrt with a hardfloat fast path: use the host sqrtf() when
 * the status flags allow it and the input is a non-negative zero or
 * normal; otherwise fall back to the softfloat path.
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}
/*
 * float64 sqrt with a hardfloat fast path; mirrors float32_sqrt().
 *
 * NOTE(review): some skeleton lines were lost in extraction and have
 * been reconstructed; verify against the canonical file.
 */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
3685 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3687 FloatParts64 pa
, pr
;
3689 bfloat16_unpack_canonical(&pa
, a
, status
);
3690 pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3691 return bfloat16_round_pack_canonical(&pr
, status
);
3694 /*----------------------------------------------------------------------------
3695 | The pattern for a default generated NaN.
3696 *----------------------------------------------------------------------------*/
3698 float16
float16_default_nan(float_status
*status
)
3702 parts_default_nan(&p
, status
);
3703 p
.frac
>>= float16_params
.frac_shift
;
3704 return float16_pack_raw(&p
);
3707 float32
float32_default_nan(float_status
*status
)
3711 parts_default_nan(&p
, status
);
3712 p
.frac
>>= float32_params
.frac_shift
;
3713 return float32_pack_raw(&p
);
3716 float64
float64_default_nan(float_status
*status
)
3720 parts_default_nan(&p
, status
);
3721 p
.frac
>>= float64_params
.frac_shift
;
3722 return float64_pack_raw(&p
);
3725 float128
float128_default_nan(float_status
*status
)
3729 parts_default_nan(&p
, status
);
3730 frac_shr(&p
, float128_params
.frac_shift
);
3731 return float128_pack_raw(&p
);
3734 bfloat16
bfloat16_default_nan(float_status
*status
)
3738 parts_default_nan(&p
, status
);
3739 p
.frac
>>= bfloat16_params
.frac_shift
;
3740 return bfloat16_pack_raw(&p
);
3743 /*----------------------------------------------------------------------------
3744 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3745 *----------------------------------------------------------------------------*/
3747 float16
float16_silence_nan(float16 a
, float_status
*status
)
3751 float16_unpack_raw(&p
, a
);
3752 p
.frac
<<= float16_params
.frac_shift
;
3753 parts_silence_nan(&p
, status
);
3754 p
.frac
>>= float16_params
.frac_shift
;
3755 return float16_pack_raw(&p
);
3758 float32
float32_silence_nan(float32 a
, float_status
*status
)
3762 float32_unpack_raw(&p
, a
);
3763 p
.frac
<<= float32_params
.frac_shift
;
3764 parts_silence_nan(&p
, status
);
3765 p
.frac
>>= float32_params
.frac_shift
;
3766 return float32_pack_raw(&p
);
3769 float64
float64_silence_nan(float64 a
, float_status
*status
)
3773 float64_unpack_raw(&p
, a
);
3774 p
.frac
<<= float64_params
.frac_shift
;
3775 parts_silence_nan(&p
, status
);
3776 p
.frac
>>= float64_params
.frac_shift
;
3777 return float64_pack_raw(&p
);
3780 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3784 bfloat16_unpack_raw(&p
, a
);
3785 p
.frac
<<= bfloat16_params
.frac_shift
;
3786 parts_silence_nan(&p
, status
);
3787 p
.frac
>>= bfloat16_params
.frac_shift
;
3788 return bfloat16_pack_raw(&p
);
3791 float128
float128_silence_nan(float128 a
, float_status
*status
)
3795 float128_unpack_raw(&p
, a
);
3796 frac_shl(&p
, float128_params
.frac_shift
);
3797 parts_silence_nan(&p
, status
);
3798 frac_shr(&p
, float128_params
.frac_shift
);
3799 return float128_pack_raw(&p
);
3802 /*----------------------------------------------------------------------------
3803 | If `a' is denormal and we are in flush-to-zero mode then set the
3804 | input-denormal exception and return zero. Otherwise just return the value.
3805 *----------------------------------------------------------------------------*/
3807 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3809 if (p
.exp
== 0 && p
.frac
!= 0) {
3810 float_raise(float_flag_input_denormal
, status
);
3817 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3819 if (status
->flush_inputs_to_zero
) {
3822 float16_unpack_raw(&p
, a
);
3823 if (parts_squash_denormal(p
, status
)) {
3824 return float16_set_sign(float16_zero
, p
.sign
);
3830 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3832 if (status
->flush_inputs_to_zero
) {
3835 float32_unpack_raw(&p
, a
);
3836 if (parts_squash_denormal(p
, status
)) {
3837 return float32_set_sign(float32_zero
, p
.sign
);
3843 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3845 if (status
->flush_inputs_to_zero
) {
3848 float64_unpack_raw(&p
, a
);
3849 if (parts_squash_denormal(p
, status
)) {
3850 return float64_set_sign(float64_zero
, p
.sign
);
3856 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3858 if (status
->flush_inputs_to_zero
) {
3861 bfloat16_unpack_raw(&p
, a
);
3862 if (parts_squash_denormal(p
, status
)) {
3863 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3869 /*----------------------------------------------------------------------------
3870 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3871 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3872 | input. If `zSign' is 1, the input is negated before being converted to an
3873 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3874 | is simply rounded to an integer, with the inexact exception raised if the
3875 | input cannot be represented exactly as an integer. However, if the fixed-
3876 | point input is too large, the invalid exception is raised and the largest
3877 | positive or negative integer is returned.
3878 *----------------------------------------------------------------------------*/
3880 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3881 float_status
*status
)
3883 int8_t roundingMode
;
3884 bool roundNearestEven
;
3885 int8_t roundIncrement
, roundBits
;
3888 roundingMode
= status
->float_rounding_mode
;
3889 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3890 switch (roundingMode
) {
3891 case float_round_nearest_even
:
3892 case float_round_ties_away
:
3893 roundIncrement
= 0x40;
3895 case float_round_to_zero
:
3898 case float_round_up
:
3899 roundIncrement
= zSign
? 0 : 0x7f;
3901 case float_round_down
:
3902 roundIncrement
= zSign
? 0x7f : 0;
3904 case float_round_to_odd
:
3905 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3910 roundBits
= absZ
& 0x7F;
3911 absZ
= ( absZ
+ roundIncrement
)>>7;
3912 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3916 if ( zSign
) z
= - z
;
3917 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3918 float_raise(float_flag_invalid
, status
);
3919 return zSign
? INT32_MIN
: INT32_MAX
;
3922 float_raise(float_flag_inexact
, status
);
3928 /*----------------------------------------------------------------------------
3929 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3930 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3931 | and returns the properly rounded 64-bit integer corresponding to the input.
3932 | If `zSign' is 1, the input is negated before being converted to an integer.
3933 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3934 | the inexact exception raised if the input cannot be represented exactly as
3935 | an integer. However, if the fixed-point input is too large, the invalid
3936 | exception is raised and the largest positive or negative integer is
3938 *----------------------------------------------------------------------------*/
3940 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3941 float_status
*status
)
3943 int8_t roundingMode
;
3944 bool roundNearestEven
, increment
;
3947 roundingMode
= status
->float_rounding_mode
;
3948 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3949 switch (roundingMode
) {
3950 case float_round_nearest_even
:
3951 case float_round_ties_away
:
3952 increment
= ((int64_t) absZ1
< 0);
3954 case float_round_to_zero
:
3957 case float_round_up
:
3958 increment
= !zSign
&& absZ1
;
3960 case float_round_down
:
3961 increment
= zSign
&& absZ1
;
3963 case float_round_to_odd
:
3964 increment
= !(absZ0
& 1) && absZ1
;
3971 if ( absZ0
== 0 ) goto overflow
;
3972 if (!(absZ1
<< 1) && roundNearestEven
) {
3977 if ( zSign
) z
= - z
;
3978 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3980 float_raise(float_flag_invalid
, status
);
3981 return zSign
? INT64_MIN
: INT64_MAX
;
3984 float_raise(float_flag_inexact
, status
);
3990 /*----------------------------------------------------------------------------
3991 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3992 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3993 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3994 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3995 | with the inexact exception raised if the input cannot be represented exactly
3996 | as an integer. However, if the fixed-point input is too large, the invalid
3997 | exception is raised and the largest unsigned integer is returned.
3998 *----------------------------------------------------------------------------*/
4000 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
4001 uint64_t absZ1
, float_status
*status
)
4003 int8_t roundingMode
;
4004 bool roundNearestEven
, increment
;
4006 roundingMode
= status
->float_rounding_mode
;
4007 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
4008 switch (roundingMode
) {
4009 case float_round_nearest_even
:
4010 case float_round_ties_away
:
4011 increment
= ((int64_t)absZ1
< 0);
4013 case float_round_to_zero
:
4016 case float_round_up
:
4017 increment
= !zSign
&& absZ1
;
4019 case float_round_down
:
4020 increment
= zSign
&& absZ1
;
4022 case float_round_to_odd
:
4023 increment
= !(absZ0
& 1) && absZ1
;
4031 float_raise(float_flag_invalid
, status
);
4034 if (!(absZ1
<< 1) && roundNearestEven
) {
4039 if (zSign
&& absZ0
) {
4040 float_raise(float_flag_invalid
, status
);
4045 float_raise(float_flag_inexact
, status
);
4050 /*----------------------------------------------------------------------------
4051 | Normalizes the subnormal single-precision floating-point value represented
4052 | by the denormalized significand `aSig'. The normalized exponent and
4053 | significand are stored at the locations pointed to by `zExpPtr' and
4054 | `zSigPtr', respectively.
4055 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal single-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 */
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 32 - (23 fraction bits + 1 hidden bit). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4068 /*----------------------------------------------------------------------------
4069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4070 | and significand `zSig', and returns the proper single-precision floating-
4071 | point value corresponding to the abstract input. Ordinarily, the abstract
4072 | value is simply rounded and packed into the single-precision format, with
4073 | the inexact exception raised if the abstract input cannot be represented
4074 | exactly. However, if the abstract value is too large, the overflow and
4075 | inexact exceptions are raised and an infinity or maximal finite value is
4076 | returned. If the abstract value is too small, the input value is rounded to
4077 | a subnormal number, and the underflow and inexact exceptions are raised if
4078 | the abstract input cannot be represented exactly as a subnormal single-
4079 | precision floating-point number.
4080 | The input significand `zSig' has its binary point between bits 30
4081 | and 29, which is 7 bits to the left of the usual location. This shifted
4082 | significand must be normalized or smaller. If `zSig' is not normalized,
4083 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4084 | and it must not require rounding. In the usual case that `zSig' is
4085 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4086 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4087 | Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4090 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4091 float_status
*status
)
4093 int8_t roundingMode
;
4094 bool roundNearestEven
;
4095 int8_t roundIncrement
, roundBits
;
4098 roundingMode
= status
->float_rounding_mode
;
4099 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4100 switch (roundingMode
) {
4101 case float_round_nearest_even
:
4102 case float_round_ties_away
:
4103 roundIncrement
= 0x40;
4105 case float_round_to_zero
:
4108 case float_round_up
:
4109 roundIncrement
= zSign
? 0 : 0x7f;
4111 case float_round_down
:
4112 roundIncrement
= zSign
? 0x7f : 0;
4114 case float_round_to_odd
:
4115 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4121 roundBits
= zSig
& 0x7F;
4122 if ( 0xFD <= (uint16_t) zExp
) {
4123 if ( ( 0xFD < zExp
)
4124 || ( ( zExp
== 0xFD )
4125 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4127 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4128 roundIncrement
!= 0;
4129 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4130 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4133 if (status
->flush_to_zero
) {
4134 float_raise(float_flag_output_denormal
, status
);
4135 return packFloat32(zSign
, 0, 0);
4137 isTiny
= status
->tininess_before_rounding
4139 || (zSig
+ roundIncrement
< 0x80000000);
4140 shift32RightJamming( zSig
, - zExp
, &zSig
);
4142 roundBits
= zSig
& 0x7F;
4143 if (isTiny
&& roundBits
) {
4144 float_raise(float_flag_underflow
, status
);
4146 if (roundingMode
== float_round_to_odd
) {
4148 * For round-to-odd case, the roundIncrement depends on
4149 * zSig which just changed.
4151 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4156 float_raise(float_flag_inexact
, status
);
4158 zSig
= ( zSig
+ roundIncrement
)>>7;
4159 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4162 if ( zSig
== 0 ) zExp
= 0;
4163 return packFloat32( zSign
, zExp
, zSig
);
4167 /*----------------------------------------------------------------------------
4168 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4169 | and significand `zSig', and returns the proper single-precision floating-
4170 | point value corresponding to the abstract input. This routine is just like
4171 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4172 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4173 | floating-point exponent.
4174 *----------------------------------------------------------------------------*/
4177 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4178 float_status
*status
)
4182 shiftCount
= clz32(zSig
) - 1;
4183 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4188 /*----------------------------------------------------------------------------
4189 | Normalizes the subnormal double-precision floating-point value represented
4190 | by the denormalized significand `aSig'. The normalized exponent and
4191 | significand are stored at the locations pointed to by `zExpPtr' and
4192 | `zSigPtr', respectively.
4193 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal double-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 */
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 64 - (52 fraction bits + 1 hidden bit). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4206 /*----------------------------------------------------------------------------
4207 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4208 | double-precision floating-point value, returning the result. After being
4209 | shifted into the proper positions, the three fields are simply added
4210 | together to form the result. This means that any integer portion of `zSig'
4211 | will be added into the exponent. Since a properly normalized significand
4212 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4213 | than the desired result exponent whenever `zSig' is a complete, normalized
4215 *----------------------------------------------------------------------------*/
4217 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4220 return make_float64(
4221 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4225 /*----------------------------------------------------------------------------
4226 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4227 | and significand `zSig', and returns the proper double-precision floating-
4228 | point value corresponding to the abstract input. Ordinarily, the abstract
4229 | value is simply rounded and packed into the double-precision format, with
4230 | the inexact exception raised if the abstract input cannot be represented
4231 | exactly. However, if the abstract value is too large, the overflow and
4232 | inexact exceptions are raised and an infinity or maximal finite value is
4233 | returned. If the abstract value is too small, the input value is rounded to
4234 | a subnormal number, and the underflow and inexact exceptions are raised if
4235 | the abstract input cannot be represented exactly as a subnormal double-
4236 | precision floating-point number.
4237 | The input significand `zSig' has its binary point between bits 62
4238 | and 61, which is 10 bits to the left of the usual location. This shifted
4239 | significand must be normalized or smaller. If `zSig' is not normalized,
4240 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4241 | and it must not require rounding. In the usual case that `zSig' is
4242 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4243 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4244 | Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4247 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4248 float_status
*status
)
4250 int8_t roundingMode
;
4251 bool roundNearestEven
;
4252 int roundIncrement
, roundBits
;
4255 roundingMode
= status
->float_rounding_mode
;
4256 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4257 switch (roundingMode
) {
4258 case float_round_nearest_even
:
4259 case float_round_ties_away
:
4260 roundIncrement
= 0x200;
4262 case float_round_to_zero
:
4265 case float_round_up
:
4266 roundIncrement
= zSign
? 0 : 0x3ff;
4268 case float_round_down
:
4269 roundIncrement
= zSign
? 0x3ff : 0;
4271 case float_round_to_odd
:
4272 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4277 roundBits
= zSig
& 0x3FF;
4278 if ( 0x7FD <= (uint16_t) zExp
) {
4279 if ( ( 0x7FD < zExp
)
4280 || ( ( zExp
== 0x7FD )
4281 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4283 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4284 roundIncrement
!= 0;
4285 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4286 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4289 if (status
->flush_to_zero
) {
4290 float_raise(float_flag_output_denormal
, status
);
4291 return packFloat64(zSign
, 0, 0);
4293 isTiny
= status
->tininess_before_rounding
4295 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4296 shift64RightJamming( zSig
, - zExp
, &zSig
);
4298 roundBits
= zSig
& 0x3FF;
4299 if (isTiny
&& roundBits
) {
4300 float_raise(float_flag_underflow
, status
);
4302 if (roundingMode
== float_round_to_odd
) {
4304 * For round-to-odd case, the roundIncrement depends on
4305 * zSig which just changed.
4307 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4312 float_raise(float_flag_inexact
, status
);
4314 zSig
= ( zSig
+ roundIncrement
)>>10;
4315 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4318 if ( zSig
== 0 ) zExp
= 0;
4319 return packFloat64( zSign
, zExp
, zSig
);
4323 /*----------------------------------------------------------------------------
4324 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4325 | and significand `zSig', and returns the proper double-precision floating-
4326 | point value corresponding to the abstract input. This routine is just like
4327 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4328 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4329 | floating-point exponent.
4330 *----------------------------------------------------------------------------*/
4333 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4334 float_status
*status
)
4338 shiftCount
= clz64(zSig
) - 1;
4339 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4344 /*----------------------------------------------------------------------------
4345 | Normalizes the subnormal extended double-precision floating-point value
4346 | represented by the denormalized significand `aSig'. The normalized exponent
4347 | and significand are stored at the locations pointed to by `zExpPtr' and
4348 | `zSigPtr', respectively.
4349 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal extended-precision significand `aSig', writing
 * the normalized significand and exponent through the output pointers.
 * floatx80 has an explicit integer bit, so the full clz is used.
 */
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz64(aSig);
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
4361 /*----------------------------------------------------------------------------
4362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4363 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4364 | and returns the proper extended double-precision floating-point value
4365 | corresponding to the abstract input. Ordinarily, the abstract value is
4366 | rounded and packed into the extended double-precision format, with the
4367 | inexact exception raised if the abstract input cannot be represented
4368 | exactly. However, if the abstract value is too large, the overflow and
4369 | inexact exceptions are raised and an infinity or maximal finite value is
4370 | returned. If the abstract value is too small, the input value is rounded to
4371 | a subnormal number, and the underflow and inexact exceptions are raised if
4372 | the abstract input cannot be represented exactly as a subnormal extended
4373 | double-precision floating-point number.
4374 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4375 | number of bits as single or double precision, respectively. Otherwise, the
4376 | result is rounded to the full precision of the extended double-precision
4378 | The input significand must be normalized or smaller. If the input
4379 | significand is not normalized, `zExp' must be 0; in that case, the result
4380 | returned is a subnormal number, and it must not require rounding. The
4381 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4382 | Floating-Point Arithmetic.
4383 *----------------------------------------------------------------------------*/
4385 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4386 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4387 float_status
*status
)
4389 int8_t roundingMode
;
4390 bool roundNearestEven
, increment
, isTiny
;
4391 int64_t roundIncrement
, roundMask
, roundBits
;
4393 roundingMode
= status
->float_rounding_mode
;
4394 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4395 if ( roundingPrecision
== 80 ) goto precision80
;
4396 if ( roundingPrecision
== 64 ) {
4397 roundIncrement
= UINT64_C(0x0000000000000400);
4398 roundMask
= UINT64_C(0x00000000000007FF);
4400 else if ( roundingPrecision
== 32 ) {
4401 roundIncrement
= UINT64_C(0x0000008000000000);
4402 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4407 zSig0
|= ( zSig1
!= 0 );
4408 switch (roundingMode
) {
4409 case float_round_nearest_even
:
4410 case float_round_ties_away
:
4412 case float_round_to_zero
:
4415 case float_round_up
:
4416 roundIncrement
= zSign
? 0 : roundMask
;
4418 case float_round_down
:
4419 roundIncrement
= zSign
? roundMask
: 0;
4424 roundBits
= zSig0
& roundMask
;
4425 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4426 if ( ( 0x7FFE < zExp
)
4427 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4432 if (status
->flush_to_zero
) {
4433 float_raise(float_flag_output_denormal
, status
);
4434 return packFloatx80(zSign
, 0, 0);
4436 isTiny
= status
->tininess_before_rounding
4438 || (zSig0
<= zSig0
+ roundIncrement
);
4439 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4441 roundBits
= zSig0
& roundMask
;
4442 if (isTiny
&& roundBits
) {
4443 float_raise(float_flag_underflow
, status
);
4446 float_raise(float_flag_inexact
, status
);
4448 zSig0
+= roundIncrement
;
4449 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4450 roundIncrement
= roundMask
+ 1;
4451 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4452 roundMask
|= roundIncrement
;
4454 zSig0
&= ~ roundMask
;
4455 return packFloatx80( zSign
, zExp
, zSig0
);
4459 float_raise(float_flag_inexact
, status
);
4461 zSig0
+= roundIncrement
;
4462 if ( zSig0
< roundIncrement
) {
4464 zSig0
= UINT64_C(0x8000000000000000);
4466 roundIncrement
= roundMask
+ 1;
4467 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4468 roundMask
|= roundIncrement
;
4470 zSig0
&= ~ roundMask
;
4471 if ( zSig0
== 0 ) zExp
= 0;
4472 return packFloatx80( zSign
, zExp
, zSig0
);
4474 switch (roundingMode
) {
4475 case float_round_nearest_even
:
4476 case float_round_ties_away
:
4477 increment
= ((int64_t)zSig1
< 0);
4479 case float_round_to_zero
:
4482 case float_round_up
:
4483 increment
= !zSign
&& zSig1
;
4485 case float_round_down
:
4486 increment
= zSign
&& zSig1
;
4491 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4492 if ( ( 0x7FFE < zExp
)
4493 || ( ( zExp
== 0x7FFE )
4494 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4500 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4501 if ( ( roundingMode
== float_round_to_zero
)
4502 || ( zSign
&& ( roundingMode
== float_round_up
) )
4503 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4505 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4507 return packFloatx80(zSign
,
4508 floatx80_infinity_high
,
4509 floatx80_infinity_low
);
4512 isTiny
= status
->tininess_before_rounding
4515 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4516 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4518 if (isTiny
&& zSig1
) {
4519 float_raise(float_flag_underflow
, status
);
4522 float_raise(float_flag_inexact
, status
);
4524 switch (roundingMode
) {
4525 case float_round_nearest_even
:
4526 case float_round_ties_away
:
4527 increment
= ((int64_t)zSig1
< 0);
4529 case float_round_to_zero
:
4532 case float_round_up
:
4533 increment
= !zSign
&& zSig1
;
4535 case float_round_down
:
4536 increment
= zSign
&& zSig1
;
4543 if (!(zSig1
<< 1) && roundNearestEven
) {
4546 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4548 return packFloatx80( zSign
, zExp
, zSig0
);
4552 float_raise(float_flag_inexact
, status
);
4558 zSig0
= UINT64_C(0x8000000000000000);
4561 if (!(zSig1
<< 1) && roundNearestEven
) {
4567 if ( zSig0
== 0 ) zExp
= 0;
4569 return packFloatx80( zSign
, zExp
, zSig0
);
4573 /*----------------------------------------------------------------------------
4574 | Takes an abstract floating-point value having sign `zSign', exponent
4575 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4576 | and returns the proper extended double-precision floating-point value
4577 | corresponding to the abstract input. This routine is just like
4578 | `roundAndPackFloatx80' except that the input significand does not have to be
4580 *----------------------------------------------------------------------------*/
4582 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4583 bool zSign
, int32_t zExp
,
4584 uint64_t zSig0
, uint64_t zSig1
,
4585 float_status
*status
)
4594 shiftCount
= clz64(zSig0
);
4595 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4597 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4598 zSig0
, zSig1
, status
);
4602 /*----------------------------------------------------------------------------
4603 | Returns the least-significant 64 fraction bits of the quadruple-precision
4604 | floating-point value `a'.
4605 *----------------------------------------------------------------------------*/
4607 static inline uint64_t extractFloat128Frac1( float128 a
)
4614 /*----------------------------------------------------------------------------
4615 | Returns the most-significant 48 fraction bits of the quadruple-precision
4616 | floating-point value `a'.
4617 *----------------------------------------------------------------------------*/
4619 static inline uint64_t extractFloat128Frac0( float128 a
)
4622 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4626 /*----------------------------------------------------------------------------
4627 | Returns the exponent bits of the quadruple-precision floating-point value
4629 *----------------------------------------------------------------------------*/
4631 static inline int32_t extractFloat128Exp( float128 a
)
4634 return ( a
.high
>>48 ) & 0x7FFF;
4638 /*----------------------------------------------------------------------------
4639 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4640 *----------------------------------------------------------------------------*/
4642 static inline bool extractFloat128Sign(float128 a
)
4644 return a
.high
>> 63;
4647 /*----------------------------------------------------------------------------
4648 | Normalizes the subnormal quadruple-precision floating-point value
4649 | represented by the denormalized significand formed by the concatenation of
4650 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4651 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4652 | significand are stored at the location pointed to by `zSig0Ptr', and the
4653 | least significant 64 bits of the normalized significand are stored at the
4654 | location pointed to by `zSig1Ptr'.
4655 *----------------------------------------------------------------------------*/
/*
 * Normalize the subnormal quad-precision significand `aSig0:aSig1',
 * writing the normalized significand halves and the exponent through
 * the output pointers.
 */
static void
 normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                            int32_t *zExpPtr, uint64_t *zSig0Ptr,
                            uint64_t *zSig1Ptr)
{
    int8_t shiftCount;

    if (aSig0 == 0) {
        /* All significance is in the low word; 15 = 64 - 49. */
        shiftCount = clz64(aSig1) - 15;
        if (shiftCount < 0) {
            *zSig0Ptr = aSig1 >> (-shiftCount);
            *zSig1Ptr = aSig1 << (shiftCount & 63);
        } else {
            *zSig0Ptr = aSig1 << shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = -shiftCount - 63;
    } else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
    }
}
4688 /*----------------------------------------------------------------------------
4689 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4690 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4691 | floating-point value, returning the result. After being shifted into the
4692 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4693 | added together to form the most significant 32 bits of the result. This
4694 | means that any integer portion of `zSig0' will be added into the exponent.
4695 | Since a properly normalized significand will have an integer portion equal
4696 | to 1, the `zExp' input should be 1 less than the desired result exponent
4697 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4699 *----------------------------------------------------------------------------*/
4701 static inline float128
4702 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4707 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4711 /*----------------------------------------------------------------------------
4712 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4713 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4714 | and `zSig2', and returns the proper quadruple-precision floating-point value
4715 | corresponding to the abstract input. Ordinarily, the abstract value is
4716 | simply rounded and packed into the quadruple-precision format, with the
4717 | inexact exception raised if the abstract input cannot be represented
4718 | exactly. However, if the abstract value is too large, the overflow and
4719 | inexact exceptions are raised and an infinity or maximal finite value is
4720 | returned. If the abstract value is too small, the input value is rounded to
4721 | a subnormal number, and the underflow and inexact exceptions are raised if
4722 | the abstract input cannot be represented exactly as a subnormal quadruple-
4723 | precision floating-point number.
4724 | The input significand must be normalized or smaller. If the input
4725 | significand is not normalized, `zExp' must be 0; in that case, the result
4726 | returned is a subnormal number, and it must not require rounding. In the
4727 | usual case that the input significand is normalized, `zExp' must be 1 less
4728 | than the ``true'' floating-point exponent. The handling of underflow and
4729 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4730 *----------------------------------------------------------------------------*/
4732 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4733 uint64_t zSig0
, uint64_t zSig1
,
4734 uint64_t zSig2
, float_status
*status
)
4736 int8_t roundingMode
;
4737 bool roundNearestEven
, increment
, isTiny
;
4739 roundingMode
= status
->float_rounding_mode
;
4740 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4741 switch (roundingMode
) {
4742 case float_round_nearest_even
:
4743 case float_round_ties_away
:
4744 increment
= ((int64_t)zSig2
< 0);
4746 case float_round_to_zero
:
4749 case float_round_up
:
4750 increment
= !zSign
&& zSig2
;
4752 case float_round_down
:
4753 increment
= zSign
&& zSig2
;
4755 case float_round_to_odd
:
4756 increment
= !(zSig1
& 0x1) && zSig2
;
4761 if ( 0x7FFD <= (uint32_t) zExp
) {
4762 if ( ( 0x7FFD < zExp
)
4763 || ( ( zExp
== 0x7FFD )
4765 UINT64_C(0x0001FFFFFFFFFFFF),
4766 UINT64_C(0xFFFFFFFFFFFFFFFF),
4773 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4774 if ( ( roundingMode
== float_round_to_zero
)
4775 || ( zSign
&& ( roundingMode
== float_round_up
) )
4776 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4777 || (roundingMode
== float_round_to_odd
)
4783 UINT64_C(0x0000FFFFFFFFFFFF),
4784 UINT64_C(0xFFFFFFFFFFFFFFFF)
4787 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4790 if (status
->flush_to_zero
) {
4791 float_raise(float_flag_output_denormal
, status
);
4792 return packFloat128(zSign
, 0, 0, 0);
4794 isTiny
= status
->tininess_before_rounding
4797 || lt128(zSig0
, zSig1
,
4798 UINT64_C(0x0001FFFFFFFFFFFF),
4799 UINT64_C(0xFFFFFFFFFFFFFFFF));
4800 shift128ExtraRightJamming(
4801 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4803 if (isTiny
&& zSig2
) {
4804 float_raise(float_flag_underflow
, status
);
4806 switch (roundingMode
) {
4807 case float_round_nearest_even
:
4808 case float_round_ties_away
:
4809 increment
= ((int64_t)zSig2
< 0);
4811 case float_round_to_zero
:
4814 case float_round_up
:
4815 increment
= !zSign
&& zSig2
;
4817 case float_round_down
:
4818 increment
= zSign
&& zSig2
;
4820 case float_round_to_odd
:
4821 increment
= !(zSig1
& 0x1) && zSig2
;
4829 float_raise(float_flag_inexact
, status
);
4832 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4833 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4838 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4840 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4844 /*----------------------------------------------------------------------------
4845 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4846 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4847 | returns the proper quadruple-precision floating-point value corresponding
4848 | to the abstract input. This routine is just like `roundAndPackFloat128'
4849 | except that the input significand has fewer bits and does not have to be
4850 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4852 *----------------------------------------------------------------------------*/
4854 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4855 uint64_t zSig0
, uint64_t zSig1
,
4856 float_status
*status
)
4866 shiftCount
= clz64(zSig0
) - 15;
4867 if ( 0 <= shiftCount
) {
4869 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4872 shift128ExtraRightJamming(
4873 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4876 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the 32-bit two's complement integer `a'
4883 | to the extended double-precision floating-point format. The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4886 *----------------------------------------------------------------------------*/
4888 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4895 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4897 absA
= zSign
? - a
: a
;
4898 shiftCount
= clz32(absA
) + 32;
4900 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4904 /*----------------------------------------------------------------------------
4905 | Returns the result of converting the 32-bit two's complement integer `a' to
4906 | the quadruple-precision floating-point format. The conversion is performed
4907 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4908 *----------------------------------------------------------------------------*/
4910 float128
int32_to_float128(int32_t a
, float_status
*status
)
4917 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4919 absA
= zSign
? - a
: a
;
4920 shiftCount
= clz32(absA
) + 17;
4922 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4926 /*----------------------------------------------------------------------------
4927 | Returns the result of converting the 64-bit two's complement integer `a'
4928 | to the extended double-precision floating-point format. The conversion
4929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4931 *----------------------------------------------------------------------------*/
4933 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4939 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4941 absA
= zSign
? - a
: a
;
4942 shiftCount
= clz64(absA
);
4943 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4947 /*----------------------------------------------------------------------------
4948 | Returns the result of converting the 64-bit two's complement integer `a' to
4949 | the quadruple-precision floating-point format. The conversion is performed
4950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4951 *----------------------------------------------------------------------------*/
4953 float128
int64_to_float128(int64_t a
, float_status
*status
)
4959 uint64_t zSig0
, zSig1
;
4961 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4963 absA
= zSign
? - a
: a
;
4964 shiftCount
= clz64(absA
) + 49;
4965 zExp
= 0x406E - shiftCount
;
4966 if ( 64 <= shiftCount
) {
4975 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4976 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4980 /*----------------------------------------------------------------------------
4981 | Returns the result of converting the 64-bit unsigned integer `a'
4982 | to the quadruple-precision floating-point format. The conversion is performed
4983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4984 *----------------------------------------------------------------------------*/
4986 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4989 return float128_zero
;
4991 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4994 /*----------------------------------------------------------------------------
4995 | Returns the result of converting the single-precision floating-point value
4996 | `a' to the extended double-precision floating-point format. The conversion
4997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4999 *----------------------------------------------------------------------------*/
5001 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
5007 a
= float32_squash_input_denormal(a
, status
);
5008 aSig
= extractFloat32Frac( a
);
5009 aExp
= extractFloat32Exp( a
);
5010 aSign
= extractFloat32Sign( a
);
5011 if ( aExp
== 0xFF ) {
5013 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
5015 return floatx80_silence_nan(res
, status
);
5017 return packFloatx80(aSign
,
5018 floatx80_infinity_high
,
5019 floatx80_infinity_low
);
5022 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5023 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5026 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
5030 /*----------------------------------------------------------------------------
5031 | Returns the result of converting the single-precision floating-point value
5032 | `a' to the double-precision floating-point format. The conversion is
5033 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5035 *----------------------------------------------------------------------------*/
5037 float128
float32_to_float128(float32 a
, float_status
*status
)
5043 a
= float32_squash_input_denormal(a
, status
);
5044 aSig
= extractFloat32Frac( a
);
5045 aExp
= extractFloat32Exp( a
);
5046 aSign
= extractFloat32Sign( a
);
5047 if ( aExp
== 0xFF ) {
5049 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
5051 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5054 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5055 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5058 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
5062 /*----------------------------------------------------------------------------
5063 | Returns the remainder of the single-precision floating-point value `a'
5064 | with respect to the corresponding value `b'. The operation is performed
5065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5066 *----------------------------------------------------------------------------*/
5068 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
5071 int aExp
, bExp
, expDiff
;
5072 uint32_t aSig
, bSig
;
5074 uint64_t aSig64
, bSig64
, q64
;
5075 uint32_t alternateASig
;
5077 a
= float32_squash_input_denormal(a
, status
);
5078 b
= float32_squash_input_denormal(b
, status
);
5080 aSig
= extractFloat32Frac( a
);
5081 aExp
= extractFloat32Exp( a
);
5082 aSign
= extractFloat32Sign( a
);
5083 bSig
= extractFloat32Frac( b
);
5084 bExp
= extractFloat32Exp( b
);
5085 if ( aExp
== 0xFF ) {
5086 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
5087 return propagateFloat32NaN(a
, b
, status
);
5089 float_raise(float_flag_invalid
, status
);
5090 return float32_default_nan(status
);
5092 if ( bExp
== 0xFF ) {
5094 return propagateFloat32NaN(a
, b
, status
);
5100 float_raise(float_flag_invalid
, status
);
5101 return float32_default_nan(status
);
5103 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
5106 if ( aSig
== 0 ) return a
;
5107 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5109 expDiff
= aExp
- bExp
;
5112 if ( expDiff
< 32 ) {
5115 if ( expDiff
< 0 ) {
5116 if ( expDiff
< -1 ) return a
;
5119 q
= ( bSig
<= aSig
);
5120 if ( q
) aSig
-= bSig
;
5121 if ( 0 < expDiff
) {
5122 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5125 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5133 if ( bSig
<= aSig
) aSig
-= bSig
;
5134 aSig64
= ( (uint64_t) aSig
)<<40;
5135 bSig64
= ( (uint64_t) bSig
)<<40;
5137 while ( 0 < expDiff
) {
5138 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5139 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5140 aSig64
= - ( ( bSig
* q64
)<<38 );
5144 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5145 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5146 q
= q64
>>( 64 - expDiff
);
5148 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5151 alternateASig
= aSig
;
5154 } while ( 0 <= (int32_t) aSig
);
5155 sigMean
= aSig
+ alternateASig
;
5156 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5157 aSig
= alternateASig
;
5159 zSign
= ( (int32_t) aSig
< 0 );
5160 if ( zSign
) aSig
= - aSig
;
5161 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5166 /*----------------------------------------------------------------------------
5167 | Returns the binary exponential of the single-precision floating-point value
5168 | `a'. The operation is performed according to the IEC/IEEE Standard for
5169 | Binary Floating-Point Arithmetic.
5171 | Uses the following identities:
5173 | 1. -------------------------------------------------------------------------
5177 | 2. -------------------------------------------------------------------------
5180 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5182 *----------------------------------------------------------------------------*/
5184 static const float64 float32_exp2_coefficients
[15] =
5186 const_float64( 0x3ff0000000000000ll
), /* 1 */
5187 const_float64( 0x3fe0000000000000ll
), /* 2 */
5188 const_float64( 0x3fc5555555555555ll
), /* 3 */
5189 const_float64( 0x3fa5555555555555ll
), /* 4 */
5190 const_float64( 0x3f81111111111111ll
), /* 5 */
5191 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5192 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5193 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5194 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5195 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5196 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5197 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5198 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5199 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5200 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5203 float32
float32_exp2(float32 a
, float_status
*status
)
5210 a
= float32_squash_input_denormal(a
, status
);
5212 aSig
= extractFloat32Frac( a
);
5213 aExp
= extractFloat32Exp( a
);
5214 aSign
= extractFloat32Sign( a
);
5216 if ( aExp
== 0xFF) {
5218 return propagateFloat32NaN(a
, float32_zero
, status
);
5220 return (aSign
) ? float32_zero
: a
;
5223 if (aSig
== 0) return float32_one
;
5226 float_raise(float_flag_inexact
, status
);
5228 /* ******************************* */
5229 /* using float64 for approximation */
5230 /* ******************************* */
5231 x
= float32_to_float64(a
, status
);
5232 x
= float64_mul(x
, float64_ln2
, status
);
5236 for (i
= 0 ; i
< 15 ; i
++) {
5239 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5240 r
= float64_add(r
, f
, status
);
5242 xn
= float64_mul(xn
, x
, status
);
5245 return float64_to_float32(r
, status
);
5248 /*----------------------------------------------------------------------------
5249 | Returns the binary log of the single-precision floating-point value `a'.
5250 | The operation is performed according to the IEC/IEEE Standard for Binary
5251 | Floating-Point Arithmetic.
5252 *----------------------------------------------------------------------------*/
5253 float32
float32_log2(float32 a
, float_status
*status
)
5257 uint32_t aSig
, zSig
, i
;
5259 a
= float32_squash_input_denormal(a
, status
);
5260 aSig
= extractFloat32Frac( a
);
5261 aExp
= extractFloat32Exp( a
);
5262 aSign
= extractFloat32Sign( a
);
5265 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5266 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5269 float_raise(float_flag_invalid
, status
);
5270 return float32_default_nan(status
);
5272 if ( aExp
== 0xFF ) {
5274 return propagateFloat32NaN(a
, float32_zero
, status
);
5284 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5285 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5286 if ( aSig
& 0x01000000 ) {
5295 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of converting the double-precision floating-point value
5300 | `a' to the extended double-precision floating-point format. The conversion
5301 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5303 *----------------------------------------------------------------------------*/
5305 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5311 a
= float64_squash_input_denormal(a
, status
);
5312 aSig
= extractFloat64Frac( a
);
5313 aExp
= extractFloat64Exp( a
);
5314 aSign
= extractFloat64Sign( a
);
5315 if ( aExp
== 0x7FF ) {
5317 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5319 return floatx80_silence_nan(res
, status
);
5321 return packFloatx80(aSign
,
5322 floatx80_infinity_high
,
5323 floatx80_infinity_low
);
5326 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5327 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5331 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5335 /*----------------------------------------------------------------------------
5336 | Returns the result of converting the double-precision floating-point value
5337 | `a' to the quadruple-precision floating-point format. The conversion is
5338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5340 *----------------------------------------------------------------------------*/
5342 float128
float64_to_float128(float64 a
, float_status
*status
)
5346 uint64_t aSig
, zSig0
, zSig1
;
5348 a
= float64_squash_input_denormal(a
, status
);
5349 aSig
= extractFloat64Frac( a
);
5350 aExp
= extractFloat64Exp( a
);
5351 aSign
= extractFloat64Sign( a
);
5352 if ( aExp
== 0x7FF ) {
5354 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5356 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5359 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5360 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5363 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5364 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5369 /*----------------------------------------------------------------------------
5370 | Returns the remainder of the double-precision floating-point value `a'
5371 | with respect to the corresponding value `b'. The operation is performed
5372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5373 *----------------------------------------------------------------------------*/
5375 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5378 int aExp
, bExp
, expDiff
;
5379 uint64_t aSig
, bSig
;
5380 uint64_t q
, alternateASig
;
5383 a
= float64_squash_input_denormal(a
, status
);
5384 b
= float64_squash_input_denormal(b
, status
);
5385 aSig
= extractFloat64Frac( a
);
5386 aExp
= extractFloat64Exp( a
);
5387 aSign
= extractFloat64Sign( a
);
5388 bSig
= extractFloat64Frac( b
);
5389 bExp
= extractFloat64Exp( b
);
5390 if ( aExp
== 0x7FF ) {
5391 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5392 return propagateFloat64NaN(a
, b
, status
);
5394 float_raise(float_flag_invalid
, status
);
5395 return float64_default_nan(status
);
5397 if ( bExp
== 0x7FF ) {
5399 return propagateFloat64NaN(a
, b
, status
);
5405 float_raise(float_flag_invalid
, status
);
5406 return float64_default_nan(status
);
5408 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5411 if ( aSig
== 0 ) return a
;
5412 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5414 expDiff
= aExp
- bExp
;
5415 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5416 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5417 if ( expDiff
< 0 ) {
5418 if ( expDiff
< -1 ) return a
;
5421 q
= ( bSig
<= aSig
);
5422 if ( q
) aSig
-= bSig
;
5424 while ( 0 < expDiff
) {
5425 q
= estimateDiv128To64( aSig
, 0, bSig
);
5426 q
= ( 2 < q
) ? q
- 2 : 0;
5427 aSig
= - ( ( bSig
>>2 ) * q
);
5431 if ( 0 < expDiff
) {
5432 q
= estimateDiv128To64( aSig
, 0, bSig
);
5433 q
= ( 2 < q
) ? q
- 2 : 0;
5436 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5443 alternateASig
= aSig
;
5446 } while ( 0 <= (int64_t) aSig
);
5447 sigMean
= aSig
+ alternateASig
;
5448 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5449 aSig
= alternateASig
;
5451 zSign
= ( (int64_t) aSig
< 0 );
5452 if ( zSign
) aSig
= - aSig
;
5453 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5457 /*----------------------------------------------------------------------------
5458 | Returns the binary log of the double-precision floating-point value `a'.
5459 | The operation is performed according to the IEC/IEEE Standard for Binary
5460 | Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462 float64
float64_log2(float64 a
, float_status
*status
)
5466 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5467 a
= float64_squash_input_denormal(a
, status
);
5469 aSig
= extractFloat64Frac( a
);
5470 aExp
= extractFloat64Exp( a
);
5471 aSign
= extractFloat64Sign( a
);
5474 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5475 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5478 float_raise(float_flag_invalid
, status
);
5479 return float64_default_nan(status
);
5481 if ( aExp
== 0x7FF ) {
5483 return propagateFloat64NaN(a
, float64_zero
, status
);
5489 aSig
|= UINT64_C(0x0010000000000000);
5491 zSig
= (uint64_t)aExp
<< 52;
5492 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5493 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5494 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5495 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5503 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5506 /*----------------------------------------------------------------------------
5507 | Returns the result of converting the extended double-precision floating-
5508 | point value `a' to the 32-bit two's complement integer format. The
5509 | conversion is performed according to the IEC/IEEE Standard for Binary
5510 | Floating-Point Arithmetic---which means in particular that the conversion
5511 | is rounded according to the current rounding mode. If `a' is a NaN, the
5512 | largest positive integer is returned. Otherwise, if the conversion
5513 | overflows, the largest integer with the same sign as `a' is returned.
5514 *----------------------------------------------------------------------------*/
5516 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5519 int32_t aExp
, shiftCount
;
5522 if (floatx80_invalid_encoding(a
)) {
5523 float_raise(float_flag_invalid
, status
);
5526 aSig
= extractFloatx80Frac( a
);
5527 aExp
= extractFloatx80Exp( a
);
5528 aSign
= extractFloatx80Sign( a
);
5529 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5530 shiftCount
= 0x4037 - aExp
;
5531 if ( shiftCount
<= 0 ) shiftCount
= 1;
5532 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5533 return roundAndPackInt32(aSign
, aSig
, status
);
5537 /*----------------------------------------------------------------------------
5538 | Returns the result of converting the extended double-precision floating-
5539 | point value `a' to the 32-bit two's complement integer format. The
5540 | conversion is performed according to the IEC/IEEE Standard for Binary
5541 | Floating-Point Arithmetic, except that the conversion is always rounded
5542 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5543 | Otherwise, if the conversion overflows, the largest integer with the same
5544 | sign as `a' is returned.
5545 *----------------------------------------------------------------------------*/
5547 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5550 int32_t aExp
, shiftCount
;
5551 uint64_t aSig
, savedASig
;
5554 if (floatx80_invalid_encoding(a
)) {
5555 float_raise(float_flag_invalid
, status
);
5558 aSig
= extractFloatx80Frac( a
);
5559 aExp
= extractFloatx80Exp( a
);
5560 aSign
= extractFloatx80Sign( a
);
5561 if ( 0x401E < aExp
) {
5562 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5565 else if ( aExp
< 0x3FFF ) {
5567 float_raise(float_flag_inexact
, status
);
5571 shiftCount
= 0x403E - aExp
;
5573 aSig
>>= shiftCount
;
5575 if ( aSign
) z
= - z
;
5576 if ( ( z
< 0 ) ^ aSign
) {
5578 float_raise(float_flag_invalid
, status
);
5579 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5581 if ( ( aSig
<<shiftCount
) != savedASig
) {
5582 float_raise(float_flag_inexact
, status
);
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the 64-bit two's complement integer format. The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic---which means in particular that the conversion
5593 | is rounded according to the current rounding mode. If `a' is a NaN,
5594 | the largest positive integer is returned. Otherwise, if the conversion
5595 | overflows, the largest integer with the same sign as `a' is returned.
5596 *----------------------------------------------------------------------------*/
5598 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5601 int32_t aExp
, shiftCount
;
5602 uint64_t aSig
, aSigExtra
;
5604 if (floatx80_invalid_encoding(a
)) {
5605 float_raise(float_flag_invalid
, status
);
5608 aSig
= extractFloatx80Frac( a
);
5609 aExp
= extractFloatx80Exp( a
);
5610 aSign
= extractFloatx80Sign( a
);
5611 shiftCount
= 0x403E - aExp
;
5612 if ( shiftCount
<= 0 ) {
5614 float_raise(float_flag_invalid
, status
);
5615 if (!aSign
|| floatx80_is_any_nan(a
)) {
5623 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5625 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5629 /*----------------------------------------------------------------------------
5630 | Returns the result of converting the extended double-precision floating-
5631 | point value `a' to the 64-bit two's complement integer format. The
5632 | conversion is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic, except that the conversion is always rounded
5634 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5635 | Otherwise, if the conversion overflows, the largest integer with the same
5636 | sign as `a' is returned.
5637 *----------------------------------------------------------------------------*/
5639 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5642 int32_t aExp
, shiftCount
;
5646 if (floatx80_invalid_encoding(a
)) {
5647 float_raise(float_flag_invalid
, status
);
5650 aSig
= extractFloatx80Frac( a
);
5651 aExp
= extractFloatx80Exp( a
);
5652 aSign
= extractFloatx80Sign( a
);
5653 shiftCount
= aExp
- 0x403E;
5654 if ( 0 <= shiftCount
) {
5655 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5656 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5657 float_raise(float_flag_invalid
, status
);
5658 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5664 else if ( aExp
< 0x3FFF ) {
5666 float_raise(float_flag_inexact
, status
);
5670 z
= aSig
>>( - shiftCount
);
5671 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5672 float_raise(float_flag_inexact
, status
);
5674 if ( aSign
) z
= - z
;
5679 /*----------------------------------------------------------------------------
5680 | Returns the result of converting the extended double-precision floating-
5681 | point value `a' to the single-precision floating-point format. The
5682 | conversion is performed according to the IEC/IEEE Standard for Binary
5683 | Floating-Point Arithmetic.
5684 *----------------------------------------------------------------------------*/
5686 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5692 if (floatx80_invalid_encoding(a
)) {
5693 float_raise(float_flag_invalid
, status
);
5694 return float32_default_nan(status
);
5696 aSig
= extractFloatx80Frac( a
);
5697 aExp
= extractFloatx80Exp( a
);
5698 aSign
= extractFloatx80Sign( a
);
5699 if ( aExp
== 0x7FFF ) {
5700 if ( (uint64_t) ( aSig
<<1 ) ) {
5701 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5703 return float32_silence_nan(res
, status
);
5705 return packFloat32( aSign
, 0xFF, 0 );
5707 shift64RightJamming( aSig
, 33, &aSig
);
5708 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5709 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5713 /*----------------------------------------------------------------------------
5714 | Returns the result of converting the extended double-precision floating-
5715 | point value `a' to the double-precision floating-point format. The
5716 | conversion is performed according to the IEC/IEEE Standard for Binary
5717 | Floating-Point Arithmetic.
5718 *----------------------------------------------------------------------------*/
5720 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5724 uint64_t aSig
, zSig
;
5726 if (floatx80_invalid_encoding(a
)) {
5727 float_raise(float_flag_invalid
, status
);
5728 return float64_default_nan(status
);
5730 aSig
= extractFloatx80Frac( a
);
5731 aExp
= extractFloatx80Exp( a
);
5732 aSign
= extractFloatx80Sign( a
);
5733 if ( aExp
== 0x7FFF ) {
5734 if ( (uint64_t) ( aSig
<<1 ) ) {
5735 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5737 return float64_silence_nan(res
, status
);
5739 return packFloat64( aSign
, 0x7FF, 0 );
5741 shift64RightJamming( aSig
, 1, &zSig
);
5742 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5743 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5747 /*----------------------------------------------------------------------------
5748 | Returns the result of converting the extended double-precision floating-
5749 | point value `a' to the quadruple-precision floating-point format. The
5750 | conversion is performed according to the IEC/IEEE Standard for Binary
5751 | Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5754 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5758 uint64_t aSig
, zSig0
, zSig1
;
5760 if (floatx80_invalid_encoding(a
)) {
5761 float_raise(float_flag_invalid
, status
);
5762 return float128_default_nan(status
);
5764 aSig
= extractFloatx80Frac( a
);
5765 aExp
= extractFloatx80Exp( a
);
5766 aSign
= extractFloatx80Sign( a
);
5767 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5768 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5770 return float128_silence_nan(res
, status
);
5772 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5773 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5777 /*----------------------------------------------------------------------------
5778 | Rounds the extended double-precision floating-point value `a'
5779 | to the precision provided by floatx80_rounding_precision and returns the
5780 | result as an extended double-precision floating-point value.
5781 | The operation is performed according to the IEC/IEEE Standard for Binary
5782 | Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5785 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5787 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5788 extractFloatx80Sign(a
),
5789 extractFloatx80Exp(a
),
5790 extractFloatx80Frac(a
), 0, status
);
5793 /*----------------------------------------------------------------------------
5794 | Rounds the extended double-precision floating-point value `a' to an integer,
5795 | and returns the result as an extended quadruple-precision floating-point
5796 | value. The operation is performed according to the IEC/IEEE Standard for
5797 | Binary Floating-Point Arithmetic.
5798 *----------------------------------------------------------------------------*/
5800 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5804 uint64_t lastBitMask
, roundBitsMask
;
5807 if (floatx80_invalid_encoding(a
)) {
5808 float_raise(float_flag_invalid
, status
);
5809 return floatx80_default_nan(status
);
5811 aExp
= extractFloatx80Exp( a
);
5812 if ( 0x403E <= aExp
) {
5813 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5814 return propagateFloatx80NaN(a
, a
, status
);
5818 if ( aExp
< 0x3FFF ) {
5820 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5823 float_raise(float_flag_inexact
, status
);
5824 aSign
= extractFloatx80Sign( a
);
5825 switch (status
->float_rounding_mode
) {
5826 case float_round_nearest_even
:
5827 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5830 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5833 case float_round_ties_away
:
5834 if (aExp
== 0x3FFE) {
5835 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5838 case float_round_down
:
5841 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5842 : packFloatx80( 0, 0, 0 );
5843 case float_round_up
:
5845 aSign
? packFloatx80( 1, 0, 0 )
5846 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5848 case float_round_to_zero
:
5851 g_assert_not_reached();
5853 return packFloatx80( aSign
, 0, 0 );
5856 lastBitMask
<<= 0x403E - aExp
;
5857 roundBitsMask
= lastBitMask
- 1;
5859 switch (status
->float_rounding_mode
) {
5860 case float_round_nearest_even
:
5861 z
.low
+= lastBitMask
>>1;
5862 if ((z
.low
& roundBitsMask
) == 0) {
5863 z
.low
&= ~lastBitMask
;
5866 case float_round_ties_away
:
5867 z
.low
+= lastBitMask
>> 1;
5869 case float_round_to_zero
:
5871 case float_round_up
:
5872 if (!extractFloatx80Sign(z
)) {
5873 z
.low
+= roundBitsMask
;
5876 case float_round_down
:
5877 if (extractFloatx80Sign(z
)) {
5878 z
.low
+= roundBitsMask
;
5884 z
.low
&= ~ roundBitsMask
;
5887 z
.low
= UINT64_C(0x8000000000000000);
5889 if (z
.low
!= a
.low
) {
5890 float_raise(float_flag_inexact
, status
);
5896 /*----------------------------------------------------------------------------
5897 | Returns the result of adding the absolute values of the extended double-
5898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5899 | negated before being returned. `zSign' is ignored if the result is a NaN.
5900 | The addition is performed according to the IEC/IEEE Standard for Binary
5901 | Floating-Point Arithmetic.
5902 *----------------------------------------------------------------------------*/
5904 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5905 float_status
*status
)
5907 int32_t aExp
, bExp
, zExp
;
5908 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5911 aSig
= extractFloatx80Frac( a
);
5912 aExp
= extractFloatx80Exp( a
);
5913 bSig
= extractFloatx80Frac( b
);
5914 bExp
= extractFloatx80Exp( b
);
5915 expDiff
= aExp
- bExp
;
5916 if ( 0 < expDiff
) {
5917 if ( aExp
== 0x7FFF ) {
5918 if ((uint64_t)(aSig
<< 1)) {
5919 return propagateFloatx80NaN(a
, b
, status
);
5923 if ( bExp
== 0 ) --expDiff
;
5924 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5927 else if ( expDiff
< 0 ) {
5928 if ( bExp
== 0x7FFF ) {
5929 if ((uint64_t)(bSig
<< 1)) {
5930 return propagateFloatx80NaN(a
, b
, status
);
5932 return packFloatx80(zSign
,
5933 floatx80_infinity_high
,
5934 floatx80_infinity_low
);
5936 if ( aExp
== 0 ) ++expDiff
;
5937 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5941 if ( aExp
== 0x7FFF ) {
5942 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5943 return propagateFloatx80NaN(a
, b
, status
);
5948 zSig0
= aSig
+ bSig
;
5950 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5951 /* At least one of the values is a pseudo-denormal,
5952 * and there is a carry out of the result. */
5957 return packFloatx80(zSign
, 0, 0);
5959 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5965 zSig0
= aSig
+ bSig
;
5966 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5968 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5969 zSig0
|= UINT64_C(0x8000000000000000);
5972 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5973 zSign
, zExp
, zSig0
, zSig1
, status
);
5976 /*----------------------------------------------------------------------------
5977 | Returns the result of subtracting the absolute values of the extended
5978 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5979 | difference is negated before being returned. `zSign' is ignored if the
5980 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5981 | Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5984 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5985 float_status
*status
)
5987 int32_t aExp
, bExp
, zExp
;
5988 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5991 aSig
= extractFloatx80Frac( a
);
5992 aExp
= extractFloatx80Exp( a
);
5993 bSig
= extractFloatx80Frac( b
);
5994 bExp
= extractFloatx80Exp( b
);
5995 expDiff
= aExp
- bExp
;
5996 if ( 0 < expDiff
) goto aExpBigger
;
5997 if ( expDiff
< 0 ) goto bExpBigger
;
5998 if ( aExp
== 0x7FFF ) {
5999 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
6000 return propagateFloatx80NaN(a
, b
, status
);
6002 float_raise(float_flag_invalid
, status
);
6003 return floatx80_default_nan(status
);
6010 if ( bSig
< aSig
) goto aBigger
;
6011 if ( aSig
< bSig
) goto bBigger
;
6012 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
6014 if ( bExp
== 0x7FFF ) {
6015 if ((uint64_t)(bSig
<< 1)) {
6016 return propagateFloatx80NaN(a
, b
, status
);
6018 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
6019 floatx80_infinity_low
);
6021 if ( aExp
== 0 ) ++expDiff
;
6022 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
6024 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
6027 goto normalizeRoundAndPack
;
6029 if ( aExp
== 0x7FFF ) {
6030 if ((uint64_t)(aSig
<< 1)) {
6031 return propagateFloatx80NaN(a
, b
, status
);
6035 if ( bExp
== 0 ) --expDiff
;
6036 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
6038 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
6040 normalizeRoundAndPack
:
6041 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
6042 zSign
, zExp
, zSig0
, zSig1
, status
);
6045 /*----------------------------------------------------------------------------
6046 | Returns the result of adding the extended double-precision floating-point
6047 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6048 | Standard for Binary Floating-Point Arithmetic.
6049 *----------------------------------------------------------------------------*/
6051 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
6055 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6056 float_raise(float_flag_invalid
, status
);
6057 return floatx80_default_nan(status
);
6059 aSign
= extractFloatx80Sign( a
);
6060 bSign
= extractFloatx80Sign( b
);
6061 if ( aSign
== bSign
) {
6062 return addFloatx80Sigs(a
, b
, aSign
, status
);
6065 return subFloatx80Sigs(a
, b
, aSign
, status
);
6070 /*----------------------------------------------------------------------------
6071 | Returns the result of subtracting the extended double-precision floating-
6072 | point values `a' and `b'. The operation is performed according to the
6073 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6074 *----------------------------------------------------------------------------*/
6076 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
6080 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6081 float_raise(float_flag_invalid
, status
);
6082 return floatx80_default_nan(status
);
6084 aSign
= extractFloatx80Sign( a
);
6085 bSign
= extractFloatx80Sign( b
);
6086 if ( aSign
== bSign
) {
6087 return subFloatx80Sigs(a
, b
, aSign
, status
);
6090 return addFloatx80Sigs(a
, b
, aSign
, status
);
6095 /*----------------------------------------------------------------------------
6096 | Returns the result of multiplying the extended double-precision floating-
6097 | point values `a' and `b'. The operation is performed according to the
6098 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6099 *----------------------------------------------------------------------------*/
6101 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6103 bool aSign
, bSign
, zSign
;
6104 int32_t aExp
, bExp
, zExp
;
6105 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6107 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6108 float_raise(float_flag_invalid
, status
);
6109 return floatx80_default_nan(status
);
6111 aSig
= extractFloatx80Frac( a
);
6112 aExp
= extractFloatx80Exp( a
);
6113 aSign
= extractFloatx80Sign( a
);
6114 bSig
= extractFloatx80Frac( b
);
6115 bExp
= extractFloatx80Exp( b
);
6116 bSign
= extractFloatx80Sign( b
);
6117 zSign
= aSign
^ bSign
;
6118 if ( aExp
== 0x7FFF ) {
6119 if ( (uint64_t) ( aSig
<<1 )
6120 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6121 return propagateFloatx80NaN(a
, b
, status
);
6123 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6124 return packFloatx80(zSign
, floatx80_infinity_high
,
6125 floatx80_infinity_low
);
6127 if ( bExp
== 0x7FFF ) {
6128 if ((uint64_t)(bSig
<< 1)) {
6129 return propagateFloatx80NaN(a
, b
, status
);
6131 if ( ( aExp
| aSig
) == 0 ) {
6133 float_raise(float_flag_invalid
, status
);
6134 return floatx80_default_nan(status
);
6136 return packFloatx80(zSign
, floatx80_infinity_high
,
6137 floatx80_infinity_low
);
6140 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6141 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6144 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6145 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6147 zExp
= aExp
+ bExp
- 0x3FFE;
6148 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6149 if ( 0 < (int64_t) zSig0
) {
6150 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6153 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6154 zSign
, zExp
, zSig0
, zSig1
, status
);
6157 /*----------------------------------------------------------------------------
6158 | Returns the result of dividing the extended double-precision floating-point
6159 | value `a' by the corresponding value `b'. The operation is performed
6160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6161 *----------------------------------------------------------------------------*/
6163 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6165 bool aSign
, bSign
, zSign
;
6166 int32_t aExp
, bExp
, zExp
;
6167 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6168 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6170 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6171 float_raise(float_flag_invalid
, status
);
6172 return floatx80_default_nan(status
);
6174 aSig
= extractFloatx80Frac( a
);
6175 aExp
= extractFloatx80Exp( a
);
6176 aSign
= extractFloatx80Sign( a
);
6177 bSig
= extractFloatx80Frac( b
);
6178 bExp
= extractFloatx80Exp( b
);
6179 bSign
= extractFloatx80Sign( b
);
6180 zSign
= aSign
^ bSign
;
6181 if ( aExp
== 0x7FFF ) {
6182 if ((uint64_t)(aSig
<< 1)) {
6183 return propagateFloatx80NaN(a
, b
, status
);
6185 if ( bExp
== 0x7FFF ) {
6186 if ((uint64_t)(bSig
<< 1)) {
6187 return propagateFloatx80NaN(a
, b
, status
);
6191 return packFloatx80(zSign
, floatx80_infinity_high
,
6192 floatx80_infinity_low
);
6194 if ( bExp
== 0x7FFF ) {
6195 if ((uint64_t)(bSig
<< 1)) {
6196 return propagateFloatx80NaN(a
, b
, status
);
6198 return packFloatx80( zSign
, 0, 0 );
6202 if ( ( aExp
| aSig
) == 0 ) {
6204 float_raise(float_flag_invalid
, status
);
6205 return floatx80_default_nan(status
);
6207 float_raise(float_flag_divbyzero
, status
);
6208 return packFloatx80(zSign
, floatx80_infinity_high
,
6209 floatx80_infinity_low
);
6211 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6214 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6215 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6217 zExp
= aExp
- bExp
+ 0x3FFE;
6219 if ( bSig
<= aSig
) {
6220 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6223 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6224 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6225 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6226 while ( (int64_t) rem0
< 0 ) {
6228 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6230 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6231 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6232 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6233 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6234 while ( (int64_t) rem1
< 0 ) {
6236 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6238 zSig1
|= ( ( rem1
| rem2
) != 0 );
6240 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6241 zSign
, zExp
, zSig0
, zSig1
, status
);
6244 /*----------------------------------------------------------------------------
6245 | Returns the remainder of the extended double-precision floating-point value
6246 | `a' with respect to the corresponding value `b'. The operation is performed
6247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6248 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6249 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6250 | the absolute value of the integer quotient.
6251 *----------------------------------------------------------------------------*/
6253 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6254 float_status
*status
)
6257 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6258 uint64_t aSig0
, aSig1
, bSig
;
6259 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6262 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6263 float_raise(float_flag_invalid
, status
);
6264 return floatx80_default_nan(status
);
6266 aSig0
= extractFloatx80Frac( a
);
6267 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6268 aSign
= extractFloatx80Sign( a
);
6269 bSig
= extractFloatx80Frac( b
);
6270 bExp
= extractFloatx80Exp( b
);
6271 if ( aExp
== 0x7FFF ) {
6272 if ( (uint64_t) ( aSig0
<<1 )
6273 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6274 return propagateFloatx80NaN(a
, b
, status
);
6278 if ( bExp
== 0x7FFF ) {
6279 if ((uint64_t)(bSig
<< 1)) {
6280 return propagateFloatx80NaN(a
, b
, status
);
6282 if (aExp
== 0 && aSig0
>> 63) {
6284 * Pseudo-denormal argument must be returned in normalized
6287 return packFloatx80(aSign
, 1, aSig0
);
6294 float_raise(float_flag_invalid
, status
);
6295 return floatx80_default_nan(status
);
6297 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6300 if ( aSig0
== 0 ) return a
;
6301 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6304 expDiff
= aExp
- bExp
;
6306 if ( expDiff
< 0 ) {
6307 if ( mod
|| expDiff
< -1 ) {
6308 if (aExp
== 1 && aExpOrig
== 0) {
6310 * Pseudo-denormal argument must be returned in
6313 return packFloatx80(aSign
, aExp
, aSig0
);
6317 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6320 *quotient
= q
= ( bSig
<= aSig0
);
6321 if ( q
) aSig0
-= bSig
;
6323 while ( 0 < expDiff
) {
6324 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6325 q
= ( 2 < q
) ? q
- 2 : 0;
6326 mul64To128( bSig
, q
, &term0
, &term1
);
6327 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6328 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6334 if ( 0 < expDiff
) {
6335 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6336 q
= ( 2 < q
) ? q
- 2 : 0;
6338 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6339 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6340 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6341 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6343 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6346 *quotient
<<= expDiff
;
6357 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6358 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6359 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6362 aSig0
= alternateASig0
;
6363 aSig1
= alternateASig1
;
6369 normalizeRoundAndPackFloatx80(
6370 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6374 /*----------------------------------------------------------------------------
6375 | Returns the remainder of the extended double-precision floating-point value
6376 | `a' with respect to the corresponding value `b'. The operation is performed
6377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6378 *----------------------------------------------------------------------------*/
6380 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6383 return floatx80_modrem(a
, b
, false, "ient
, status
);
6386 /*----------------------------------------------------------------------------
6387 | Returns the remainder of the extended double-precision floating-point value
6388 | `a' with respect to the corresponding value `b', with the quotient truncated
6390 *----------------------------------------------------------------------------*/
6392 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6395 return floatx80_modrem(a
, b
, true, "ient
, status
);
6398 /*----------------------------------------------------------------------------
6399 | Returns the square root of the extended double-precision floating-point
6400 | value `a'. The operation is performed according to the IEC/IEEE Standard
6401 | for Binary Floating-Point Arithmetic.
6402 *----------------------------------------------------------------------------*/
6404 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6408 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6409 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6411 if (floatx80_invalid_encoding(a
)) {
6412 float_raise(float_flag_invalid
, status
);
6413 return floatx80_default_nan(status
);
6415 aSig0
= extractFloatx80Frac( a
);
6416 aExp
= extractFloatx80Exp( a
);
6417 aSign
= extractFloatx80Sign( a
);
6418 if ( aExp
== 0x7FFF ) {
6419 if ((uint64_t)(aSig0
<< 1)) {
6420 return propagateFloatx80NaN(a
, a
, status
);
6422 if ( ! aSign
) return a
;
6426 if ( ( aExp
| aSig0
) == 0 ) return a
;
6428 float_raise(float_flag_invalid
, status
);
6429 return floatx80_default_nan(status
);
6432 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6433 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6435 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6436 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6437 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6438 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6439 doubleZSig0
= zSig0
<<1;
6440 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6441 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6442 while ( (int64_t) rem0
< 0 ) {
6445 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6447 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6448 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6449 if ( zSig1
== 0 ) zSig1
= 1;
6450 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6451 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6452 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6453 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6454 while ( (int64_t) rem1
< 0 ) {
6456 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6458 term2
|= doubleZSig0
;
6459 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6461 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6463 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6464 zSig0
|= doubleZSig0
;
6465 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6466 0, zExp
, zSig0
, zSig1
, status
);
6469 /*----------------------------------------------------------------------------
6470 | Returns the result of converting the quadruple-precision floating-point
6471 | value `a' to the 32-bit two's complement integer format. The conversion
6472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6473 | Arithmetic---which means in particular that the conversion is rounded
6474 | according to the current rounding mode. If `a' is a NaN, the largest
6475 | positive integer is returned. Otherwise, if the conversion overflows, the
6476 | largest integer with the same sign as `a' is returned.
6477 *----------------------------------------------------------------------------*/
6479 int32_t float128_to_int32(float128 a
, float_status
*status
)
6482 int32_t aExp
, shiftCount
;
6483 uint64_t aSig0
, aSig1
;
6485 aSig1
= extractFloat128Frac1( a
);
6486 aSig0
= extractFloat128Frac0( a
);
6487 aExp
= extractFloat128Exp( a
);
6488 aSign
= extractFloat128Sign( a
);
6489 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6490 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6491 aSig0
|= ( aSig1
!= 0 );
6492 shiftCount
= 0x4028 - aExp
;
6493 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6494 return roundAndPackInt32(aSign
, aSig0
, status
);
6498 /*----------------------------------------------------------------------------
6499 | Returns the result of converting the quadruple-precision floating-point
6500 | value `a' to the 32-bit two's complement integer format. The conversion
6501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6502 | Arithmetic, except that the conversion is always rounded toward zero. If
6503 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6504 | conversion overflows, the largest integer with the same sign as `a' is
6506 *----------------------------------------------------------------------------*/
6508 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6511 int32_t aExp
, shiftCount
;
6512 uint64_t aSig0
, aSig1
, savedASig
;
6515 aSig1
= extractFloat128Frac1( a
);
6516 aSig0
= extractFloat128Frac0( a
);
6517 aExp
= extractFloat128Exp( a
);
6518 aSign
= extractFloat128Sign( a
);
6519 aSig0
|= ( aSig1
!= 0 );
6520 if ( 0x401E < aExp
) {
6521 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6524 else if ( aExp
< 0x3FFF ) {
6525 if (aExp
|| aSig0
) {
6526 float_raise(float_flag_inexact
, status
);
6530 aSig0
|= UINT64_C(0x0001000000000000);
6531 shiftCount
= 0x402F - aExp
;
6533 aSig0
>>= shiftCount
;
6535 if ( aSign
) z
= - z
;
6536 if ( ( z
< 0 ) ^ aSign
) {
6538 float_raise(float_flag_invalid
, status
);
6539 return aSign
? INT32_MIN
: INT32_MAX
;
6541 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6542 float_raise(float_flag_inexact
, status
);
6548 /*----------------------------------------------------------------------------
6549 | Returns the result of converting the quadruple-precision floating-point
6550 | value `a' to the 64-bit two's complement integer format. The conversion
6551 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6552 | Arithmetic---which means in particular that the conversion is rounded
6553 | according to the current rounding mode. If `a' is a NaN, the largest
6554 | positive integer is returned. Otherwise, if the conversion overflows, the
6555 | largest integer with the same sign as `a' is returned.
6556 *----------------------------------------------------------------------------*/
6558 int64_t float128_to_int64(float128 a
, float_status
*status
)
6561 int32_t aExp
, shiftCount
;
6562 uint64_t aSig0
, aSig1
;
6564 aSig1
= extractFloat128Frac1( a
);
6565 aSig0
= extractFloat128Frac0( a
);
6566 aExp
= extractFloat128Exp( a
);
6567 aSign
= extractFloat128Sign( a
);
6568 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6569 shiftCount
= 0x402F - aExp
;
6570 if ( shiftCount
<= 0 ) {
6571 if ( 0x403E < aExp
) {
6572 float_raise(float_flag_invalid
, status
);
6574 || ( ( aExp
== 0x7FFF )
6575 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6582 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6585 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6587 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6591 /*----------------------------------------------------------------------------
6592 | Returns the result of converting the quadruple-precision floating-point
6593 | value `a' to the 64-bit two's complement integer format. The conversion
6594 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6595 | Arithmetic, except that the conversion is always rounded toward zero.
6596 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6597 | the conversion overflows, the largest integer with the same sign as `a' is
6599 *----------------------------------------------------------------------------*/
6601 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6604 int32_t aExp
, shiftCount
;
6605 uint64_t aSig0
, aSig1
;
6608 aSig1
= extractFloat128Frac1( a
);
6609 aSig0
= extractFloat128Frac0( a
);
6610 aExp
= extractFloat128Exp( a
);
6611 aSign
= extractFloat128Sign( a
);
6612 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6613 shiftCount
= aExp
- 0x402F;
6614 if ( 0 < shiftCount
) {
6615 if ( 0x403E <= aExp
) {
6616 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6617 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6618 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6620 float_raise(float_flag_inexact
, status
);
6624 float_raise(float_flag_invalid
, status
);
6625 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6631 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6632 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6633 float_raise(float_flag_inexact
, status
);
6637 if ( aExp
< 0x3FFF ) {
6638 if ( aExp
| aSig0
| aSig1
) {
6639 float_raise(float_flag_inexact
, status
);
6643 z
= aSig0
>>( - shiftCount
);
6645 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6646 float_raise(float_flag_inexact
, status
);
6649 if ( aSign
) z
= - z
;
6654 /*----------------------------------------------------------------------------
6655 | Returns the result of converting the quadruple-precision floating-point value
6656 | `a' to the 64-bit unsigned integer format. The conversion is
6657 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6658 | Arithmetic---which means in particular that the conversion is rounded
6659 | according to the current rounding mode. If `a' is a NaN, the largest
6660 | positive integer is returned. If the conversion overflows, the
6661 | largest unsigned integer is returned. If 'a' is negative, the value is
6662 | rounded and zero is returned; negative values that do not round to zero
6663 | will raise the inexact exception.
6664 *----------------------------------------------------------------------------*/
6666 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6671 uint64_t aSig0
, aSig1
;
6673 aSig0
= extractFloat128Frac0(a
);
6674 aSig1
= extractFloat128Frac1(a
);
6675 aExp
= extractFloat128Exp(a
);
6676 aSign
= extractFloat128Sign(a
);
6677 if (aSign
&& (aExp
> 0x3FFE)) {
6678 float_raise(float_flag_invalid
, status
);
6679 if (float128_is_any_nan(a
)) {
6686 aSig0
|= UINT64_C(0x0001000000000000);
6688 shiftCount
= 0x402F - aExp
;
6689 if (shiftCount
<= 0) {
6690 if (0x403E < aExp
) {
6691 float_raise(float_flag_invalid
, status
);
6694 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6696 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6698 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6701 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6704 signed char current_rounding_mode
= status
->float_rounding_mode
;
6706 set_float_rounding_mode(float_round_to_zero
, status
);
6707 v
= float128_to_uint64(a
, status
);
6708 set_float_rounding_mode(current_rounding_mode
, status
);
6713 /*----------------------------------------------------------------------------
6714 | Returns the result of converting the quadruple-precision floating-point
6715 | value `a' to the 32-bit unsigned integer format. The conversion
6716 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6717 | Arithmetic except that the conversion is always rounded toward zero.
6718 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6719 | if the conversion overflows, the largest unsigned integer is returned.
6720 | If 'a' is negative, the value is rounded and zero is returned; negative
6721 | values that do not round to zero will raise the inexact exception.
6722 *----------------------------------------------------------------------------*/
6724 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6728 int old_exc_flags
= get_float_exception_flags(status
);
6730 v
= float128_to_uint64_round_to_zero(a
, status
);
6731 if (v
> 0xffffffff) {
6736 set_float_exception_flags(old_exc_flags
, status
);
6737 float_raise(float_flag_invalid
, status
);
6741 /*----------------------------------------------------------------------------
6742 | Returns the result of converting the quadruple-precision floating-point value
6743 | `a' to the 32-bit unsigned integer format. The conversion is
6744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6745 | Arithmetic---which means in particular that the conversion is rounded
6746 | according to the current rounding mode. If `a' is a NaN, the largest
6747 | positive integer is returned. If the conversion overflows, the
6748 | largest unsigned integer is returned. If 'a' is negative, the value is
6749 | rounded and zero is returned; negative values that do not round to zero
6750 | will raise the inexact exception.
6751 *----------------------------------------------------------------------------*/
6753 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6757 int old_exc_flags
= get_float_exception_flags(status
);
6759 v
= float128_to_uint64(a
, status
);
6760 if (v
> 0xffffffff) {
6765 set_float_exception_flags(old_exc_flags
, status
);
6766 float_raise(float_flag_invalid
, status
);
6770 /*----------------------------------------------------------------------------
6771 | Returns the result of converting the quadruple-precision floating-point
6772 | value `a' to the single-precision floating-point format. The conversion
6773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6775 *----------------------------------------------------------------------------*/
6777 float32
float128_to_float32(float128 a
, float_status
*status
)
6781 uint64_t aSig0
, aSig1
;
6784 aSig1
= extractFloat128Frac1( a
);
6785 aSig0
= extractFloat128Frac0( a
);
6786 aExp
= extractFloat128Exp( a
);
6787 aSign
= extractFloat128Sign( a
);
6788 if ( aExp
== 0x7FFF ) {
6789 if ( aSig0
| aSig1
) {
6790 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6792 return packFloat32( aSign
, 0xFF, 0 );
6794 aSig0
|= ( aSig1
!= 0 );
6795 shift64RightJamming( aSig0
, 18, &aSig0
);
6797 if ( aExp
|| zSig
) {
6801 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6805 /*----------------------------------------------------------------------------
6806 | Returns the result of converting the quadruple-precision floating-point
6807 | value `a' to the double-precision floating-point format. The conversion
6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6810 *----------------------------------------------------------------------------*/
6812 float64
float128_to_float64(float128 a
, float_status
*status
)
6816 uint64_t aSig0
, aSig1
;
6818 aSig1
= extractFloat128Frac1( a
);
6819 aSig0
= extractFloat128Frac0( a
);
6820 aExp
= extractFloat128Exp( a
);
6821 aSign
= extractFloat128Sign( a
);
6822 if ( aExp
== 0x7FFF ) {
6823 if ( aSig0
| aSig1
) {
6824 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6826 return packFloat64( aSign
, 0x7FF, 0 );
6828 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6829 aSig0
|= ( aSig1
!= 0 );
6830 if ( aExp
|| aSig0
) {
6831 aSig0
|= UINT64_C(0x4000000000000000);
6834 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6838 /*----------------------------------------------------------------------------
6839 | Returns the result of converting the quadruple-precision floating-point
6840 | value `a' to the extended double-precision floating-point format. The
6841 | conversion is performed according to the IEC/IEEE Standard for Binary
6842 | Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6845 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6849 uint64_t aSig0
, aSig1
;
6851 aSig1
= extractFloat128Frac1( a
);
6852 aSig0
= extractFloat128Frac0( a
);
6853 aExp
= extractFloat128Exp( a
);
6854 aSign
= extractFloat128Sign( a
);
6855 if ( aExp
== 0x7FFF ) {
6856 if ( aSig0
| aSig1
) {
6857 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6859 return floatx80_silence_nan(res
, status
);
6861 return packFloatx80(aSign
, floatx80_infinity_high
,
6862 floatx80_infinity_low
);
6865 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6866 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6869 aSig0
|= UINT64_C(0x0001000000000000);
6871 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6872 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6876 /*----------------------------------------------------------------------------
6877 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6878 | returns the result as a quadruple-precision floating-point value. The
6879 | operation is performed according to the IEC/IEEE Standard for Binary
6880 | Floating-Point Arithmetic.
6881 *----------------------------------------------------------------------------*/
6883 float128
float128_round_to_int(float128 a
, float_status
*status
)
6887 uint64_t lastBitMask
, roundBitsMask
;
6890 aExp
= extractFloat128Exp( a
);
6891 if ( 0x402F <= aExp
) {
6892 if ( 0x406F <= aExp
) {
6893 if ( ( aExp
== 0x7FFF )
6894 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6896 return propagateFloat128NaN(a
, a
, status
);
6901 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6902 roundBitsMask
= lastBitMask
- 1;
6904 switch (status
->float_rounding_mode
) {
6905 case float_round_nearest_even
:
6906 if ( lastBitMask
) {
6907 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6908 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6911 if ( (int64_t) z
.low
< 0 ) {
6913 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6917 case float_round_ties_away
:
6919 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6921 if ((int64_t) z
.low
< 0) {
6926 case float_round_to_zero
:
6928 case float_round_up
:
6929 if (!extractFloat128Sign(z
)) {
6930 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6933 case float_round_down
:
6934 if (extractFloat128Sign(z
)) {
6935 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6938 case float_round_to_odd
:
6940 * Note that if lastBitMask == 0, the last bit is the lsb
6941 * of high, and roundBitsMask == -1.
6943 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6944 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6950 z
.low
&= ~ roundBitsMask
;
6953 if ( aExp
< 0x3FFF ) {
6954 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6955 float_raise(float_flag_inexact
, status
);
6956 aSign
= extractFloat128Sign( a
);
6957 switch (status
->float_rounding_mode
) {
6958 case float_round_nearest_even
:
6959 if ( ( aExp
== 0x3FFE )
6960 && ( extractFloat128Frac0( a
)
6961 | extractFloat128Frac1( a
) )
6963 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6966 case float_round_ties_away
:
6967 if (aExp
== 0x3FFE) {
6968 return packFloat128(aSign
, 0x3FFF, 0, 0);
6971 case float_round_down
:
6973 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6974 : packFloat128( 0, 0, 0, 0 );
6975 case float_round_up
:
6977 aSign
? packFloat128( 1, 0, 0, 0 )
6978 : packFloat128( 0, 0x3FFF, 0, 0 );
6980 case float_round_to_odd
:
6981 return packFloat128(aSign
, 0x3FFF, 0, 0);
6983 case float_round_to_zero
:
6986 return packFloat128( aSign
, 0, 0, 0 );
6989 lastBitMask
<<= 0x402F - aExp
;
6990 roundBitsMask
= lastBitMask
- 1;
6993 switch (status
->float_rounding_mode
) {
6994 case float_round_nearest_even
:
6995 z
.high
+= lastBitMask
>>1;
6996 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6997 z
.high
&= ~ lastBitMask
;
7000 case float_round_ties_away
:
7001 z
.high
+= lastBitMask
>>1;
7003 case float_round_to_zero
:
7005 case float_round_up
:
7006 if (!extractFloat128Sign(z
)) {
7007 z
.high
|= ( a
.low
!= 0 );
7008 z
.high
+= roundBitsMask
;
7011 case float_round_down
:
7012 if (extractFloat128Sign(z
)) {
7013 z
.high
|= (a
.low
!= 0);
7014 z
.high
+= roundBitsMask
;
7017 case float_round_to_odd
:
7018 if ((z
.high
& lastBitMask
) == 0) {
7019 z
.high
|= (a
.low
!= 0);
7020 z
.high
+= roundBitsMask
;
7026 z
.high
&= ~ roundBitsMask
;
7028 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7029 float_raise(float_flag_inexact
, status
);
7035 /*----------------------------------------------------------------------------
7036 | Returns the result of adding the absolute values of the quadruple-precision
7037 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7038 | before being returned. `zSign' is ignored if the result is a NaN.
7039 | The addition is performed according to the IEC/IEEE Standard for Binary
7040 | Floating-Point Arithmetic.
7041 *----------------------------------------------------------------------------*/
7043 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7044 float_status
*status
)
7046 int32_t aExp
, bExp
, zExp
;
7047 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7050 aSig1
= extractFloat128Frac1( a
);
7051 aSig0
= extractFloat128Frac0( a
);
7052 aExp
= extractFloat128Exp( a
);
7053 bSig1
= extractFloat128Frac1( b
);
7054 bSig0
= extractFloat128Frac0( b
);
7055 bExp
= extractFloat128Exp( b
);
7056 expDiff
= aExp
- bExp
;
7057 if ( 0 < expDiff
) {
7058 if ( aExp
== 0x7FFF ) {
7059 if (aSig0
| aSig1
) {
7060 return propagateFloat128NaN(a
, b
, status
);
7068 bSig0
|= UINT64_C(0x0001000000000000);
7070 shift128ExtraRightJamming(
7071 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7074 else if ( expDiff
< 0 ) {
7075 if ( bExp
== 0x7FFF ) {
7076 if (bSig0
| bSig1
) {
7077 return propagateFloat128NaN(a
, b
, status
);
7079 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7085 aSig0
|= UINT64_C(0x0001000000000000);
7087 shift128ExtraRightJamming(
7088 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7092 if ( aExp
== 0x7FFF ) {
7093 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7094 return propagateFloat128NaN(a
, b
, status
);
7098 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7100 if (status
->flush_to_zero
) {
7101 if (zSig0
| zSig1
) {
7102 float_raise(float_flag_output_denormal
, status
);
7104 return packFloat128(zSign
, 0, 0, 0);
7106 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7109 zSig0
|= UINT64_C(0x0002000000000000);
7113 aSig0
|= UINT64_C(0x0001000000000000);
7114 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7116 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
7119 shift128ExtraRightJamming(
7120 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7122 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7126 /*----------------------------------------------------------------------------
7127 | Returns the result of subtracting the absolute values of the quadruple-
7128 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7129 | difference is negated before being returned. `zSign' is ignored if the
7130 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7131 | Standard for Binary Floating-Point Arithmetic.
7132 *----------------------------------------------------------------------------*/
7134 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7135 float_status
*status
)
7137 int32_t aExp
, bExp
, zExp
;
7138 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7141 aSig1
= extractFloat128Frac1( a
);
7142 aSig0
= extractFloat128Frac0( a
);
7143 aExp
= extractFloat128Exp( a
);
7144 bSig1
= extractFloat128Frac1( b
);
7145 bSig0
= extractFloat128Frac0( b
);
7146 bExp
= extractFloat128Exp( b
);
7147 expDiff
= aExp
- bExp
;
7148 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7149 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7150 if ( 0 < expDiff
) goto aExpBigger
;
7151 if ( expDiff
< 0 ) goto bExpBigger
;
7152 if ( aExp
== 0x7FFF ) {
7153 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7154 return propagateFloat128NaN(a
, b
, status
);
7156 float_raise(float_flag_invalid
, status
);
7157 return float128_default_nan(status
);
7163 if ( bSig0
< aSig0
) goto aBigger
;
7164 if ( aSig0
< bSig0
) goto bBigger
;
7165 if ( bSig1
< aSig1
) goto aBigger
;
7166 if ( aSig1
< bSig1
) goto bBigger
;
7167 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7170 if ( bExp
== 0x7FFF ) {
7171 if (bSig0
| bSig1
) {
7172 return propagateFloat128NaN(a
, b
, status
);
7174 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7180 aSig0
|= UINT64_C(0x4000000000000000);
7182 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7183 bSig0
|= UINT64_C(0x4000000000000000);
7185 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7188 goto normalizeRoundAndPack
;
7190 if ( aExp
== 0x7FFF ) {
7191 if (aSig0
| aSig1
) {
7192 return propagateFloat128NaN(a
, b
, status
);
7200 bSig0
|= UINT64_C(0x4000000000000000);
7202 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7203 aSig0
|= UINT64_C(0x4000000000000000);
7205 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7207 normalizeRoundAndPack
:
7209 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7214 /*----------------------------------------------------------------------------
7215 | Returns the result of adding the quadruple-precision floating-point values
7216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7217 | for Binary Floating-Point Arithmetic.
7218 *----------------------------------------------------------------------------*/
7220 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7224 aSign
= extractFloat128Sign( a
);
7225 bSign
= extractFloat128Sign( b
);
7226 if ( aSign
== bSign
) {
7227 return addFloat128Sigs(a
, b
, aSign
, status
);
7230 return subFloat128Sigs(a
, b
, aSign
, status
);
7235 /*----------------------------------------------------------------------------
7236 | Returns the result of subtracting the quadruple-precision floating-point
7237 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7238 | Standard for Binary Floating-Point Arithmetic.
7239 *----------------------------------------------------------------------------*/
7241 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7245 aSign
= extractFloat128Sign( a
);
7246 bSign
= extractFloat128Sign( b
);
7247 if ( aSign
== bSign
) {
7248 return subFloat128Sigs(a
, b
, aSign
, status
);
7251 return addFloat128Sigs(a
, b
, aSign
, status
);
7256 /*----------------------------------------------------------------------------
7257 | Returns the result of multiplying the quadruple-precision floating-point
7258 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7259 | Standard for Binary Floating-Point Arithmetic.
7260 *----------------------------------------------------------------------------*/
7262 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7264 bool aSign
, bSign
, zSign
;
7265 int32_t aExp
, bExp
, zExp
;
7266 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7268 aSig1
= extractFloat128Frac1( a
);
7269 aSig0
= extractFloat128Frac0( a
);
7270 aExp
= extractFloat128Exp( a
);
7271 aSign
= extractFloat128Sign( a
);
7272 bSig1
= extractFloat128Frac1( b
);
7273 bSig0
= extractFloat128Frac0( b
);
7274 bExp
= extractFloat128Exp( b
);
7275 bSign
= extractFloat128Sign( b
);
7276 zSign
= aSign
^ bSign
;
7277 if ( aExp
== 0x7FFF ) {
7278 if ( ( aSig0
| aSig1
)
7279 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7280 return propagateFloat128NaN(a
, b
, status
);
7282 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7283 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7285 if ( bExp
== 0x7FFF ) {
7286 if (bSig0
| bSig1
) {
7287 return propagateFloat128NaN(a
, b
, status
);
7289 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7291 float_raise(float_flag_invalid
, status
);
7292 return float128_default_nan(status
);
7294 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7297 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7298 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7301 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7302 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7304 zExp
= aExp
+ bExp
- 0x4000;
7305 aSig0
|= UINT64_C(0x0001000000000000);
7306 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7307 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7308 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7309 zSig2
|= ( zSig3
!= 0 );
7310 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7311 shift128ExtraRightJamming(
7312 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7315 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7319 /*----------------------------------------------------------------------------
7320 | Returns the result of dividing the quadruple-precision floating-point value
7321 | `a' by the corresponding value `b'. The operation is performed according to
7322 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7323 *----------------------------------------------------------------------------*/
7325 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7327 bool aSign
, bSign
, zSign
;
7328 int32_t aExp
, bExp
, zExp
;
7329 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7330 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7332 aSig1
= extractFloat128Frac1( a
);
7333 aSig0
= extractFloat128Frac0( a
);
7334 aExp
= extractFloat128Exp( a
);
7335 aSign
= extractFloat128Sign( a
);
7336 bSig1
= extractFloat128Frac1( b
);
7337 bSig0
= extractFloat128Frac0( b
);
7338 bExp
= extractFloat128Exp( b
);
7339 bSign
= extractFloat128Sign( b
);
7340 zSign
= aSign
^ bSign
;
7341 if ( aExp
== 0x7FFF ) {
7342 if (aSig0
| aSig1
) {
7343 return propagateFloat128NaN(a
, b
, status
);
7345 if ( bExp
== 0x7FFF ) {
7346 if (bSig0
| bSig1
) {
7347 return propagateFloat128NaN(a
, b
, status
);
7351 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7353 if ( bExp
== 0x7FFF ) {
7354 if (bSig0
| bSig1
) {
7355 return propagateFloat128NaN(a
, b
, status
);
7357 return packFloat128( zSign
, 0, 0, 0 );
7360 if ( ( bSig0
| bSig1
) == 0 ) {
7361 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7363 float_raise(float_flag_invalid
, status
);
7364 return float128_default_nan(status
);
7366 float_raise(float_flag_divbyzero
, status
);
7367 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7369 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7372 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7373 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7375 zExp
= aExp
- bExp
+ 0x3FFD;
7377 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7379 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7380 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7381 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7384 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7385 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7386 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7387 while ( (int64_t) rem0
< 0 ) {
7389 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7391 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7392 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7393 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7394 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7395 while ( (int64_t) rem1
< 0 ) {
7397 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7399 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7401 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7402 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7406 /*----------------------------------------------------------------------------
7407 | Returns the remainder of the quadruple-precision floating-point value `a'
7408 | with respect to the corresponding value `b'. The operation is performed
7409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7410 *----------------------------------------------------------------------------*/
7412 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7415 int32_t aExp
, bExp
, expDiff
;
7416 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7417 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7420 aSig1
= extractFloat128Frac1( a
);
7421 aSig0
= extractFloat128Frac0( a
);
7422 aExp
= extractFloat128Exp( a
);
7423 aSign
= extractFloat128Sign( a
);
7424 bSig1
= extractFloat128Frac1( b
);
7425 bSig0
= extractFloat128Frac0( b
);
7426 bExp
= extractFloat128Exp( b
);
7427 if ( aExp
== 0x7FFF ) {
7428 if ( ( aSig0
| aSig1
)
7429 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7430 return propagateFloat128NaN(a
, b
, status
);
7434 if ( bExp
== 0x7FFF ) {
7435 if (bSig0
| bSig1
) {
7436 return propagateFloat128NaN(a
, b
, status
);
7441 if ( ( bSig0
| bSig1
) == 0 ) {
7443 float_raise(float_flag_invalid
, status
);
7444 return float128_default_nan(status
);
7446 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7449 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7450 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7452 expDiff
= aExp
- bExp
;
7453 if ( expDiff
< -1 ) return a
;
7455 aSig0
| UINT64_C(0x0001000000000000),
7457 15 - ( expDiff
< 0 ),
7462 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7463 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7464 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7466 while ( 0 < expDiff
) {
7467 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7468 q
= ( 4 < q
) ? q
- 4 : 0;
7469 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7470 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7471 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7472 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7475 if ( -64 < expDiff
) {
7476 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7477 q
= ( 4 < q
) ? q
- 4 : 0;
7479 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7481 if ( expDiff
< 0 ) {
7482 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7485 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7487 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7488 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7491 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7492 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7495 alternateASig0
= aSig0
;
7496 alternateASig1
= aSig1
;
7498 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7499 } while ( 0 <= (int64_t) aSig0
);
7501 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7502 if ( ( sigMean0
< 0 )
7503 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7504 aSig0
= alternateASig0
;
7505 aSig1
= alternateASig1
;
7507 zSign
= ( (int64_t) aSig0
< 0 );
7508 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7509 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7513 /*----------------------------------------------------------------------------
7514 | Returns the square root of the quadruple-precision floating-point value `a'.
7515 | The operation is performed according to the IEC/IEEE Standard for Binary
7516 | Floating-Point Arithmetic.
7517 *----------------------------------------------------------------------------*/
7519 float128
float128_sqrt(float128 a
, float_status
*status
)
7523 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7524 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7526 aSig1
= extractFloat128Frac1( a
);
7527 aSig0
= extractFloat128Frac0( a
);
7528 aExp
= extractFloat128Exp( a
);
7529 aSign
= extractFloat128Sign( a
);
7530 if ( aExp
== 0x7FFF ) {
7531 if (aSig0
| aSig1
) {
7532 return propagateFloat128NaN(a
, a
, status
);
7534 if ( ! aSign
) return a
;
7538 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7540 float_raise(float_flag_invalid
, status
);
7541 return float128_default_nan(status
);
7544 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7545 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7547 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7548 aSig0
|= UINT64_C(0x0001000000000000);
7549 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7550 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7551 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7552 doubleZSig0
= zSig0
<<1;
7553 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7554 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7555 while ( (int64_t) rem0
< 0 ) {
7558 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7560 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7561 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7562 if ( zSig1
== 0 ) zSig1
= 1;
7563 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7564 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7565 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7566 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7567 while ( (int64_t) rem1
< 0 ) {
7569 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7571 term2
|= doubleZSig0
;
7572 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7574 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7576 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7577 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7581 static inline FloatRelation
7582 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7583 float_status
*status
)
7587 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7588 float_raise(float_flag_invalid
, status
);
7589 return float_relation_unordered
;
7591 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7592 ( extractFloatx80Frac( a
)<<1 ) ) ||
7593 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7594 ( extractFloatx80Frac( b
)<<1 ) )) {
7596 floatx80_is_signaling_nan(a
, status
) ||
7597 floatx80_is_signaling_nan(b
, status
)) {
7598 float_raise(float_flag_invalid
, status
);
7600 return float_relation_unordered
;
7602 aSign
= extractFloatx80Sign( a
);
7603 bSign
= extractFloatx80Sign( b
);
7604 if ( aSign
!= bSign
) {
7606 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7607 ( ( a
.low
| b
.low
) == 0 ) ) {
7609 return float_relation_equal
;
7611 return 1 - (2 * aSign
);
7614 /* Normalize pseudo-denormals before comparison. */
7615 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7618 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7621 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7622 return float_relation_equal
;
7624 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7629 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7631 return floatx80_compare_internal(a
, b
, 0, status
);
7634 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7635 float_status
*status
)
7637 return floatx80_compare_internal(a
, b
, 1, status
);
7640 static inline FloatRelation
7641 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7642 float_status
*status
)
7646 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7647 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7648 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7649 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7651 float128_is_signaling_nan(a
, status
) ||
7652 float128_is_signaling_nan(b
, status
)) {
7653 float_raise(float_flag_invalid
, status
);
7655 return float_relation_unordered
;
7657 aSign
= extractFloat128Sign( a
);
7658 bSign
= extractFloat128Sign( b
);
7659 if ( aSign
!= bSign
) {
7660 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7662 return float_relation_equal
;
7664 return 1 - (2 * aSign
);
7667 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7668 return float_relation_equal
;
7670 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7675 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7677 return float128_compare_internal(a
, b
, 0, status
);
7680 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7681 float_status
*status
)
7683 return float128_compare_internal(a
, b
, 1, status
);
7686 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7692 if (floatx80_invalid_encoding(a
)) {
7693 float_raise(float_flag_invalid
, status
);
7694 return floatx80_default_nan(status
);
7696 aSig
= extractFloatx80Frac( a
);
7697 aExp
= extractFloatx80Exp( a
);
7698 aSign
= extractFloatx80Sign( a
);
7700 if ( aExp
== 0x7FFF ) {
7702 return propagateFloatx80NaN(a
, a
, status
);
7716 } else if (n
< -0x10000) {
7721 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7722 aSign
, aExp
, aSig
, 0, status
);
7725 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7729 uint64_t aSig0
, aSig1
;
7731 aSig1
= extractFloat128Frac1( a
);
7732 aSig0
= extractFloat128Frac0( a
);
7733 aExp
= extractFloat128Exp( a
);
7734 aSign
= extractFloat128Sign( a
);
7735 if ( aExp
== 0x7FFF ) {
7736 if ( aSig0
| aSig1
) {
7737 return propagateFloat128NaN(a
, a
, status
);
7742 aSig0
|= UINT64_C(0x0001000000000000);
7743 } else if (aSig0
== 0 && aSig1
== 0) {
7751 } else if (n
< -0x10000) {
7756 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7761 static void __attribute__((constructor
)) softfloat_init(void)
7763 union_float64 ua
, ub
, uc
, ur
;
7765 if (QEMU_NO_HARDFLOAT
) {
7769 * Test that the host's FMA is not obviously broken. For example,
7770 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7771 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7773 ua
.s
= 0x0020000000000001ULL
;
7774 ub
.s
= 0x3ca0000000000000ULL
;
7775 uc
.s
= 0x0020000000000000ULL
;
7776 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7777 if (ur
.s
!= 0x0020000000000001ULL
) {
7778 force_soft_fma
= true;