fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             float_raise(float_flag_input_denormal, s);                  \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 static inline float32
 343 float32_gen2(float32 xa, float32 xb, float_status *s,
 344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 345              f32_check_fn pre, f32_check_fn post)
 346 {
 347     union_float32 ua, ub, ur;
 348
 349     ua.s = xa;
 350     ub.s = xb;
 351
 352     if (unlikely(!can_use_fpu(s))) {
 353         goto soft;
 354     }
 355
 356     float32_input_flush2(&ua.s, &ub.s, s);
 357     if (unlikely(!pre(ua, ub))) {
 358         goto soft;
 359     }
 360
 361     ur.h = hard(ua.h, ub.h);
 362     if (unlikely(f32_is_inf(ur))) {
 363         float_raise(float_flag_overflow, s);
 364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
 365         goto soft;
 366     }
 367     return ur.s;
 368
 369  soft:
 370     return soft(ua.s, ub.s, s);
 371 }
 372
 373 static inline float64
 374 float64_gen2(float64 xa, float64 xb, float_status *s,
 375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 376              f64_check_fn pre, f64_check_fn post)
 377 {
 378     union_float64 ua, ub, ur;
 379
 380     ua.s = xa;
 381     ub.s = xb;
 382
 383     if (unlikely(!can_use_fpu(s))) {
 384         goto soft;
 385     }
 386
 387     float64_input_flush2(&ua.s, &ub.s, s);
 388     if (unlikely(!pre(ua, ub))) {
 389         goto soft;
 390     }
 391
 392     ur.h = hard(ua.h, ub.h);
 393     if (unlikely(f64_is_inf(ur))) {
 394         float_raise(float_flag_overflow, s);
 395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
 396         goto soft;
 397     }
 398     return ur.s;
 399
 400  soft:
 401     return soft(ua.s, ub.s, s);
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the fraction bits of the single-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline uint32_t extractFloat32Frac(float32 a)
 409 {
 410     return float32_val(a) & 0x007FFFFF;
 411 }
 412
 413 /*----------------------------------------------------------------------------
 414 | Returns the exponent bits of the single-precision floating-point value `a'.
 415 *----------------------------------------------------------------------------*/
 416
 417 static inline int extractFloat32Exp(float32 a)
 418 {
 419     return (float32_val(a) >> 23) & 0xFF;
 420 }
 421
 422 /*----------------------------------------------------------------------------
 423 | Returns the sign bit of the single-precision floating-point value `a'.
 424 *----------------------------------------------------------------------------*/
 425
 426 static inline bool extractFloat32Sign(float32 a)
 427 {
 428     return float32_val(a) >> 31;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the fraction bits of the double-precision floating-point value `a'.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline uint64_t extractFloat64Frac(float64 a)
 436 {
 437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 438 }
 439
 440 /*----------------------------------------------------------------------------
 441 | Returns the exponent bits of the double-precision floating-point value `a'.
 442 *----------------------------------------------------------------------------*/
 443
 444 static inline int extractFloat64Exp(float64 a)
 445 {
 446     return (float64_val(a) >> 52) & 0x7FF;
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Returns the sign bit of the double-precision floating-point value `a'.
 451 *----------------------------------------------------------------------------*/
 452
 453 static inline bool extractFloat64Sign(float64 a)
 454 {
 455     return float64_val(a) >> 63;
 456 }
 457
 458 /*
 459  * Classify a floating point number. Everything above float_class_qnan
 460  * is a NaN so cls >= float_class_qnan is any NaN.
 461  */
 462
 463 typedef enum __attribute__ ((__packed__)) {
 464     float_class_unclassified,
 465     float_class_zero,
 466     float_class_normal,
 467     float_class_inf,
 468     float_class_qnan,  /* all NaNs from here */
 469     float_class_snan,
 470 } FloatClass;
 471
 472 #define float_cmask(bit)  (1u << (bit))
 473
 474 enum {
 475     float_cmask_zero    = float_cmask(float_class_zero),
 476     float_cmask_normal  = float_cmask(float_class_normal),
 477     float_cmask_inf     = float_cmask(float_class_inf),
 478     float_cmask_qnan    = float_cmask(float_class_qnan),
 479     float_cmask_snan    = float_cmask(float_class_snan),
 480
 481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
 482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
 483 };
 484
 485
 486 /* Simple helpers for checking if, or what kind of, NaN we have */
 487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 488 {
 489     return unlikely(c >= float_class_qnan);
 490 }
 491
 492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 493 {
 494     return c == float_class_snan;
 495 }
 496
 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 498 {
 499     return c == float_class_qnan;
 500 }
 501
 502 /*
 503  * Structure holding all of the decomposed parts of a float. The
 504  * exponent is unbiased and the fraction is normalized. All
 505  * calculations are done with a 64 bit fraction and then rounded as
 506  * appropriate for the final format.
 507  *
 508  * Thanks to the packed FloatClass a decent compiler should be able to
 509  * fit the whole structure into registers and avoid using the stack
 510  * for parameter passing.
 511  */
 512
 513 typedef struct {
 514     uint64_t frac;
 515     int32_t  exp;
 516     FloatClass cls;
 517     bool sign;
 518 } FloatParts64;
 519
 520 #define DECOMPOSED_BINARY_POINT    63
 521 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 522
 523 /* Structure holding all of the relevant parameters for a format.
 524  *   exp_size: the size of the exponent field
 525  *   exp_bias: the offset applied to the exponent field
 526  *   exp_max: the maximum normalised exponent
 527  *   frac_size: the size of the fraction field
 528  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 529  * The following are computed based the size of fraction
 530  *   frac_lsb: least significant bit of fraction
 531  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 532  *   round_mask/roundeven_mask: masks used for rounding
 533  * The following optional modifiers are available:
 534  *   arm_althp: handle ARM Alternative Half Precision
 535  */
 536 typedef struct {
 537     int exp_size;
 538     int exp_bias;
 539     int exp_max;
 540     int frac_size;
 541     int frac_shift;
 542     uint64_t frac_lsb;
 543     uint64_t frac_lsbm1;
 544     uint64_t round_mask;
 545     uint64_t roundeven_mask;
 546     bool arm_althp;
 547 } FloatFmt;
 548
 549 /* Expand fields based on the size of exponent and fraction */
 550 #define FLOAT_PARAMS(E, F)                                           \
 551     .exp_size       = E,                                             \
 552     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 553     .exp_max        = (1 << E) - 1,                                  \
 554     .frac_size      = F,                                             \
 555     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 556     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 557     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 558     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 559     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 560
 561 static const FloatFmt float16_params = {
 562     FLOAT_PARAMS(5, 10)
 563 };
 564
 565 static const FloatFmt float16_params_ahp = {
 566     FLOAT_PARAMS(5, 10),
 567     .arm_althp = true
 568 };
 569
 570 static const FloatFmt bfloat16_params = {
 571     FLOAT_PARAMS(8, 7)
 572 };
 573
 574 static const FloatFmt float32_params = {
 575     FLOAT_PARAMS(8, 23)
 576 };
 577
 578 static const FloatFmt float64_params = {
 579     FLOAT_PARAMS(11, 52)
 580 };
 581
 582 /* Unpack a float to parts, but do not canonicalize.  */
 583 static inline FloatParts64 unpack_raw(FloatFmt fmt, uint64_t raw)
 584 {
 585     const int sign_pos = fmt.frac_size + fmt.exp_size;
 586
 587     return (FloatParts64) {
 588         .cls = float_class_unclassified,
 589         .sign = extract64(raw, sign_pos, 1),
 590         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 591         .frac = extract64(raw, 0, fmt.frac_size),
 592     };
 593 }
 594
 595 static inline FloatParts64 float16_unpack_raw(float16 f)
 596 {
 597     return unpack_raw(float16_params, f);
 598 }
 599
 600 static inline FloatParts64 bfloat16_unpack_raw(bfloat16 f)
 601 {
 602     return unpack_raw(bfloat16_params, f);
 603 }
 604
 605 static inline FloatParts64 float32_unpack_raw(float32 f)
 606 {
 607     return unpack_raw(float32_params, f);
 608 }
 609
 610 static inline FloatParts64 float64_unpack_raw(float64 f)
 611 {
 612     return unpack_raw(float64_params, f);
 613 }
 614
 615 /* Pack a float from parts, but do not canonicalize.  */
 616 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts64 p)
 617 {
 618     const int sign_pos = fmt.frac_size + fmt.exp_size;
 619     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 620     return deposit64(ret, sign_pos, 1, p.sign);
 621 }
 622
 623 static inline float16 float16_pack_raw(FloatParts64 p)
 624 {
 625     return make_float16(pack_raw(float16_params, p));
 626 }
 627
 628 static inline bfloat16 bfloat16_pack_raw(FloatParts64 p)
 629 {
 630     return pack_raw(bfloat16_params, p);
 631 }
 632
 633 static inline float32 float32_pack_raw(FloatParts64 p)
 634 {
 635     return make_float32(pack_raw(float32_params, p));
 636 }
 637
 638 static inline float64 float64_pack_raw(FloatParts64 p)
 639 {
 640     return make_float64(pack_raw(float64_params, p));
 641 }
 642
 643 /*----------------------------------------------------------------------------
 644 | Functions and definitions to determine:  (1) whether tininess for underflow
 645 | is detected before or after rounding by default, (2) what (if anything)
 646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 648 | are propagated from function inputs to output.  These details are target-
 649 | specific.
 650 *----------------------------------------------------------------------------*/
 651 #include "softfloat-specialize.c.inc"
 652
 653 /* Canonicalize EXP and FRAC, setting CLS.  */
 654 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
 655                                   float_status *status)
 656 {
 657     if (part.exp == parm->exp_max && !parm->arm_althp) {
 658         if (part.frac == 0) {
 659             part.cls = float_class_inf;
 660         } else {
 661             part.frac <<= parm->frac_shift;
 662             part.cls = (parts_is_snan_frac(part.frac, status)
 663                         ? float_class_snan : float_class_qnan);
 664         }
 665     } else if (part.exp == 0) {
 666         if (likely(part.frac == 0)) {
 667             part.cls = float_class_zero;
 668         } else if (status->flush_inputs_to_zero) {
 669             float_raise(float_flag_input_denormal, status);
 670             part.cls = float_class_zero;
 671             part.frac = 0;
 672         } else {
 673             int shift = clz64(part.frac);
 674             part.cls = float_class_normal;
 675             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 676             part.frac <<= shift;
 677         }
 678     } else {
 679         part.cls = float_class_normal;
 680         part.exp -= parm->exp_bias;
 681         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 682     }
 683     return part;
 684 }
 685
 686 /* Round and uncanonicalize a floating-point number by parts. There
 687  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 688  * fraction; these bits will be removed. The exponent will be biased
 689  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 690  */
 691
 692 static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
 693                                   const FloatFmt *parm)
 694 {
 695     const uint64_t frac_lsb = parm->frac_lsb;
 696     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 697     const uint64_t round_mask = parm->round_mask;
 698     const uint64_t roundeven_mask = parm->roundeven_mask;
 699     const int exp_max = parm->exp_max;
 700     const int frac_shift = parm->frac_shift;
 701     uint64_t frac, inc;
 702     int exp, flags = 0;
 703     bool overflow_norm;
 704
 705     frac = p.frac;
 706     exp = p.exp;
 707
 708     switch (p.cls) {
 709     case float_class_normal:
 710         switch (s->float_rounding_mode) {
 711         case float_round_nearest_even:
 712             overflow_norm = false;
 713             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 714             break;
 715         case float_round_ties_away:
 716             overflow_norm = false;
 717             inc = frac_lsbm1;
 718             break;
 719         case float_round_to_zero:
 720             overflow_norm = true;
 721             inc = 0;
 722             break;
 723         case float_round_up:
 724             inc = p.sign ? 0 : round_mask;
 725             overflow_norm = p.sign;
 726             break;
 727         case float_round_down:
 728             inc = p.sign ? round_mask : 0;
 729             overflow_norm = !p.sign;
 730             break;
 731         case float_round_to_odd:
 732             overflow_norm = true;
 733             inc = frac & frac_lsb ? 0 : round_mask;
 734             break;
 735         default:
 736             g_assert_not_reached();
 737         }
 738
 739         exp += parm->exp_bias;
 740         if (likely(exp > 0)) {
 741             if (frac & round_mask) {
 742                 flags |= float_flag_inexact;
 743                 if (uadd64_overflow(frac, inc, &frac)) {
 744                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
 745                     exp++;
 746                 }
 747             }
 748             frac >>= frac_shift;
 749
 750             if (parm->arm_althp) {
 751                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 752                 if (unlikely(exp > exp_max)) {
 753                     /* Overflow.  Return the maximum normal.  */
 754                     flags = float_flag_invalid;
 755                     exp = exp_max;
 756                     frac = -1;
 757                 }
 758             } else if (unlikely(exp >= exp_max)) {
 759                 flags |= float_flag_overflow | float_flag_inexact;
 760                 if (overflow_norm) {
 761                     exp = exp_max - 1;
 762                     frac = -1;
 763                 } else {
 764                     p.cls = float_class_inf;
 765                     goto do_inf;
 766                 }
 767             }
 768         } else if (s->flush_to_zero) {
 769             flags |= float_flag_output_denormal;
 770             p.cls = float_class_zero;
 771             goto do_zero;
 772         } else {
 773             bool is_tiny = s->tininess_before_rounding || (exp < 0);
 774
 775             if (!is_tiny) {
 776                 uint64_t discard;
 777                 is_tiny = !uadd64_overflow(frac, inc, &discard);
 778             }
 779
 780             shift64RightJamming(frac, 1 - exp, &frac);
 781             if (frac & round_mask) {
 782                 /* Need to recompute round-to-even.  */
 783                 switch (s->float_rounding_mode) {
 784                 case float_round_nearest_even:
 785                     inc = ((frac & roundeven_mask) != frac_lsbm1
 786                            ? frac_lsbm1 : 0);
 787                     break;
 788                 case float_round_to_odd:
 789                     inc = frac & frac_lsb ? 0 : round_mask;
 790                     break;
 791                 default:
 792                     break;
 793                 }
 794                 flags |= float_flag_inexact;
 795                 frac += inc;
 796             }
 797
 798             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 799             frac >>= frac_shift;
 800
 801             if (is_tiny && (flags & float_flag_inexact)) {
 802                 flags |= float_flag_underflow;
 803             }
 804             if (exp == 0 && frac == 0) {
 805                 p.cls = float_class_zero;
 806             }
 807         }
 808         break;
 809
 810     case float_class_zero:
 811     do_zero:
 812         exp = 0;
 813         frac = 0;
 814         break;
 815
 816     case float_class_inf:
 817     do_inf:
 818         assert(!parm->arm_althp);
 819         exp = exp_max;
 820         frac = 0;
 821         break;
 822
 823     case float_class_qnan:
 824     case float_class_snan:
 825         assert(!parm->arm_althp);
 826         exp = exp_max;
 827         frac >>= parm->frac_shift;
 828         break;
 829
 830     default:
 831         g_assert_not_reached();
 832     }
 833
 834     float_raise(flags, s);
 835     p.exp = exp;
 836     p.frac = frac;
 837     return p;
 838 }
 839
 840 /* Explicit FloatFmt version */
 841 static FloatParts64 float16a_unpack_canonical(float16 f, float_status *s,
 842                                             const FloatFmt *params)
 843 {
 844     return sf_canonicalize(float16_unpack_raw(f), params, s);
 845 }
 846
 847 static FloatParts64 float16_unpack_canonical(float16 f, float_status *s)
 848 {
 849     return float16a_unpack_canonical(f, s, &float16_params);
 850 }
 851
 852 static FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s)
 853 {
 854     return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
 855 }
 856
 857 static float16 float16a_round_pack_canonical(FloatParts64 p, float_status *s,
 858                                              const FloatFmt *params)
 859 {
 860     return float16_pack_raw(round_canonical(p, s, params));
 861 }
 862
 863 static float16 float16_round_pack_canonical(FloatParts64 p, float_status *s)
 864 {
 865     return float16a_round_pack_canonical(p, s, &float16_params);
 866 }
 867
 868 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 p, float_status *s)
 869 {
 870     return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
 871 }
 872
 873 static FloatParts64 float32_unpack_canonical(float32 f, float_status *s)
 874 {
 875     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 876 }
 877
 878 static float32 float32_round_pack_canonical(FloatParts64 p, float_status *s)
 879 {
 880     return float32_pack_raw(round_canonical(p, s, &float32_params));
 881 }
 882
 883 static FloatParts64 float64_unpack_canonical(float64 f, float_status *s)
 884 {
 885     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 886 }
 887
 888 static float64 float64_round_pack_canonical(FloatParts64 p, float_status *s)
 889 {
 890     return float64_pack_raw(round_canonical(p, s, &float64_params));
 891 }
 892
 893 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
 894 {
 895     g_assert(is_nan(a.cls));
 896     if (is_snan(a.cls)) {
 897         float_raise(float_flag_invalid, s);
 898         if (!s->default_nan_mode) {
 899             return parts_silence_nan(a, s);
 900         }
 901     } else if (!s->default_nan_mode) {
 902         return a;
 903     }
 904     return parts_default_nan(s);
 905 }
 906
 907 static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
 908 {
 909     if (is_snan(a.cls) || is_snan(b.cls)) {
 910         float_raise(float_flag_invalid, s);
 911     }
 912
 913     if (s->default_nan_mode) {
 914         return parts_default_nan(s);
 915     } else {
 916         if (pickNaN(a.cls, b.cls,
 917                     a.frac > b.frac ||
 918                     (a.frac == b.frac && a.sign < b.sign), s)) {
 919             a = b;
 920         }
 921         if (is_snan(a.cls)) {
 922             return parts_silence_nan(a, s);
 923         }
 924     }
 925     return a;
 926 }
 927
 928 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
 929                                   bool inf_zero, float_status *s)
 930 {
 931     int which;
 932
 933     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 934         float_raise(float_flag_invalid, s);
 935     }
 936
 937     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 938
 939     if (s->default_nan_mode) {
 940         /* Note that this check is after pickNaNMulAdd so that function
 941          * has an opportunity to set the Invalid flag.
 942          */
 943         which = 3;
 944     }
 945
 946     switch (which) {
 947     case 0:
 948         break;
 949     case 1:
 950         a = b;
 951         break;
 952     case 2:
 953         a = c;
 954         break;
 955     case 3:
 956         return parts_default_nan(s);
 957     default:
 958         g_assert_not_reached();
 959     }
 960
 961     if (is_snan(a.cls)) {
 962         return parts_silence_nan(a, s);
 963     }
 964     return a;
 965 }
 966
 967 /*
 968  * Returns the result of adding or subtracting the values of the
 969  * floating-point values `a' and `b'. The operation is performed
 970  * according to the IEC/IEEE Standard for Binary Floating-Point
 971  * Arithmetic.
 972  */
 973
 974 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
 975                                 float_status *s)
 976 {
 977     bool a_sign = a.sign;
 978     bool b_sign = b.sign ^ subtract;
 979
 980     if (a_sign != b_sign) {
 981         /* Subtraction */
 982
 983         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 984             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 985                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 986                 a.frac = a.frac - b.frac;
 987             } else {
 988                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 989                 a.frac = b.frac - a.frac;
 990                 a.exp = b.exp;
 991                 a_sign ^= 1;
 992             }
 993
 994             if (a.frac == 0) {
 995                 a.cls = float_class_zero;
 996                 a.sign = s->float_rounding_mode == float_round_down;
 997             } else {
 998                 int shift = clz64(a.frac);
 999                 a.frac = a.frac << shift;
1000                 a.exp = a.exp - shift;
1001                 a.sign = a_sign;
1002             }
1003             return a;
1004         }
1005         if (is_nan(a.cls) || is_nan(b.cls)) {
1006             return pick_nan(a, b, s);
1007         }
1008         if (a.cls == float_class_inf) {
1009             if (b.cls == float_class_inf) {
1010                 float_raise(float_flag_invalid, s);
1011                 return parts_default_nan(s);
1012             }
1013             return a;
1014         }
1015         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1016             a.sign = s->float_rounding_mode == float_round_down;
1017             return a;
1018         }
1019         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1020             b.sign = a_sign ^ 1;
1021             return b;
1022         }
1023         if (b.cls == float_class_zero) {
1024             return a;
1025         }
1026     } else {
1027         /* Addition */
1028         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1029             if (a.exp > b.exp) {
1030                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1031             } else if (a.exp < b.exp) {
1032                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1033                 a.exp = b.exp;
1034             }
1035
1036             if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1037                 shift64RightJamming(a.frac, 1, &a.frac);
1038                 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1039                 a.exp += 1;
1040             }
1041             return a;
1042         }
1043         if (is_nan(a.cls) || is_nan(b.cls)) {
1044             return pick_nan(a, b, s);
1045         }
1046         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1047             return a;
1048         }
1049         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1050             b.sign = b_sign;
1051             return b;
1052         }
1053     }
1054     g_assert_not_reached();
1055 }
1056
1057 /*
1058  * Returns the result of adding or subtracting the floating-point
1059  * values `a' and `b'. The operation is performed according to the
1060  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1061  */
1062
1063 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1064 {
1065     FloatParts64 pa = float16_unpack_canonical(a, status);
1066     FloatParts64 pb = float16_unpack_canonical(b, status);
1067     FloatParts64 pr = addsub_floats(pa, pb, false, status);
1068
1069     return float16_round_pack_canonical(pr, status);
1070 }
1071
1072 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1073 {
1074     FloatParts64 pa = float16_unpack_canonical(a, status);
1075     FloatParts64 pb = float16_unpack_canonical(b, status);
1076     FloatParts64 pr = addsub_floats(pa, pb, true, status);
1077
1078     return float16_round_pack_canonical(pr, status);
1079 }
1080
1081 static float32 QEMU_SOFTFLOAT_ATTR
1082 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1083 {
1084     FloatParts64 pa = float32_unpack_canonical(a, status);
1085     FloatParts64 pb = float32_unpack_canonical(b, status);
1086     FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1087
1088     return float32_round_pack_canonical(pr, status);
1089 }
1090
1091 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1092 {
1093     return soft_f32_addsub(a, b, false, status);
1094 }
1095
1096 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1097 {
1098     return soft_f32_addsub(a, b, true, status);
1099 }
1100
1101 static float64 QEMU_SOFTFLOAT_ATTR
1102 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1103 {
1104     FloatParts64 pa = float64_unpack_canonical(a, status);
1105     FloatParts64 pb = float64_unpack_canonical(b, status);
1106     FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1107
1108     return float64_round_pack_canonical(pr, status);
1109 }
1110
1111 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1112 {
1113     return soft_f64_addsub(a, b, false, status);
1114 }
1115
1116 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1117 {
1118     return soft_f64_addsub(a, b, true, status);
1119 }
1120
1121 static float hard_f32_add(float a, float b)
1122 {
1123     return a + b;
1124 }
1125
1126 static float hard_f32_sub(float a, float b)
1127 {
1128     return a - b;
1129 }
1130
1131 static double hard_f64_add(double a, double b)
1132 {
1133     return a + b;
1134 }
1135
1136 static double hard_f64_sub(double a, double b)
1137 {
1138     return a - b;
1139 }
1140
1141 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1142 {
1143     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1144         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1145     }
1146     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1147 }
1148
1149 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1150 {
1151     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1152         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1153     } else {
1154         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1155     }
1156 }
1157
1158 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1159                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1160 {
1161     return float32_gen2(a, b, s, hard, soft,
1162                         f32_is_zon2, f32_addsubmul_post);
1163 }
1164
1165 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1166                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1167 {
1168     return float64_gen2(a, b, s, hard, soft,
1169                         f64_is_zon2, f64_addsubmul_post);
1170 }
1171
1172 float32 QEMU_FLATTEN
1173 float32_add(float32 a, float32 b, float_status *s)
1174 {
1175     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1176 }
1177
1178 float32 QEMU_FLATTEN
1179 float32_sub(float32 a, float32 b, float_status *s)
1180 {
1181     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1182 }
1183
1184 float64 QEMU_FLATTEN
1185 float64_add(float64 a, float64 b, float_status *s)
1186 {
1187     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1188 }
1189
1190 float64 QEMU_FLATTEN
1191 float64_sub(float64 a, float64 b, float_status *s)
1192 {
1193     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1194 }
1195
1196 /*
1197  * Returns the result of adding or subtracting the bfloat16
1198  * values `a' and `b'.
1199  */
1200 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1201 {
1202     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1203     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1204     FloatParts64 pr = addsub_floats(pa, pb, false, status);
1205
1206     return bfloat16_round_pack_canonical(pr, status);
1207 }
1208
1209 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1210 {
1211     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1212     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1213     FloatParts64 pr = addsub_floats(pa, pb, true, status);
1214
1215     return bfloat16_round_pack_canonical(pr, status);
1216 }
1217
1218 /*
1219  * Returns the result of multiplying the floating-point values `a' and
1220  * `b'. The operation is performed according to the IEC/IEEE Standard
1221  * for Binary Floating-Point Arithmetic.
1222  */
1223
1224 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1225 {
1226     bool sign = a.sign ^ b.sign;
1227
1228     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1229         uint64_t hi, lo;
1230         int exp = a.exp + b.exp;
1231
1232         mul64To128(a.frac, b.frac, &hi, &lo);
1233         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1234             exp += 1;
1235         } else {
1236             hi <<= 1;
1237         }
1238         hi |= (lo != 0);
1239
1240         /* Re-use a */
1241         a.exp = exp;
1242         a.sign = sign;
1243         a.frac = hi;
1244         return a;
1245     }
1246     /* handle all the NaN cases */
1247     if (is_nan(a.cls) || is_nan(b.cls)) {
1248         return pick_nan(a, b, s);
1249     }
1250     /* Inf * Zero == NaN */
1251     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1252         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1253         float_raise(float_flag_invalid, s);
1254         return parts_default_nan(s);
1255     }
1256     /* Multiply by 0 or Inf */
1257     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1258         a.sign = sign;
1259         return a;
1260     }
1261     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1262         b.sign = sign;
1263         return b;
1264     }
1265     g_assert_not_reached();
1266 }
1267
1268 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1269 {
1270     FloatParts64 pa = float16_unpack_canonical(a, status);
1271     FloatParts64 pb = float16_unpack_canonical(b, status);
1272     FloatParts64 pr = mul_floats(pa, pb, status);
1273
1274     return float16_round_pack_canonical(pr, status);
1275 }
1276
1277 static float32 QEMU_SOFTFLOAT_ATTR
1278 soft_f32_mul(float32 a, float32 b, float_status *status)
1279 {
1280     FloatParts64 pa = float32_unpack_canonical(a, status);
1281     FloatParts64 pb = float32_unpack_canonical(b, status);
1282     FloatParts64 pr = mul_floats(pa, pb, status);
1283
1284     return float32_round_pack_canonical(pr, status);
1285 }
1286
1287 static float64 QEMU_SOFTFLOAT_ATTR
1288 soft_f64_mul(float64 a, float64 b, float_status *status)
1289 {
1290     FloatParts64 pa = float64_unpack_canonical(a, status);
1291     FloatParts64 pb = float64_unpack_canonical(b, status);
1292     FloatParts64 pr = mul_floats(pa, pb, status);
1293
1294     return float64_round_pack_canonical(pr, status);
1295 }
1296
1297 static float hard_f32_mul(float a, float b)
1298 {
1299     return a * b;
1300 }
1301
1302 static double hard_f64_mul(double a, double b)
1303 {
1304     return a * b;
1305 }
1306
1307 float32 QEMU_FLATTEN
1308 float32_mul(float32 a, float32 b, float_status *s)
1309 {
1310     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1311                         f32_is_zon2, f32_addsubmul_post);
1312 }
1313
1314 float64 QEMU_FLATTEN
1315 float64_mul(float64 a, float64 b, float_status *s)
1316 {
1317     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1318                         f64_is_zon2, f64_addsubmul_post);
1319 }
1320
1321 /*
1322  * Returns the result of multiplying the bfloat16
1323  * values `a' and `b'.
1324  */
1325
1326 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1327 {
1328     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1329     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1330     FloatParts64 pr = mul_floats(pa, pb, status);
1331
1332     return bfloat16_round_pack_canonical(pr, status);
1333 }
1334
1335 /*
1336  * Returns the result of multiplying the floating-point values `a' and
1337  * `b' then adding 'c', with no intermediate rounding step after the
1338  * multiplication. The operation is performed according to the
1339  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1340  * The flags argument allows the caller to select negation of the
1341  * addend, the intermediate product, or the final result. (The
1342  * difference between this and having the caller do a separate
1343  * negation is that negating externally will flip the sign bit on
1344  * NaNs.)
1345  */
1346
1347 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
1348                                 int flags, float_status *s)
1349 {
1350     bool inf_zero, p_sign;
1351     bool sign_flip = flags & float_muladd_negate_result;
1352     FloatClass p_class;
1353     uint64_t hi, lo;
1354     int p_exp;
1355     int ab_mask, abc_mask;
1356
1357     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
1358     abc_mask = float_cmask(c.cls) | ab_mask;
1359     inf_zero = ab_mask == float_cmask_infzero;
1360
1361     /* It is implementation-defined whether the cases of (0,inf,qnan)
1362      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1363      * they return if they do), so we have to hand this information
1364      * off to the target-specific pick-a-NaN routine.
1365      */
1366     if (unlikely(abc_mask & float_cmask_anynan)) {
1367         return pick_nan_muladd(a, b, c, inf_zero, s);
1368     }
1369
1370     if (inf_zero) {
1371         float_raise(float_flag_invalid, s);
1372         return parts_default_nan(s);
1373     }
1374
1375     if (flags & float_muladd_negate_c) {
1376         c.sign ^= 1;
1377     }
1378
1379     p_sign = a.sign ^ b.sign;
1380
1381     if (flags & float_muladd_negate_product) {
1382         p_sign ^= 1;
1383     }
1384
1385     if (ab_mask & float_cmask_inf) {
1386         p_class = float_class_inf;
1387     } else if (ab_mask & float_cmask_zero) {
1388         p_class = float_class_zero;
1389     } else {
1390         p_class = float_class_normal;
1391     }
1392
1393     if (c.cls == float_class_inf) {
1394         if (p_class == float_class_inf && p_sign != c.sign) {
1395             float_raise(float_flag_invalid, s);
1396             return parts_default_nan(s);
1397         } else {
1398             c.sign ^= sign_flip;
1399             return c;
1400         }
1401     }
1402
1403     if (p_class == float_class_inf) {
1404         a.cls = float_class_inf;
1405         a.sign = p_sign ^ sign_flip;
1406         return a;
1407     }
1408
1409     if (p_class == float_class_zero) {
1410         if (c.cls == float_class_zero) {
1411             if (p_sign != c.sign) {
1412                 p_sign = s->float_rounding_mode == float_round_down;
1413             }
1414             c.sign = p_sign;
1415         } else if (flags & float_muladd_halve_result) {
1416             c.exp -= 1;
1417         }
1418         c.sign ^= sign_flip;
1419         return c;
1420     }
1421
1422     /* a & b should be normals now... */
1423     assert(a.cls == float_class_normal &&
1424            b.cls == float_class_normal);
1425
1426     p_exp = a.exp + b.exp;
1427
1428     mul64To128(a.frac, b.frac, &hi, &lo);
1429
1430     /* Renormalize to the msb. */
1431     if (hi & DECOMPOSED_IMPLICIT_BIT) {
1432         p_exp += 1;
1433     } else {
1434         shortShift128Left(hi, lo, 1, &hi, &lo);
1435     }
1436
1437     /* + add/sub */
1438     if (c.cls != float_class_zero) {
1439         int exp_diff = p_exp - c.exp;
1440         if (p_sign == c.sign) {
1441             /* Addition */
1442             if (exp_diff <= 0) {
1443                 shift64RightJamming(hi, -exp_diff, &hi);
1444                 p_exp = c.exp;
1445                 if (uadd64_overflow(hi, c.frac, &hi)) {
1446                     shift64RightJamming(hi, 1, &hi);
1447                     hi |= DECOMPOSED_IMPLICIT_BIT;
1448                     p_exp += 1;
1449                 }
1450             } else {
1451                 uint64_t c_hi, c_lo, over;
1452                 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
1453                 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
1454                 if (over) {
1455                     shift64RightJamming(hi, 1, &hi);
1456                     hi |= DECOMPOSED_IMPLICIT_BIT;
1457                     p_exp += 1;
1458                 }
1459             }
1460         } else {
1461             /* Subtraction */
1462             uint64_t c_hi = c.frac, c_lo = 0;
1463
1464             if (exp_diff <= 0) {
1465                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1466                 if (exp_diff == 0
1467                     &&
1468                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1469                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1470                 } else {
1471                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1472                     p_sign ^= 1;
1473                     p_exp = c.exp;
1474                 }
1475             } else {
1476                 shift128RightJamming(c_hi, c_lo,
1477                                      exp_diff,
1478                                      &c_hi, &c_lo);
1479                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1480             }
1481
1482             if (hi == 0 && lo == 0) {
1483                 a.cls = float_class_zero;
1484                 a.sign = s->float_rounding_mode == float_round_down;
1485                 a.sign ^= sign_flip;
1486                 return a;
1487             } else {
1488                 int shift;
1489                 if (hi != 0) {
1490                     shift = clz64(hi);
1491                 } else {
1492                     shift = clz64(lo) + 64;
1493                 }
1494                 /* Normalizing to a binary point of 124 is the
1495                    correct adjust for the exponent.  However since we're
1496                    shifting, we might as well put the binary point back
1497                    at 63 where we really want it.  Therefore shift as
1498                    if we're leaving 1 bit at the top of the word, but
1499                    adjust the exponent as if we're leaving 3 bits.  */
1500                 shift128Left(hi, lo, shift, &hi, &lo);
1501                 p_exp -= shift;
1502             }
1503         }
1504     }
1505     hi |= (lo != 0);
1506
1507     if (flags & float_muladd_halve_result) {
1508         p_exp -= 1;
1509     }
1510
1511     /* finally prepare our result */
1512     a.cls = float_class_normal;
1513     a.sign = p_sign ^ sign_flip;
1514     a.exp = p_exp;
1515     a.frac = hi;
1516
1517     return a;
1518 }
1519
1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521                                                 int flags, float_status *status)
1522 {
1523     FloatParts64 pa = float16_unpack_canonical(a, status);
1524     FloatParts64 pb = float16_unpack_canonical(b, status);
1525     FloatParts64 pc = float16_unpack_canonical(c, status);
1526     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1527
1528     return float16_round_pack_canonical(pr, status);
1529 }
1530
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533                 float_status *status)
1534 {
1535     FloatParts64 pa = float32_unpack_canonical(a, status);
1536     FloatParts64 pb = float32_unpack_canonical(b, status);
1537     FloatParts64 pc = float32_unpack_canonical(c, status);
1538     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1539
1540     return float32_round_pack_canonical(pr, status);
1541 }
1542
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545                 float_status *status)
1546 {
1547     FloatParts64 pa = float64_unpack_canonical(a, status);
1548     FloatParts64 pb = float64_unpack_canonical(b, status);
1549     FloatParts64 pc = float64_unpack_canonical(c, status);
1550     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1551
1552     return float64_round_pack_canonical(pr, status);
1553 }
1554
1555 static bool force_soft_fma;
1556
1557 float32 QEMU_FLATTEN
1558 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1559 {
1560     union_float32 ua, ub, uc, ur;
1561
1562     ua.s = xa;
1563     ub.s = xb;
1564     uc.s = xc;
1565
1566     if (unlikely(!can_use_fpu(s))) {
1567         goto soft;
1568     }
1569     if (unlikely(flags & float_muladd_halve_result)) {
1570         goto soft;
1571     }
1572
1573     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1574     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1575         goto soft;
1576     }
1577
1578     if (unlikely(force_soft_fma)) {
1579         goto soft;
1580     }
1581
1582     /*
1583      * When (a || b) == 0, there's no need to check for under/over flow,
1584      * since we know the addend is (normal || 0) and the product is 0.
1585      */
1586     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1587         union_float32 up;
1588         bool prod_sign;
1589
1590         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1591         prod_sign ^= !!(flags & float_muladd_negate_product);
1592         up.s = float32_set_sign(float32_zero, prod_sign);
1593
1594         if (flags & float_muladd_negate_c) {
1595             uc.h = -uc.h;
1596         }
1597         ur.h = up.h + uc.h;
1598     } else {
1599         union_float32 ua_orig = ua;
1600         union_float32 uc_orig = uc;
1601
1602         if (flags & float_muladd_negate_product) {
1603             ua.h = -ua.h;
1604         }
1605         if (flags & float_muladd_negate_c) {
1606             uc.h = -uc.h;
1607         }
1608
1609         ur.h = fmaf(ua.h, ub.h, uc.h);
1610
1611         if (unlikely(f32_is_inf(ur))) {
1612             float_raise(float_flag_overflow, s);
1613         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1614             ua = ua_orig;
1615             uc = uc_orig;
1616             goto soft;
1617         }
1618     }
1619     if (flags & float_muladd_negate_result) {
1620         return float32_chs(ur.s);
1621     }
1622     return ur.s;
1623
1624  soft:
1625     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1626 }
1627
1628 float64 QEMU_FLATTEN
1629 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1630 {
1631     union_float64 ua, ub, uc, ur;
1632
1633     ua.s = xa;
1634     ub.s = xb;
1635     uc.s = xc;
1636
1637     if (unlikely(!can_use_fpu(s))) {
1638         goto soft;
1639     }
1640     if (unlikely(flags & float_muladd_halve_result)) {
1641         goto soft;
1642     }
1643
1644     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1645     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1646         goto soft;
1647     }
1648
1649     if (unlikely(force_soft_fma)) {
1650         goto soft;
1651     }
1652
1653     /*
1654      * When (a || b) == 0, there's no need to check for under/over flow,
1655      * since we know the addend is (normal || 0) and the product is 0.
1656      */
1657     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1658         union_float64 up;
1659         bool prod_sign;
1660
1661         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1662         prod_sign ^= !!(flags & float_muladd_negate_product);
1663         up.s = float64_set_sign(float64_zero, prod_sign);
1664
1665         if (flags & float_muladd_negate_c) {
1666             uc.h = -uc.h;
1667         }
1668         ur.h = up.h + uc.h;
1669     } else {
1670         union_float64 ua_orig = ua;
1671         union_float64 uc_orig = uc;
1672
1673         if (flags & float_muladd_negate_product) {
1674             ua.h = -ua.h;
1675         }
1676         if (flags & float_muladd_negate_c) {
1677             uc.h = -uc.h;
1678         }
1679
1680         ur.h = fma(ua.h, ub.h, uc.h);
1681
1682         if (unlikely(f64_is_inf(ur))) {
1683             float_raise(float_flag_overflow, s);
1684         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1685             ua = ua_orig;
1686             uc = uc_orig;
1687             goto soft;
1688         }
1689     }
1690     if (flags & float_muladd_negate_result) {
1691         return float64_chs(ur.s);
1692     }
1693     return ur.s;
1694
1695  soft:
1696     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1697 }
1698
1699 /*
1700  * Returns the result of multiplying the bfloat16 values `a'
1701  * and `b' then adding 'c', with no intermediate rounding step after the
1702  * multiplication.
1703  */
1704
1705 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1706                                       int flags, float_status *status)
1707 {
1708     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1709     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1710     FloatParts64 pc = bfloat16_unpack_canonical(c, status);
1711     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1712
1713     return bfloat16_round_pack_canonical(pr, status);
1714 }
1715
1716 /*
1717  * Returns the result of dividing the floating-point value `a' by the
1718  * corresponding value `b'. The operation is performed according to
1719  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1720  */
1721
1722 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1723 {
1724     bool sign = a.sign ^ b.sign;
1725
1726     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1727         uint64_t n0, n1, q, r;
1728         int exp = a.exp - b.exp;
1729
1730         /*
1731          * We want a 2*N / N-bit division to produce exactly an N-bit
1732          * result, so that we do not lose any precision and so that we
1733          * do not have to renormalize afterward.  If A.frac < B.frac,
1734          * then division would produce an (N-1)-bit result; shift A left
1735          * by one to produce the an N-bit result, and decrement the
1736          * exponent to match.
1737          *
1738          * The udiv_qrnnd algorithm that we're using requires normalization,
1739          * i.e. the msb of the denominator must be set, which is already true.
1740          */
1741         if (a.frac < b.frac) {
1742             exp -= 1;
1743             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1744         } else {
1745             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1746         }
1747         q = udiv_qrnnd(&r, n1, n0, b.frac);
1748
1749         /* Set lsb if there is a remainder, to set inexact. */
1750         a.frac = q | (r != 0);
1751         a.sign = sign;
1752         a.exp = exp;
1753         return a;
1754     }
1755     /* handle all the NaN cases */
1756     if (is_nan(a.cls) || is_nan(b.cls)) {
1757         return pick_nan(a, b, s);
1758     }
1759     /* 0/0 or Inf/Inf */
1760     if (a.cls == b.cls
1761         &&
1762         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1763         float_raise(float_flag_invalid, s);
1764         return parts_default_nan(s);
1765     }
1766     /* Inf / x or 0 / x */
1767     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1768         a.sign = sign;
1769         return a;
1770     }
1771     /* Div 0 => Inf */
1772     if (b.cls == float_class_zero) {
1773         float_raise(float_flag_divbyzero, s);
1774         a.cls = float_class_inf;
1775         a.sign = sign;
1776         return a;
1777     }
1778     /* Div by Inf */
1779     if (b.cls == float_class_inf) {
1780         a.cls = float_class_zero;
1781         a.sign = sign;
1782         return a;
1783     }
1784     g_assert_not_reached();
1785 }
1786
1787 float16 float16_div(float16 a, float16 b, float_status *status)
1788 {
1789     FloatParts64 pa = float16_unpack_canonical(a, status);
1790     FloatParts64 pb = float16_unpack_canonical(b, status);
1791     FloatParts64 pr = div_floats(pa, pb, status);
1792
1793     return float16_round_pack_canonical(pr, status);
1794 }
1795
1796 static float32 QEMU_SOFTFLOAT_ATTR
1797 soft_f32_div(float32 a, float32 b, float_status *status)
1798 {
1799     FloatParts64 pa = float32_unpack_canonical(a, status);
1800     FloatParts64 pb = float32_unpack_canonical(b, status);
1801     FloatParts64 pr = div_floats(pa, pb, status);
1802
1803     return float32_round_pack_canonical(pr, status);
1804 }
1805
1806 static float64 QEMU_SOFTFLOAT_ATTR
1807 soft_f64_div(float64 a, float64 b, float_status *status)
1808 {
1809     FloatParts64 pa = float64_unpack_canonical(a, status);
1810     FloatParts64 pb = float64_unpack_canonical(b, status);
1811     FloatParts64 pr = div_floats(pa, pb, status);
1812
1813     return float64_round_pack_canonical(pr, status);
1814 }
1815
1816 static float hard_f32_div(float a, float b)
1817 {
1818     return a / b;
1819 }
1820
1821 static double hard_f64_div(double a, double b)
1822 {
1823     return a / b;
1824 }
1825
1826 static bool f32_div_pre(union_float32 a, union_float32 b)
1827 {
1828     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1829         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1830                fpclassify(b.h) == FP_NORMAL;
1831     }
1832     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1833 }
1834
1835 static bool f64_div_pre(union_float64 a, union_float64 b)
1836 {
1837     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1838         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1839                fpclassify(b.h) == FP_NORMAL;
1840     }
1841     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1842 }
1843
1844 static bool f32_div_post(union_float32 a, union_float32 b)
1845 {
1846     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1847         return fpclassify(a.h) != FP_ZERO;
1848     }
1849     return !float32_is_zero(a.s);
1850 }
1851
1852 static bool f64_div_post(union_float64 a, union_float64 b)
1853 {
1854     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1855         return fpclassify(a.h) != FP_ZERO;
1856     }
1857     return !float64_is_zero(a.s);
1858 }
1859
1860 float32 QEMU_FLATTEN
1861 float32_div(float32 a, float32 b, float_status *s)
1862 {
1863     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1864                         f32_div_pre, f32_div_post);
1865 }
1866
1867 float64 QEMU_FLATTEN
1868 float64_div(float64 a, float64 b, float_status *s)
1869 {
1870     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1871                         f64_div_pre, f64_div_post);
1872 }
1873
1874 /*
1875  * Returns the result of dividing the bfloat16
1876  * value `a' by the corresponding value `b'.
1877  */
1878
1879 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1880 {
1881     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1882     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1883     FloatParts64 pr = div_floats(pa, pb, status);
1884
1885     return bfloat16_round_pack_canonical(pr, status);
1886 }
1887
1888 /*
1889  * Float to Float conversions
1890  *
1891  * Returns the result of converting one float format to another. The
1892  * conversion is performed according to the IEC/IEEE Standard for
1893  * Binary Floating-Point Arithmetic.
1894  *
1895  * The float_to_float helper only needs to take care of raising
1896  * invalid exceptions and handling the conversion on NaNs.
1897  */
1898
1899 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
1900                                  float_status *s)
1901 {
1902     if (dstf->arm_althp) {
1903         switch (a.cls) {
1904         case float_class_qnan:
1905         case float_class_snan:
1906             /* There is no NaN in the destination format.  Raise Invalid
1907              * and return a zero with the sign of the input NaN.
1908              */
1909             float_raise(float_flag_invalid, s);
1910             a.cls = float_class_zero;
1911             a.frac = 0;
1912             a.exp = 0;
1913             break;
1914
1915         case float_class_inf:
1916             /* There is no Inf in the destination format.  Raise Invalid
1917              * and return the maximum normal with the correct sign.
1918              */
1919             float_raise(float_flag_invalid, s);
1920             a.cls = float_class_normal;
1921             a.exp = dstf->exp_max;
1922             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1923             break;
1924
1925         default:
1926             break;
1927         }
1928     } else if (is_nan(a.cls)) {
1929         return return_nan(a, s);
1930     }
1931     return a;
1932 }
1933
1934 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1935 {
1936     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1937     FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1938     FloatParts64 pr = float_to_float(p, &float32_params, s);
1939     return float32_round_pack_canonical(pr, s);
1940 }
1941
1942 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1943 {
1944     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1945     FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1946     FloatParts64 pr = float_to_float(p, &float64_params, s);
1947     return float64_round_pack_canonical(pr, s);
1948 }
1949
1950 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1951 {
1952     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1953     FloatParts64 p = float32_unpack_canonical(a, s);
1954     FloatParts64 pr = float_to_float(p, fmt16, s);
1955     return float16a_round_pack_canonical(pr, s, fmt16);
1956 }
1957
1958 static float64 QEMU_SOFTFLOAT_ATTR
1959 soft_float32_to_float64(float32 a, float_status *s)
1960 {
1961     FloatParts64 p = float32_unpack_canonical(a, s);
1962     FloatParts64 pr = float_to_float(p, &float64_params, s);
1963     return float64_round_pack_canonical(pr, s);
1964 }
1965
1966 float64 float32_to_float64(float32 a, float_status *s)
1967 {
1968     if (likely(float32_is_normal(a))) {
1969         /* Widening conversion can never produce inexact results.  */
1970         union_float32 uf;
1971         union_float64 ud;
1972         uf.s = a;
1973         ud.h = uf.h;
1974         return ud.s;
1975     } else if (float32_is_zero(a)) {
1976         return float64_set_sign(float64_zero, float32_is_neg(a));
1977     } else {
1978         return soft_float32_to_float64(a, s);
1979     }
1980 }
1981
1982 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1983 {
1984     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1985     FloatParts64 p = float64_unpack_canonical(a, s);
1986     FloatParts64 pr = float_to_float(p, fmt16, s);
1987     return float16a_round_pack_canonical(pr, s, fmt16);
1988 }
1989
1990 float32 float64_to_float32(float64 a, float_status *s)
1991 {
1992     FloatParts64 p = float64_unpack_canonical(a, s);
1993     FloatParts64 pr = float_to_float(p, &float32_params, s);
1994     return float32_round_pack_canonical(pr, s);
1995 }
1996
1997 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
1998 {
1999     FloatParts64 p = bfloat16_unpack_canonical(a, s);
2000     FloatParts64 pr = float_to_float(p, &float32_params, s);
2001     return float32_round_pack_canonical(pr, s);
2002 }
2003
2004 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2005 {
2006     FloatParts64 p = bfloat16_unpack_canonical(a, s);
2007     FloatParts64 pr = float_to_float(p, &float64_params, s);
2008     return float64_round_pack_canonical(pr, s);
2009 }
2010
2011 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2012 {
2013     FloatParts64 p = float32_unpack_canonical(a, s);
2014     FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2015     return bfloat16_round_pack_canonical(pr, s);
2016 }
2017
2018 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2019 {
2020     FloatParts64 p = float64_unpack_canonical(a, s);
2021     FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2022     return bfloat16_round_pack_canonical(pr, s);
2023 }
2024
2025 /*
2026  * Rounds the floating-point value `a' to an integer, and returns the
2027  * result as a floating-point value. The operation is performed
2028  * according to the IEC/IEEE Standard for Binary Floating-Point
2029  * Arithmetic.
2030  */
2031
2032 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2033                                int scale, float_status *s)
2034 {
2035     switch (a.cls) {
2036     case float_class_qnan:
2037     case float_class_snan:
2038         return return_nan(a, s);
2039
2040     case float_class_zero:
2041     case float_class_inf:
2042         /* already "integral" */
2043         break;
2044
2045     case float_class_normal:
2046         scale = MIN(MAX(scale, -0x10000), 0x10000);
2047         a.exp += scale;
2048
2049         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2050             /* already integral */
2051             break;
2052         }
2053         if (a.exp < 0) {
2054             bool one;
2055             /* all fractional */
2056             float_raise(float_flag_inexact, s);
2057             switch (rmode) {
2058             case float_round_nearest_even:
2059                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2060                 break;
2061             case float_round_ties_away:
2062                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2063                 break;
2064             case float_round_to_zero:
2065                 one = false;
2066                 break;
2067             case float_round_up:
2068                 one = !a.sign;
2069                 break;
2070             case float_round_down:
2071                 one = a.sign;
2072                 break;
2073             case float_round_to_odd:
2074                 one = true;
2075                 break;
2076             default:
2077                 g_assert_not_reached();
2078             }
2079
2080             if (one) {
2081                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2082                 a.exp = 0;
2083             } else {
2084                 a.cls = float_class_zero;
2085             }
2086         } else {
2087             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2088             uint64_t frac_lsbm1 = frac_lsb >> 1;
2089             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2090             uint64_t rnd_mask = rnd_even_mask >> 1;
2091             uint64_t inc;
2092
2093             switch (rmode) {
2094             case float_round_nearest_even:
2095                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2096                 break;
2097             case float_round_ties_away:
2098                 inc = frac_lsbm1;
2099                 break;
2100             case float_round_to_zero:
2101                 inc = 0;
2102                 break;
2103             case float_round_up:
2104                 inc = a.sign ? 0 : rnd_mask;
2105                 break;
2106             case float_round_down:
2107                 inc = a.sign ? rnd_mask : 0;
2108                 break;
2109             case float_round_to_odd:
2110                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2111                 break;
2112             default:
2113                 g_assert_not_reached();
2114             }
2115
2116             if (a.frac & rnd_mask) {
2117                 float_raise(float_flag_inexact, s);
2118                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2119                     a.frac >>= 1;
2120                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2121                     a.exp++;
2122                 }
2123                 a.frac &= ~rnd_mask;
2124             }
2125         }
2126         break;
2127     default:
2128         g_assert_not_reached();
2129     }
2130     return a;
2131 }
2132
2133 float16 float16_round_to_int(float16 a, float_status *s)
2134 {
2135     FloatParts64 pa = float16_unpack_canonical(a, s);
2136     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2137     return float16_round_pack_canonical(pr, s);
2138 }
2139
2140 float32 float32_round_to_int(float32 a, float_status *s)
2141 {
2142     FloatParts64 pa = float32_unpack_canonical(a, s);
2143     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2144     return float32_round_pack_canonical(pr, s);
2145 }
2146
2147 float64 float64_round_to_int(float64 a, float_status *s)
2148 {
2149     FloatParts64 pa = float64_unpack_canonical(a, s);
2150     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2151     return float64_round_pack_canonical(pr, s);
2152 }
2153
2154 /*
2155  * Rounds the bfloat16 value `a' to an integer, and returns the
2156  * result as a bfloat16 value.
2157  */
2158
2159 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2160 {
2161     FloatParts64 pa = bfloat16_unpack_canonical(a, s);
2162     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2163     return bfloat16_round_pack_canonical(pr, s);
2164 }
2165
2166 /*
2167  * Returns the result of converting the floating-point value `a' to
2168  * the two's complement integer format. The conversion is performed
2169  * according to the IEC/IEEE Standard for Binary Floating-Point
2170  * Arithmetic---which means in particular that the conversion is
2171  * rounded according to the current rounding mode. If `a' is a NaN,
2172  * the largest positive integer is returned. Otherwise, if the
2173  * conversion overflows, the largest integer with the same sign as `a'
2174  * is returned.
2175 */
2176
2177 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2178                                      int scale, int64_t min, int64_t max,
2179                                      float_status *s)
2180 {
2181     uint64_t r;
2182     int orig_flags = get_float_exception_flags(s);
2183     FloatParts64 p = round_to_int(in, rmode, scale, s);
2184
2185     switch (p.cls) {
2186     case float_class_snan:
2187     case float_class_qnan:
2188         s->float_exception_flags = orig_flags | float_flag_invalid;
2189         return max;
2190     case float_class_inf:
2191         s->float_exception_flags = orig_flags | float_flag_invalid;
2192         return p.sign ? min : max;
2193     case float_class_zero:
2194         return 0;
2195     case float_class_normal:
2196         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2197             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2198         } else {
2199             r = UINT64_MAX;
2200         }
2201         if (p.sign) {
2202             if (r <= -(uint64_t) min) {
2203                 return -r;
2204             } else {
2205                 s->float_exception_flags = orig_flags | float_flag_invalid;
2206                 return min;
2207             }
2208         } else {
2209             if (r <= max) {
2210                 return r;
2211             } else {
2212                 s->float_exception_flags = orig_flags | float_flag_invalid;
2213                 return max;
2214             }
2215         }
2216     default:
2217         g_assert_not_reached();
2218     }
2219 }
2220
2221 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2222                               float_status *s)
2223 {
2224     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2225                                  rmode, scale, INT8_MIN, INT8_MAX, s);
2226 }
2227
2228 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2229                                 float_status *s)
2230 {
2231     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2232                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2233 }
2234
2235 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2236                                 float_status *s)
2237 {
2238     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2239                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2240 }
2241
2242 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2243                                 float_status *s)
2244 {
2245     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2246                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2247 }
2248
2249 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2250                                 float_status *s)
2251 {
2252     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2253                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2254 }
2255
2256 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2257                                 float_status *s)
2258 {
2259     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2260                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2261 }
2262
2263 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2264                                 float_status *s)
2265 {
2266     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2267                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2268 }
2269
2270 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2271                                 float_status *s)
2272 {
2273     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2274                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2275 }
2276
2277 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2278                                 float_status *s)
2279 {
2280     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2281                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2282 }
2283
2284 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2285                                 float_status *s)
2286 {
2287     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2288                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2289 }
2290
2291 int8_t float16_to_int8(float16 a, float_status *s)
2292 {
2293     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2294 }
2295
2296 int16_t float16_to_int16(float16 a, float_status *s)
2297 {
2298     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2299 }
2300
2301 int32_t float16_to_int32(float16 a, float_status *s)
2302 {
2303     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2304 }
2305
2306 int64_t float16_to_int64(float16 a, float_status *s)
2307 {
2308     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2309 }
2310
2311 int16_t float32_to_int16(float32 a, float_status *s)
2312 {
2313     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2314 }
2315
2316 int32_t float32_to_int32(float32 a, float_status *s)
2317 {
2318     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2319 }
2320
2321 int64_t float32_to_int64(float32 a, float_status *s)
2322 {
2323     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2324 }
2325
2326 int16_t float64_to_int16(float64 a, float_status *s)
2327 {
2328     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2329 }
2330
2331 int32_t float64_to_int32(float64 a, float_status *s)
2332 {
2333     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2334 }
2335
2336 int64_t float64_to_int64(float64 a, float_status *s)
2337 {
2338     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2339 }
2340
2341 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2342 {
2343     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2344 }
2345
2346 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2347 {
2348     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2349 }
2350
2351 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2352 {
2353     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2354 }
2355
2356 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2357 {
2358     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2359 }
2360
2361 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2362 {
2363     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2364 }
2365
2366 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2367 {
2368     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2369 }
2370
2371 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2372 {
2373     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2374 }
2375
2376 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2377 {
2378     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2379 }
2380
2381 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2382 {
2383     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2384 }
2385
2386 /*
2387  * Returns the result of converting the floating-point value `a' to
2388  * the two's complement integer format.
2389  */
2390
2391 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2392                                  float_status *s)
2393 {
2394     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2395                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2396 }
2397
2398 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2399                                  float_status *s)
2400 {
2401     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2402                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2403 }
2404
2405 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2406                                  float_status *s)
2407 {
2408     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2409                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2410 }
2411
2412 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2413 {
2414     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2415 }
2416
2417 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2418 {
2419     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2420 }
2421
2422 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2423 {
2424     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2425 }
2426
2427 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2428 {
2429     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2430 }
2431
2432 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2433 {
2434     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2435 }
2436
2437 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2438 {
2439     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2440 }
2441
2442 /*
2443  *  Returns the result of converting the floating-point value `a' to
2444  *  the unsigned integer format. The conversion is performed according
2445  *  to the IEC/IEEE Standard for Binary Floating-Point
2446  *  Arithmetic---which means in particular that the conversion is
2447  *  rounded according to the current rounding mode. If `a' is a NaN,
2448  *  the largest unsigned integer is returned. Otherwise, if the
2449  *  conversion overflows, the largest unsigned integer is returned. If
2450  *  the 'a' is negative, the result is rounded and zero is returned;
2451  *  values that do not round to zero will raise the inexact exception
2452  *  flag.
2453  */
2454
2455 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2456                                        int scale, uint64_t max,
2457                                        float_status *s)
2458 {
2459     int orig_flags = get_float_exception_flags(s);
2460     FloatParts64 p = round_to_int(in, rmode, scale, s);
2461     uint64_t r;
2462
2463     switch (p.cls) {
2464     case float_class_snan:
2465     case float_class_qnan:
2466         s->float_exception_flags = orig_flags | float_flag_invalid;
2467         return max;
2468     case float_class_inf:
2469         s->float_exception_flags = orig_flags | float_flag_invalid;
2470         return p.sign ? 0 : max;
2471     case float_class_zero:
2472         return 0;
2473     case float_class_normal:
2474         if (p.sign) {
2475             s->float_exception_flags = orig_flags | float_flag_invalid;
2476             return 0;
2477         }
2478
2479         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2480             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2481         } else {
2482             s->float_exception_flags = orig_flags | float_flag_invalid;
2483             return max;
2484         }
2485
2486         /* For uint64 this will never trip, but if p.exp is too large
2487          * to shift a decomposed fraction we shall have exited via the
2488          * 3rd leg above.
2489          */
2490         if (r > max) {
2491             s->float_exception_flags = orig_flags | float_flag_invalid;
2492             return max;
2493         }
2494         return r;
2495     default:
2496         g_assert_not_reached();
2497     }
2498 }
2499
2500 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2501                                 float_status *s)
2502 {
2503     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2504                                   rmode, scale, UINT8_MAX, s);
2505 }
2506
2507 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2508                                   float_status *s)
2509 {
2510     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2511                                   rmode, scale, UINT16_MAX, s);
2512 }
2513
2514 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2515                                   float_status *s)
2516 {
2517     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2518                                   rmode, scale, UINT32_MAX, s);
2519 }
2520
2521 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2522                                   float_status *s)
2523 {
2524     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2525                                   rmode, scale, UINT64_MAX, s);
2526 }
2527
2528 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2529                                   float_status *s)
2530 {
2531     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2532                                   rmode, scale, UINT16_MAX, s);
2533 }
2534
2535 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2536                                   float_status *s)
2537 {
2538     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2539                                   rmode, scale, UINT32_MAX, s);
2540 }
2541
2542 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2543                                   float_status *s)
2544 {
2545     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2546                                   rmode, scale, UINT64_MAX, s);
2547 }
2548
2549 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2550                                   float_status *s)
2551 {
2552     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2553                                   rmode, scale, UINT16_MAX, s);
2554 }
2555
2556 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2557                                   float_status *s)
2558 {
2559     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2560                                   rmode, scale, UINT32_MAX, s);
2561 }
2562
2563 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2564                                   float_status *s)
2565 {
2566     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2567                                   rmode, scale, UINT64_MAX, s);
2568 }
2569
2570 uint8_t float16_to_uint8(float16 a, float_status *s)
2571 {
2572     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2573 }
2574
2575 uint16_t float16_to_uint16(float16 a, float_status *s)
2576 {
2577     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2578 }
2579
2580 uint32_t float16_to_uint32(float16 a, float_status *s)
2581 {
2582     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2583 }
2584
2585 uint64_t float16_to_uint64(float16 a, float_status *s)
2586 {
2587     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2588 }
2589
2590 uint16_t float32_to_uint16(float32 a, float_status *s)
2591 {
2592     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2593 }
2594
2595 uint32_t float32_to_uint32(float32 a, float_status *s)
2596 {
2597     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2598 }
2599
2600 uint64_t float32_to_uint64(float32 a, float_status *s)
2601 {
2602     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2603 }
2604
2605 uint16_t float64_to_uint16(float64 a, float_status *s)
2606 {
2607     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2608 }
2609
2610 uint32_t float64_to_uint32(float64 a, float_status *s)
2611 {
2612     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2613 }
2614
2615 uint64_t float64_to_uint64(float64 a, float_status *s)
2616 {
2617     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2618 }
2619
2620 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2621 {
2622     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2623 }
2624
2625 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2626 {
2627     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2628 }
2629
2630 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2631 {
2632     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2633 }
2634
2635 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2636 {
2637     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2638 }
2639
2640 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2641 {
2642     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2643 }
2644
2645 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2646 {
2647     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2648 }
2649
2650 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2651 {
2652     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2653 }
2654
2655 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2656 {
2657     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2658 }
2659
2660 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2661 {
2662     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2663 }
2664
2665 /*
2666  *  Returns the result of converting the bfloat16 value `a' to
2667  *  the unsigned integer format.
2668  */
2669
2670 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2671                                    int scale, float_status *s)
2672 {
2673     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2674                                   rmode, scale, UINT16_MAX, s);
2675 }
2676
2677 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2678                                    int scale, float_status *s)
2679 {
2680     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2681                                   rmode, scale, UINT32_MAX, s);
2682 }
2683
2684 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2685                                    int scale, float_status *s)
2686 {
2687     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2688                                   rmode, scale, UINT64_MAX, s);
2689 }
2690
2691 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2692 {
2693     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2694 }
2695
2696 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2697 {
2698     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2699 }
2700
2701 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2702 {
2703     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2704 }
2705
2706 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2707 {
2708     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2709 }
2710
2711 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2712 {
2713     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2714 }
2715
2716 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2717 {
2718     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2719 }
2720
2721 /*
2722  * Integer to float conversions
2723  *
2724  * Returns the result of converting the two's complement integer `a'
2725  * to the floating-point format. The conversion is performed according
2726  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2727  */
2728
2729 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2730 {
2731     FloatParts64 r = { .sign = false };
2732
2733     if (a == 0) {
2734         r.cls = float_class_zero;
2735     } else {
2736         uint64_t f = a;
2737         int shift;
2738
2739         r.cls = float_class_normal;
2740         if (a < 0) {
2741             f = -f;
2742             r.sign = true;
2743         }
2744         shift = clz64(f);
2745         scale = MIN(MAX(scale, -0x10000), 0x10000);
2746
2747         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2748         r.frac = f << shift;
2749     }
2750
2751     return r;
2752 }
2753
2754 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2755 {
2756     FloatParts64 pa = int_to_float(a, scale, status);
2757     return float16_round_pack_canonical(pa, status);
2758 }
2759
2760 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2761 {
2762     return int64_to_float16_scalbn(a, scale, status);
2763 }
2764
2765 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2766 {
2767     return int64_to_float16_scalbn(a, scale, status);
2768 }
2769
2770 float16 int64_to_float16(int64_t a, float_status *status)
2771 {
2772     return int64_to_float16_scalbn(a, 0, status);
2773 }
2774
2775 float16 int32_to_float16(int32_t a, float_status *status)
2776 {
2777     return int64_to_float16_scalbn(a, 0, status);
2778 }
2779
2780 float16 int16_to_float16(int16_t a, float_status *status)
2781 {
2782     return int64_to_float16_scalbn(a, 0, status);
2783 }
2784
2785 float16 int8_to_float16(int8_t a, float_status *status)
2786 {
2787     return int64_to_float16_scalbn(a, 0, status);
2788 }
2789
2790 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2791 {
2792     FloatParts64 pa = int_to_float(a, scale, status);
2793     return float32_round_pack_canonical(pa, status);
2794 }
2795
2796 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2797 {
2798     return int64_to_float32_scalbn(a, scale, status);
2799 }
2800
2801 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2802 {
2803     return int64_to_float32_scalbn(a, scale, status);
2804 }
2805
2806 float32 int64_to_float32(int64_t a, float_status *status)
2807 {
2808     return int64_to_float32_scalbn(a, 0, status);
2809 }
2810
2811 float32 int32_to_float32(int32_t a, float_status *status)
2812 {
2813     return int64_to_float32_scalbn(a, 0, status);
2814 }
2815
2816 float32 int16_to_float32(int16_t a, float_status *status)
2817 {
2818     return int64_to_float32_scalbn(a, 0, status);
2819 }
2820
2821 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2822 {
2823     FloatParts64 pa = int_to_float(a, scale, status);
2824     return float64_round_pack_canonical(pa, status);
2825 }
2826
2827 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2828 {
2829     return int64_to_float64_scalbn(a, scale, status);
2830 }
2831
2832 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2833 {
2834     return int64_to_float64_scalbn(a, scale, status);
2835 }
2836
2837 float64 int64_to_float64(int64_t a, float_status *status)
2838 {
2839     return int64_to_float64_scalbn(a, 0, status);
2840 }
2841
2842 float64 int32_to_float64(int32_t a, float_status *status)
2843 {
2844     return int64_to_float64_scalbn(a, 0, status);
2845 }
2846
2847 float64 int16_to_float64(int16_t a, float_status *status)
2848 {
2849     return int64_to_float64_scalbn(a, 0, status);
2850 }
2851
2852 /*
2853  * Returns the result of converting the two's complement integer `a'
2854  * to the bfloat16 format.
2855  */
2856
2857 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2858 {
2859     FloatParts64 pa = int_to_float(a, scale, status);
2860     return bfloat16_round_pack_canonical(pa, status);
2861 }
2862
2863 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2864 {
2865     return int64_to_bfloat16_scalbn(a, scale, status);
2866 }
2867
2868 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2869 {
2870     return int64_to_bfloat16_scalbn(a, scale, status);
2871 }
2872
2873 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2874 {
2875     return int64_to_bfloat16_scalbn(a, 0, status);
2876 }
2877
2878 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2879 {
2880     return int64_to_bfloat16_scalbn(a, 0, status);
2881 }
2882
2883 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2884 {
2885     return int64_to_bfloat16_scalbn(a, 0, status);
2886 }
2887
2888 /*
2889  * Unsigned Integer to float conversions
2890  *
2891  * Returns the result of converting the unsigned integer `a' to the
2892  * floating-point format. The conversion is performed according to the
2893  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894  */
2895
2896 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
2897 {
2898     FloatParts64 r = { .sign = false };
2899     int shift;
2900
2901     if (a == 0) {
2902         r.cls = float_class_zero;
2903     } else {
2904         scale = MIN(MAX(scale, -0x10000), 0x10000);
2905         shift = clz64(a);
2906         r.cls = float_class_normal;
2907         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2908         r.frac = a << shift;
2909     }
2910
2911     return r;
2912 }
2913
2914 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2915 {
2916     FloatParts64 pa = uint_to_float(a, scale, status);
2917     return float16_round_pack_canonical(pa, status);
2918 }
2919
2920 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2921 {
2922     return uint64_to_float16_scalbn(a, scale, status);
2923 }
2924
2925 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2926 {
2927     return uint64_to_float16_scalbn(a, scale, status);
2928 }
2929
2930 float16 uint64_to_float16(uint64_t a, float_status *status)
2931 {
2932     return uint64_to_float16_scalbn(a, 0, status);
2933 }
2934
2935 float16 uint32_to_float16(uint32_t a, float_status *status)
2936 {
2937     return uint64_to_float16_scalbn(a, 0, status);
2938 }
2939
2940 float16 uint16_to_float16(uint16_t a, float_status *status)
2941 {
2942     return uint64_to_float16_scalbn(a, 0, status);
2943 }
2944
2945 float16 uint8_to_float16(uint8_t a, float_status *status)
2946 {
2947     return uint64_to_float16_scalbn(a, 0, status);
2948 }
2949
2950 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2951 {
2952     FloatParts64 pa = uint_to_float(a, scale, status);
2953     return float32_round_pack_canonical(pa, status);
2954 }
2955
2956 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2957 {
2958     return uint64_to_float32_scalbn(a, scale, status);
2959 }
2960
2961 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2962 {
2963     return uint64_to_float32_scalbn(a, scale, status);
2964 }
2965
2966 float32 uint64_to_float32(uint64_t a, float_status *status)
2967 {
2968     return uint64_to_float32_scalbn(a, 0, status);
2969 }
2970
2971 float32 uint32_to_float32(uint32_t a, float_status *status)
2972 {
2973     return uint64_to_float32_scalbn(a, 0, status);
2974 }
2975
2976 float32 uint16_to_float32(uint16_t a, float_status *status)
2977 {
2978     return uint64_to_float32_scalbn(a, 0, status);
2979 }
2980
2981 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2982 {
2983     FloatParts64 pa = uint_to_float(a, scale, status);
2984     return float64_round_pack_canonical(pa, status);
2985 }
2986
2987 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2988 {
2989     return uint64_to_float64_scalbn(a, scale, status);
2990 }
2991
2992 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2993 {
2994     return uint64_to_float64_scalbn(a, scale, status);
2995 }
2996
2997 float64 uint64_to_float64(uint64_t a, float_status *status)
2998 {
2999     return uint64_to_float64_scalbn(a, 0, status);
3000 }
3001
3002 float64 uint32_to_float64(uint32_t a, float_status *status)
3003 {
3004     return uint64_to_float64_scalbn(a, 0, status);
3005 }
3006
3007 float64 uint16_to_float64(uint16_t a, float_status *status)
3008 {
3009     return uint64_to_float64_scalbn(a, 0, status);
3010 }
3011
3012 /*
3013  * Returns the result of converting the unsigned integer `a' to the
3014  * bfloat16 format.
3015  */
3016
3017 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3018 {
3019     FloatParts64 pa = uint_to_float(a, scale, status);
3020     return bfloat16_round_pack_canonical(pa, status);
3021 }
3022
3023 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3024 {
3025     return uint64_to_bfloat16_scalbn(a, scale, status);
3026 }
3027
3028 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3029 {
3030     return uint64_to_bfloat16_scalbn(a, scale, status);
3031 }
3032
3033 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3034 {
3035     return uint64_to_bfloat16_scalbn(a, 0, status);
3036 }
3037
3038 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3039 {
3040     return uint64_to_bfloat16_scalbn(a, 0, status);
3041 }
3042
3043 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3044 {
3045     return uint64_to_bfloat16_scalbn(a, 0, status);
3046 }
3047
3048 /* Float Min/Max */
3049 /* min() and max() functions. These can't be implemented as
3050  * 'compare and pick one input' because that would mishandle
3051  * NaNs and +0 vs -0.
3052  *
3053  * minnum() and maxnum() functions. These are similar to the min()
3054  * and max() functions but if one of the arguments is a QNaN and
3055  * the other is numerical then the numerical argument is returned.
3056  * SNaNs will get quietened before being returned.
3057  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3058  * and maxNum() operations. min() and max() are the typical min/max
3059  * semantics provided by many CPUs which predate that specification.
3060  *
3061  * minnummag() and maxnummag() functions correspond to minNumMag()
3062  * and minNumMag() from the IEEE-754 2008.
3063  */
3064 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3065                                 bool ieee, bool ismag, float_status *s)
3066 {
3067     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3068         if (ieee) {
3069             /* Takes two floating-point values `a' and `b', one of
3070              * which is a NaN, and returns the appropriate NaN
3071              * result. If either `a' or `b' is a signaling NaN,
3072              * the invalid exception is raised.
3073              */
3074             if (is_snan(a.cls) || is_snan(b.cls)) {
3075                 return pick_nan(a, b, s);
3076             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3077                 return b;
3078             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3079                 return a;
3080             }
3081         }
3082         return pick_nan(a, b, s);
3083     } else {
3084         int a_exp, b_exp;
3085
3086         switch (a.cls) {
3087         case float_class_normal:
3088             a_exp = a.exp;
3089             break;
3090         case float_class_inf:
3091             a_exp = INT_MAX;
3092             break;
3093         case float_class_zero:
3094             a_exp = INT_MIN;
3095             break;
3096         default:
3097             g_assert_not_reached();
3098             break;
3099         }
3100         switch (b.cls) {
3101         case float_class_normal:
3102             b_exp = b.exp;
3103             break;
3104         case float_class_inf:
3105             b_exp = INT_MAX;
3106             break;
3107         case float_class_zero:
3108             b_exp = INT_MIN;
3109             break;
3110         default:
3111             g_assert_not_reached();
3112             break;
3113         }
3114
3115         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3116             bool a_less = a_exp < b_exp;
3117             if (a_exp == b_exp) {
3118                 a_less = a.frac < b.frac;
3119             }
3120             return a_less ^ ismin ? b : a;
3121         }
3122
3123         if (a.sign == b.sign) {
3124             bool a_less = a_exp < b_exp;
3125             if (a_exp == b_exp) {
3126                 a_less = a.frac < b.frac;
3127             }
3128             return a.sign ^ a_less ^ ismin ? b : a;
3129         } else {
3130             return a.sign ^ ismin ? b : a;
3131         }
3132     }
3133 }
3134
3135 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3136 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3137                                      float_status *s)                   \
3138 {                                                                       \
3139     FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);             \
3140     FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);             \
3141     FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3142                                                                         \
3143     return float ## sz ## _round_pack_canonical(pr, s);                 \
3144 }
3145
3146 MINMAX(16, min, true, false, false)
3147 MINMAX(16, minnum, true, true, false)
3148 MINMAX(16, minnummag, true, true, true)
3149 MINMAX(16, max, false, false, false)
3150 MINMAX(16, maxnum, false, true, false)
3151 MINMAX(16, maxnummag, false, true, true)
3152
3153 MINMAX(32, min, true, false, false)
3154 MINMAX(32, minnum, true, true, false)
3155 MINMAX(32, minnummag, true, true, true)
3156 MINMAX(32, max, false, false, false)
3157 MINMAX(32, maxnum, false, true, false)
3158 MINMAX(32, maxnummag, false, true, true)
3159
3160 MINMAX(64, min, true, false, false)
3161 MINMAX(64, minnum, true, true, false)
3162 MINMAX(64, minnummag, true, true, true)
3163 MINMAX(64, max, false, false, false)
3164 MINMAX(64, maxnum, false, true, false)
3165 MINMAX(64, maxnummag, false, true, true)
3166
3167 #undef MINMAX
3168
3169 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3170 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3171 {                                                                       \
3172     FloatParts64 pa = bfloat16_unpack_canonical(a, s);                    \
3173     FloatParts64 pb = bfloat16_unpack_canonical(b, s);                    \
3174     FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3175                                                                         \
3176     return bfloat16_round_pack_canonical(pr, s);                        \
3177 }
3178
3179 BF16_MINMAX(min, true, false, false)
3180 BF16_MINMAX(minnum, true, true, false)
3181 BF16_MINMAX(minnummag, true, true, true)
3182 BF16_MINMAX(max, false, false, false)
3183 BF16_MINMAX(maxnum, false, true, false)
3184 BF16_MINMAX(maxnummag, false, true, true)
3185
3186 #undef BF16_MINMAX
3187
3188 /* Floating point compare */
3189 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3190                                     float_status *s)
3191 {
3192     if (is_nan(a.cls) || is_nan(b.cls)) {
3193         if (!is_quiet ||
3194             a.cls == float_class_snan ||
3195             b.cls == float_class_snan) {
3196             float_raise(float_flag_invalid, s);
3197         }
3198         return float_relation_unordered;
3199     }
3200
3201     if (a.cls == float_class_zero) {
3202         if (b.cls == float_class_zero) {
3203             return float_relation_equal;
3204         }
3205         return b.sign ? float_relation_greater : float_relation_less;
3206     } else if (b.cls == float_class_zero) {
3207         return a.sign ? float_relation_less : float_relation_greater;
3208     }
3209
3210     /* The only really important thing about infinity is its sign. If
3211      * both are infinities the sign marks the smallest of the two.
3212      */
3213     if (a.cls == float_class_inf) {
3214         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3215             return float_relation_equal;
3216         }
3217         return a.sign ? float_relation_less : float_relation_greater;
3218     } else if (b.cls == float_class_inf) {
3219         return b.sign ? float_relation_greater : float_relation_less;
3220     }
3221
3222     if (a.sign != b.sign) {
3223         return a.sign ? float_relation_less : float_relation_greater;
3224     }
3225
3226     if (a.exp == b.exp) {
3227         if (a.frac == b.frac) {
3228             return float_relation_equal;
3229         }
3230         if (a.sign) {
3231             return a.frac > b.frac ?
3232                 float_relation_less : float_relation_greater;
3233         } else {
3234             return a.frac > b.frac ?
3235                 float_relation_greater : float_relation_less;
3236         }
3237     } else {
3238         if (a.sign) {
3239             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3240         } else {
3241             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3242         }
3243     }
3244 }
3245
3246 #define COMPARE(name, attr, sz)                                         \
3247 static int attr                                                         \
3248 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3249 {                                                                       \
3250     FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);             \
3251     FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);             \
3252     return compare_floats(pa, pb, is_quiet, s);                         \
3253 }
3254
3255 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3256 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3257 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3258
3259 #undef COMPARE
3260
3261 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3262 {
3263     return soft_f16_compare(a, b, false, s);
3264 }
3265
3266 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3267 {
3268     return soft_f16_compare(a, b, true, s);
3269 }
3270
3271 static FloatRelation QEMU_FLATTEN
3272 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3273 {
3274     union_float32 ua, ub;
3275
3276     ua.s = xa;
3277     ub.s = xb;
3278
3279     if (QEMU_NO_HARDFLOAT) {
3280         goto soft;
3281     }
3282
3283     float32_input_flush2(&ua.s, &ub.s, s);
3284     if (isgreaterequal(ua.h, ub.h)) {
3285         if (isgreater(ua.h, ub.h)) {
3286             return float_relation_greater;
3287         }
3288         return float_relation_equal;
3289     }
3290     if (likely(isless(ua.h, ub.h))) {
3291         return float_relation_less;
3292     }
3293     /* The only condition remaining is unordered.
3294      * Fall through to set flags.
3295      */
3296  soft:
3297     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3298 }
3299
3300 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3301 {
3302     return f32_compare(a, b, false, s);
3303 }
3304
3305 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3306 {
3307     return f32_compare(a, b, true, s);
3308 }
3309
3310 static FloatRelation QEMU_FLATTEN
3311 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3312 {
3313     union_float64 ua, ub;
3314
3315     ua.s = xa;
3316     ub.s = xb;
3317
3318     if (QEMU_NO_HARDFLOAT) {
3319         goto soft;
3320     }
3321
3322     float64_input_flush2(&ua.s, &ub.s, s);
3323     if (isgreaterequal(ua.h, ub.h)) {
3324         if (isgreater(ua.h, ub.h)) {
3325             return float_relation_greater;
3326         }
3327         return float_relation_equal;
3328     }
3329     if (likely(isless(ua.h, ub.h))) {
3330         return float_relation_less;
3331     }
3332     /* The only condition remaining is unordered.
3333      * Fall through to set flags.
3334      */
3335  soft:
3336     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3337 }
3338
3339 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3340 {
3341     return f64_compare(a, b, false, s);
3342 }
3343
3344 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3345 {
3346     return f64_compare(a, b, true, s);
3347 }
3348
3349 static FloatRelation QEMU_FLATTEN
3350 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3351 {
3352     FloatParts64 pa = bfloat16_unpack_canonical(a, s);
3353     FloatParts64 pb = bfloat16_unpack_canonical(b, s);
3354     return compare_floats(pa, pb, is_quiet, s);
3355 }
3356
3357 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3358 {
3359     return soft_bf16_compare(a, b, false, s);
3360 }
3361
3362 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3363 {
3364     return soft_bf16_compare(a, b, true, s);
3365 }
3366
3367 /* Multiply A by 2 raised to the power N.  */
3368 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3369 {
3370     if (unlikely(is_nan(a.cls))) {
3371         return return_nan(a, s);
3372     }
3373     if (a.cls == float_class_normal) {
3374         /* The largest float type (even though not supported by FloatParts64)
3375          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3376          * still allows rounding to infinity, without allowing overflow
3377          * within the int32_t that backs FloatParts64.exp.
3378          */
3379         n = MIN(MAX(n, -0x10000), 0x10000);
3380         a.exp += n;
3381     }
3382     return a;
3383 }
3384
3385 float16 float16_scalbn(float16 a, int n, float_status *status)
3386 {
3387     FloatParts64 pa = float16_unpack_canonical(a, status);
3388     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3389     return float16_round_pack_canonical(pr, status);
3390 }
3391
3392 float32 float32_scalbn(float32 a, int n, float_status *status)
3393 {
3394     FloatParts64 pa = float32_unpack_canonical(a, status);
3395     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3396     return float32_round_pack_canonical(pr, status);
3397 }
3398
3399 float64 float64_scalbn(float64 a, int n, float_status *status)
3400 {
3401     FloatParts64 pa = float64_unpack_canonical(a, status);
3402     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3403     return float64_round_pack_canonical(pr, status);
3404 }
3405
3406 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3407 {
3408     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3409     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3410     return bfloat16_round_pack_canonical(pr, status);
3411 }
3412
3413 /*
3414  * Square Root
3415  *
3416  * The old softfloat code did an approximation step before zeroing in
3417  * on the final result. However for simpleness we just compute the
3418  * square root by iterating down from the implicit bit to enough extra
3419  * bits to ensure we get a correctly rounded result.
3420  *
3421  * This does mean however the calculation is slower than before,
3422  * especially for 64 bit floats.
3423  */
3424
3425 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3426 {
3427     uint64_t a_frac, r_frac, s_frac;
3428     int bit, last_bit;
3429
3430     if (is_nan(a.cls)) {
3431         return return_nan(a, s);
3432     }
3433     if (a.cls == float_class_zero) {
3434         return a;  /* sqrt(+-0) = +-0 */
3435     }
3436     if (a.sign) {
3437         float_raise(float_flag_invalid, s);
3438         return parts_default_nan(s);
3439     }
3440     if (a.cls == float_class_inf) {
3441         return a;  /* sqrt(+inf) = +inf */
3442     }
3443
3444     assert(a.cls == float_class_normal);
3445
3446     /* We need two overflow bits at the top. Adding room for that is a
3447      * right shift. If the exponent is odd, we can discard the low bit
3448      * by multiplying the fraction by 2; that's a left shift. Combine
3449      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3450      */
3451     a_frac = a.frac >> (2 - (a.exp & 1));
3452     a.exp >>= 1;
3453
3454     /* Bit-by-bit computation of sqrt.  */
3455     r_frac = 0;
3456     s_frac = 0;
3457
3458     /* Iterate from implicit bit down to the 3 extra bits to compute a
3459      * properly rounded result. Remember we've inserted two more bits
3460      * at the top, so these positions are two less.
3461      */
3462     bit = DECOMPOSED_BINARY_POINT - 2;
3463     last_bit = MAX(p->frac_shift - 4, 0);
3464     do {
3465         uint64_t q = 1ULL << bit;
3466         uint64_t t_frac = s_frac + q;
3467         if (t_frac <= a_frac) {
3468             s_frac = t_frac + q;
3469             a_frac -= t_frac;
3470             r_frac += q;
3471         }
3472         a_frac <<= 1;
3473     } while (--bit >= last_bit);
3474
3475     /* Undo the right shift done above. If there is any remaining
3476      * fraction, the result is inexact. Set the sticky bit.
3477      */
3478     a.frac = (r_frac << 2) + (a_frac != 0);
3479
3480     return a;
3481 }
3482
3483 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3484 {
3485     FloatParts64 pa = float16_unpack_canonical(a, status);
3486     FloatParts64 pr = sqrt_float(pa, status, &float16_params);
3487     return float16_round_pack_canonical(pr, status);
3488 }
3489
3490 static float32 QEMU_SOFTFLOAT_ATTR
3491 soft_f32_sqrt(float32 a, float_status *status)
3492 {
3493     FloatParts64 pa = float32_unpack_canonical(a, status);
3494     FloatParts64 pr = sqrt_float(pa, status, &float32_params);
3495     return float32_round_pack_canonical(pr, status);
3496 }
3497
3498 static float64 QEMU_SOFTFLOAT_ATTR
3499 soft_f64_sqrt(float64 a, float_status *status)
3500 {
3501     FloatParts64 pa = float64_unpack_canonical(a, status);
3502     FloatParts64 pr = sqrt_float(pa, status, &float64_params);
3503     return float64_round_pack_canonical(pr, status);
3504 }
3505
3506 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3507 {
3508     union_float32 ua, ur;
3509
3510     ua.s = xa;
3511     if (unlikely(!can_use_fpu(s))) {
3512         goto soft;
3513     }
3514
3515     float32_input_flush1(&ua.s, s);
3516     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3517         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3518                        fpclassify(ua.h) == FP_ZERO) ||
3519                      signbit(ua.h))) {
3520             goto soft;
3521         }
3522     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3523                         float32_is_neg(ua.s))) {
3524         goto soft;
3525     }
3526     ur.h = sqrtf(ua.h);
3527     return ur.s;
3528
3529  soft:
3530     return soft_f32_sqrt(ua.s, s);
3531 }
3532
3533 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3534 {
3535     union_float64 ua, ur;
3536
3537     ua.s = xa;
3538     if (unlikely(!can_use_fpu(s))) {
3539         goto soft;
3540     }
3541
3542     float64_input_flush1(&ua.s, s);
3543     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3544         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3545                        fpclassify(ua.h) == FP_ZERO) ||
3546                      signbit(ua.h))) {
3547             goto soft;
3548         }
3549     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3550                         float64_is_neg(ua.s))) {
3551         goto soft;
3552     }
3553     ur.h = sqrt(ua.h);
3554     return ur.s;
3555
3556  soft:
3557     return soft_f64_sqrt(ua.s, s);
3558 }
3559
3560 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3561 {
3562     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3563     FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
3564     return bfloat16_round_pack_canonical(pr, status);
3565 }
3566
3567 /*----------------------------------------------------------------------------
3568 | The pattern for a default generated NaN.
3569 *----------------------------------------------------------------------------*/
3570
3571 float16 float16_default_nan(float_status *status)
3572 {
3573     FloatParts64 p = parts_default_nan(status);
3574     p.frac >>= float16_params.frac_shift;
3575     return float16_pack_raw(p);
3576 }
3577
3578 float32 float32_default_nan(float_status *status)
3579 {
3580     FloatParts64 p = parts_default_nan(status);
3581     p.frac >>= float32_params.frac_shift;
3582     return float32_pack_raw(p);
3583 }
3584
3585 float64 float64_default_nan(float_status *status)
3586 {
3587     FloatParts64 p = parts_default_nan(status);
3588     p.frac >>= float64_params.frac_shift;
3589     return float64_pack_raw(p);
3590 }
3591
3592 float128 float128_default_nan(float_status *status)
3593 {
3594     FloatParts64 p = parts_default_nan(status);
3595     float128 r;
3596
3597     /* Extrapolate from the choices made by parts_default_nan to fill
3598      * in the quad-floating format.  If the low bit is set, assume we
3599      * want to set all non-snan bits.
3600      */
3601     r.low = -(p.frac & 1);
3602     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3603     r.high |= UINT64_C(0x7FFF000000000000);
3604     r.high |= (uint64_t)p.sign << 63;
3605
3606     return r;
3607 }
3608
3609 bfloat16 bfloat16_default_nan(float_status *status)
3610 {
3611     FloatParts64 p = parts_default_nan(status);
3612     p.frac >>= bfloat16_params.frac_shift;
3613     return bfloat16_pack_raw(p);
3614 }
3615
3616 /*----------------------------------------------------------------------------
3617 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3618 *----------------------------------------------------------------------------*/
3619
3620 float16 float16_silence_nan(float16 a, float_status *status)
3621 {
3622     FloatParts64 p = float16_unpack_raw(a);
3623     p.frac <<= float16_params.frac_shift;
3624     p = parts_silence_nan(p, status);
3625     p.frac >>= float16_params.frac_shift;
3626     return float16_pack_raw(p);
3627 }
3628
3629 float32 float32_silence_nan(float32 a, float_status *status)
3630 {
3631     FloatParts64 p = float32_unpack_raw(a);
3632     p.frac <<= float32_params.frac_shift;
3633     p = parts_silence_nan(p, status);
3634     p.frac >>= float32_params.frac_shift;
3635     return float32_pack_raw(p);
3636 }
3637
3638 float64 float64_silence_nan(float64 a, float_status *status)
3639 {
3640     FloatParts64 p = float64_unpack_raw(a);
3641     p.frac <<= float64_params.frac_shift;
3642     p = parts_silence_nan(p, status);
3643     p.frac >>= float64_params.frac_shift;
3644     return float64_pack_raw(p);
3645 }
3646
3647 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3648 {
3649     FloatParts64 p = bfloat16_unpack_raw(a);
3650     p.frac <<= bfloat16_params.frac_shift;
3651     p = parts_silence_nan(p, status);
3652     p.frac >>= bfloat16_params.frac_shift;
3653     return bfloat16_pack_raw(p);
3654 }
3655
3656 /*----------------------------------------------------------------------------
3657 | If `a' is denormal and we are in flush-to-zero mode then set the
3658 | input-denormal exception and return zero. Otherwise just return the value.
3659 *----------------------------------------------------------------------------*/
3660
3661 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3662 {
3663     if (p.exp == 0 && p.frac != 0) {
3664         float_raise(float_flag_input_denormal, status);
3665         return true;
3666     }
3667
3668     return false;
3669 }
3670
3671 float16 float16_squash_input_denormal(float16 a, float_status *status)
3672 {
3673     if (status->flush_inputs_to_zero) {
3674         FloatParts64 p = float16_unpack_raw(a);
3675         if (parts_squash_denormal(p, status)) {
3676             return float16_set_sign(float16_zero, p.sign);
3677         }
3678     }
3679     return a;
3680 }
3681
3682 float32 float32_squash_input_denormal(float32 a, float_status *status)
3683 {
3684     if (status->flush_inputs_to_zero) {
3685         FloatParts64 p = float32_unpack_raw(a);
3686         if (parts_squash_denormal(p, status)) {
3687             return float32_set_sign(float32_zero, p.sign);
3688         }
3689     }
3690     return a;
3691 }
3692
3693 float64 float64_squash_input_denormal(float64 a, float_status *status)
3694 {
3695     if (status->flush_inputs_to_zero) {
3696         FloatParts64 p = float64_unpack_raw(a);
3697         if (parts_squash_denormal(p, status)) {
3698             return float64_set_sign(float64_zero, p.sign);
3699         }
3700     }
3701     return a;
3702 }
3703
3704 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3705 {
3706     if (status->flush_inputs_to_zero) {
3707         FloatParts64 p = bfloat16_unpack_raw(a);
3708         if (parts_squash_denormal(p, status)) {
3709             return bfloat16_set_sign(bfloat16_zero, p.sign);
3710         }
3711     }
3712     return a;
3713 }
3714
3715 /*----------------------------------------------------------------------------
3716 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3717 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3718 | input.  If `zSign' is 1, the input is negated before being converted to an
3719 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3720 | is simply rounded to an integer, with the inexact exception raised if the
3721 | input cannot be represented exactly as an integer.  However, if the fixed-
3722 | point input is too large, the invalid exception is raised and the largest
3723 | positive or negative integer is returned.
3724 *----------------------------------------------------------------------------*/
3725
3726 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3727                                  float_status *status)
3728 {
3729     int8_t roundingMode;
3730     bool roundNearestEven;
3731     int8_t roundIncrement, roundBits;
3732     int32_t z;
3733
3734     roundingMode = status->float_rounding_mode;
3735     roundNearestEven = ( roundingMode == float_round_nearest_even );
3736     switch (roundingMode) {
3737     case float_round_nearest_even:
3738     case float_round_ties_away:
3739         roundIncrement = 0x40;
3740         break;
3741     case float_round_to_zero:
3742         roundIncrement = 0;
3743         break;
3744     case float_round_up:
3745         roundIncrement = zSign ? 0 : 0x7f;
3746         break;
3747     case float_round_down:
3748         roundIncrement = zSign ? 0x7f : 0;
3749         break;
3750     case float_round_to_odd:
3751         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3752         break;
3753     default:
3754         abort();
3755     }
3756     roundBits = absZ & 0x7F;
3757     absZ = ( absZ + roundIncrement )>>7;
3758     if (!(roundBits ^ 0x40) && roundNearestEven) {
3759         absZ &= ~1;
3760     }
3761     z = absZ;
3762     if ( zSign ) z = - z;
3763     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3764         float_raise(float_flag_invalid, status);
3765         return zSign ? INT32_MIN : INT32_MAX;
3766     }
3767     if (roundBits) {
3768         float_raise(float_flag_inexact, status);
3769     }
3770     return z;
3771
3772 }
3773
3774 /*----------------------------------------------------------------------------
3775 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3776 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3777 | and returns the properly rounded 64-bit integer corresponding to the input.
3778 | If `zSign' is 1, the input is negated before being converted to an integer.
3779 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3780 | the inexact exception raised if the input cannot be represented exactly as
3781 | an integer.  However, if the fixed-point input is too large, the invalid
3782 | exception is raised and the largest positive or negative integer is
3783 | returned.
3784 *----------------------------------------------------------------------------*/
3785
3786 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3787                                float_status *status)
3788 {
3789     int8_t roundingMode;
3790     bool roundNearestEven, increment;
3791     int64_t z;
3792
3793     roundingMode = status->float_rounding_mode;
3794     roundNearestEven = ( roundingMode == float_round_nearest_even );
3795     switch (roundingMode) {
3796     case float_round_nearest_even:
3797     case float_round_ties_away:
3798         increment = ((int64_t) absZ1 < 0);
3799         break;
3800     case float_round_to_zero:
3801         increment = 0;
3802         break;
3803     case float_round_up:
3804         increment = !zSign && absZ1;
3805         break;
3806     case float_round_down:
3807         increment = zSign && absZ1;
3808         break;
3809     case float_round_to_odd:
3810         increment = !(absZ0 & 1) && absZ1;
3811         break;
3812     default:
3813         abort();
3814     }
3815     if ( increment ) {
3816         ++absZ0;
3817         if ( absZ0 == 0 ) goto overflow;
3818         if (!(absZ1 << 1) && roundNearestEven) {
3819             absZ0 &= ~1;
3820         }
3821     }
3822     z = absZ0;
3823     if ( zSign ) z = - z;
3824     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3825  overflow:
3826         float_raise(float_flag_invalid, status);
3827         return zSign ? INT64_MIN : INT64_MAX;
3828     }
3829     if (absZ1) {
3830         float_raise(float_flag_inexact, status);
3831     }
3832     return z;
3833
3834 }
3835
3836 /*----------------------------------------------------------------------------
3837 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3838 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3839 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3840 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3841 | with the inexact exception raised if the input cannot be represented exactly
3842 | as an integer.  However, if the fixed-point input is too large, the invalid
3843 | exception is raised and the largest unsigned integer is returned.
3844 *----------------------------------------------------------------------------*/
3845
3846 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3847                                 uint64_t absZ1, float_status *status)
3848 {
3849     int8_t roundingMode;
3850     bool roundNearestEven, increment;
3851
3852     roundingMode = status->float_rounding_mode;
3853     roundNearestEven = (roundingMode == float_round_nearest_even);
3854     switch (roundingMode) {
3855     case float_round_nearest_even:
3856     case float_round_ties_away:
3857         increment = ((int64_t)absZ1 < 0);
3858         break;
3859     case float_round_to_zero:
3860         increment = 0;
3861         break;
3862     case float_round_up:
3863         increment = !zSign && absZ1;
3864         break;
3865     case float_round_down:
3866         increment = zSign && absZ1;
3867         break;
3868     case float_round_to_odd:
3869         increment = !(absZ0 & 1) && absZ1;
3870         break;
3871     default:
3872         abort();
3873     }
3874     if (increment) {
3875         ++absZ0;
3876         if (absZ0 == 0) {
3877             float_raise(float_flag_invalid, status);
3878             return UINT64_MAX;
3879         }
3880         if (!(absZ1 << 1) && roundNearestEven) {
3881             absZ0 &= ~1;
3882         }
3883     }
3884
3885     if (zSign && absZ0) {
3886         float_raise(float_flag_invalid, status);
3887         return 0;
3888     }
3889
3890     if (absZ1) {
3891         float_raise(float_flag_inexact, status);
3892     }
3893     return absZ0;
3894 }
3895
3896 /*----------------------------------------------------------------------------
3897 | Normalizes the subnormal single-precision floating-point value represented
3898 | by the denormalized significand `aSig'.  The normalized exponent and
3899 | significand are stored at the locations pointed to by `zExpPtr' and
3900 | `zSigPtr', respectively.
3901 *----------------------------------------------------------------------------*/
3902
3903 static void
3904  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3905 {
3906     int8_t shiftCount;
3907
3908     shiftCount = clz32(aSig) - 8;
3909     *zSigPtr = aSig<<shiftCount;
3910     *zExpPtr = 1 - shiftCount;
3911
3912 }
3913
3914 /*----------------------------------------------------------------------------
3915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3916 | and significand `zSig', and returns the proper single-precision floating-
3917 | point value corresponding to the abstract input.  Ordinarily, the abstract
3918 | value is simply rounded and packed into the single-precision format, with
3919 | the inexact exception raised if the abstract input cannot be represented
3920 | exactly.  However, if the abstract value is too large, the overflow and
3921 | inexact exceptions are raised and an infinity or maximal finite value is
3922 | returned.  If the abstract value is too small, the input value is rounded to
3923 | a subnormal number, and the underflow and inexact exceptions are raised if
3924 | the abstract input cannot be represented exactly as a subnormal single-
3925 | precision floating-point number.
3926 |     The input significand `zSig' has its binary point between bits 30
3927 | and 29, which is 7 bits to the left of the usual location.  This shifted
3928 | significand must be normalized or smaller.  If `zSig' is not normalized,
3929 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3930 | and it must not require rounding.  In the usual case that `zSig' is
3931 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3932 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3933 | Binary Floating-Point Arithmetic.
3934 *----------------------------------------------------------------------------*/
3935
3936 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3937                                    float_status *status)
3938 {
3939     int8_t roundingMode;
3940     bool roundNearestEven;
3941     int8_t roundIncrement, roundBits;
3942     bool isTiny;
3943
3944     roundingMode = status->float_rounding_mode;
3945     roundNearestEven = ( roundingMode == float_round_nearest_even );
3946     switch (roundingMode) {
3947     case float_round_nearest_even:
3948     case float_round_ties_away:
3949         roundIncrement = 0x40;
3950         break;
3951     case float_round_to_zero:
3952         roundIncrement = 0;
3953         break;
3954     case float_round_up:
3955         roundIncrement = zSign ? 0 : 0x7f;
3956         break;
3957     case float_round_down:
3958         roundIncrement = zSign ? 0x7f : 0;
3959         break;
3960     case float_round_to_odd:
3961         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3962         break;
3963     default:
3964         abort();
3965         break;
3966     }
3967     roundBits = zSig & 0x7F;
3968     if ( 0xFD <= (uint16_t) zExp ) {
3969         if (    ( 0xFD < zExp )
3970              || (    ( zExp == 0xFD )
3971                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3972            ) {
3973             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3974                                    roundIncrement != 0;
3975             float_raise(float_flag_overflow | float_flag_inexact, status);
3976             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3977         }
3978         if ( zExp < 0 ) {
3979             if (status->flush_to_zero) {
3980                 float_raise(float_flag_output_denormal, status);
3981                 return packFloat32(zSign, 0, 0);
3982             }
3983             isTiny = status->tininess_before_rounding
3984                   || (zExp < -1)
3985                   || (zSig + roundIncrement < 0x80000000);
3986             shift32RightJamming( zSig, - zExp, &zSig );
3987             zExp = 0;
3988             roundBits = zSig & 0x7F;
3989             if (isTiny && roundBits) {
3990                 float_raise(float_flag_underflow, status);
3991             }
3992             if (roundingMode == float_round_to_odd) {
3993                 /*
3994                  * For round-to-odd case, the roundIncrement depends on
3995                  * zSig which just changed.
3996                  */
3997                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3998             }
3999         }
4000     }
4001     if (roundBits) {
4002         float_raise(float_flag_inexact, status);
4003     }
4004     zSig = ( zSig + roundIncrement )>>7;
4005     if (!(roundBits ^ 0x40) && roundNearestEven) {
4006         zSig &= ~1;
4007     }
4008     if ( zSig == 0 ) zExp = 0;
4009     return packFloat32( zSign, zExp, zSig );
4010
4011 }
4012
4013 /*----------------------------------------------------------------------------
4014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4015 | and significand `zSig', and returns the proper single-precision floating-
4016 | point value corresponding to the abstract input.  This routine is just like
4017 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4018 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4019 | floating-point exponent.
4020 *----------------------------------------------------------------------------*/
4021
4022 static float32
4023  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4024                               float_status *status)
4025 {
4026     int8_t shiftCount;
4027
4028     shiftCount = clz32(zSig) - 1;
4029     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4030                                status);
4031
4032 }
4033
4034 /*----------------------------------------------------------------------------
4035 | Normalizes the subnormal double-precision floating-point value represented
4036 | by the denormalized significand `aSig'.  The normalized exponent and
4037 | significand are stored at the locations pointed to by `zExpPtr' and
4038 | `zSigPtr', respectively.
4039 *----------------------------------------------------------------------------*/
4040
4041 static void
4042  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4043 {
4044     int8_t shiftCount;
4045
4046     shiftCount = clz64(aSig) - 11;
4047     *zSigPtr = aSig<<shiftCount;
4048     *zExpPtr = 1 - shiftCount;
4049
4050 }
4051
4052 /*----------------------------------------------------------------------------
4053 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4054 | double-precision floating-point value, returning the result.  After being
4055 | shifted into the proper positions, the three fields are simply added
4056 | together to form the result.  This means that any integer portion of `zSig'
4057 | will be added into the exponent.  Since a properly normalized significand
4058 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4059 | than the desired result exponent whenever `zSig' is a complete, normalized
4060 | significand.
4061 *----------------------------------------------------------------------------*/
4062
4063 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4064 {
4065
4066     return make_float64(
4067         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4068
4069 }
4070
4071 /*----------------------------------------------------------------------------
4072 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4073 | and significand `zSig', and returns the proper double-precision floating-
4074 | point value corresponding to the abstract input.  Ordinarily, the abstract
4075 | value is simply rounded and packed into the double-precision format, with
4076 | the inexact exception raised if the abstract input cannot be represented
4077 | exactly.  However, if the abstract value is too large, the overflow and
4078 | inexact exceptions are raised and an infinity or maximal finite value is
4079 | returned.  If the abstract value is too small, the input value is rounded to
4080 | a subnormal number, and the underflow and inexact exceptions are raised if
4081 | the abstract input cannot be represented exactly as a subnormal double-
4082 | precision floating-point number.
4083 |     The input significand `zSig' has its binary point between bits 62
4084 | and 61, which is 10 bits to the left of the usual location.  This shifted
4085 | significand must be normalized or smaller.  If `zSig' is not normalized,
4086 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4087 | and it must not require rounding.  In the usual case that `zSig' is
4088 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4089 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4090 | Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092
4093 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4094                                    float_status *status)
4095 {
4096     int8_t roundingMode;
4097     bool roundNearestEven;
4098     int roundIncrement, roundBits;
4099     bool isTiny;
4100
4101     roundingMode = status->float_rounding_mode;
4102     roundNearestEven = ( roundingMode == float_round_nearest_even );
4103     switch (roundingMode) {
4104     case float_round_nearest_even:
4105     case float_round_ties_away:
4106         roundIncrement = 0x200;
4107         break;
4108     case float_round_to_zero:
4109         roundIncrement = 0;
4110         break;
4111     case float_round_up:
4112         roundIncrement = zSign ? 0 : 0x3ff;
4113         break;
4114     case float_round_down:
4115         roundIncrement = zSign ? 0x3ff : 0;
4116         break;
4117     case float_round_to_odd:
4118         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4119         break;
4120     default:
4121         abort();
4122     }
4123     roundBits = zSig & 0x3FF;
4124     if ( 0x7FD <= (uint16_t) zExp ) {
4125         if (    ( 0x7FD < zExp )
4126              || (    ( zExp == 0x7FD )
4127                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4128            ) {
4129             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4130                                    roundIncrement != 0;
4131             float_raise(float_flag_overflow | float_flag_inexact, status);
4132             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4133         }
4134         if ( zExp < 0 ) {
4135             if (status->flush_to_zero) {
4136                 float_raise(float_flag_output_denormal, status);
4137                 return packFloat64(zSign, 0, 0);
4138             }
4139             isTiny = status->tininess_before_rounding
4140                   || (zExp < -1)
4141                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4142             shift64RightJamming( zSig, - zExp, &zSig );
4143             zExp = 0;
4144             roundBits = zSig & 0x3FF;
4145             if (isTiny && roundBits) {
4146                 float_raise(float_flag_underflow, status);
4147             }
4148             if (roundingMode == float_round_to_odd) {
4149                 /*
4150                  * For round-to-odd case, the roundIncrement depends on
4151                  * zSig which just changed.
4152                  */
4153                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4154             }
4155         }
4156     }
4157     if (roundBits) {
4158         float_raise(float_flag_inexact, status);
4159     }
4160     zSig = ( zSig + roundIncrement )>>10;
4161     if (!(roundBits ^ 0x200) && roundNearestEven) {
4162         zSig &= ~1;
4163     }
4164     if ( zSig == 0 ) zExp = 0;
4165     return packFloat64( zSign, zExp, zSig );
4166
4167 }
4168
4169 /*----------------------------------------------------------------------------
4170 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4171 | and significand `zSig', and returns the proper double-precision floating-
4172 | point value corresponding to the abstract input.  This routine is just like
4173 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4174 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4175 | floating-point exponent.
4176 *----------------------------------------------------------------------------*/
4177
4178 static float64
4179  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4180                               float_status *status)
4181 {
4182     int8_t shiftCount;
4183
4184     shiftCount = clz64(zSig) - 1;
4185     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4186                                status);
4187
4188 }
4189
4190 /*----------------------------------------------------------------------------
4191 | Normalizes the subnormal extended double-precision floating-point value
4192 | represented by the denormalized significand `aSig'.  The normalized exponent
4193 | and significand are stored at the locations pointed to by `zExpPtr' and
4194 | `zSigPtr', respectively.
4195 *----------------------------------------------------------------------------*/
4196
4197 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4198                                 uint64_t *zSigPtr)
4199 {
4200     int8_t shiftCount;
4201
4202     shiftCount = clz64(aSig);
4203     *zSigPtr = aSig<<shiftCount;
4204     *zExpPtr = 1 - shiftCount;
4205 }
4206
4207 /*----------------------------------------------------------------------------
4208 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4209 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4210 | and returns the proper extended double-precision floating-point value
4211 | corresponding to the abstract input.  Ordinarily, the abstract value is
4212 | rounded and packed into the extended double-precision format, with the
4213 | inexact exception raised if the abstract input cannot be represented
4214 | exactly.  However, if the abstract value is too large, the overflow and
4215 | inexact exceptions are raised and an infinity or maximal finite value is
4216 | returned.  If the abstract value is too small, the input value is rounded to
4217 | a subnormal number, and the underflow and inexact exceptions are raised if
4218 | the abstract input cannot be represented exactly as a subnormal extended
4219 | double-precision floating-point number.
4220 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4221 | number of bits as single or double precision, respectively.  Otherwise, the
4222 | result is rounded to the full precision of the extended double-precision
4223 | format.
4224 |     The input significand must be normalized or smaller.  If the input
4225 | significand is not normalized, `zExp' must be 0; in that case, the result
4226 | returned is a subnormal number, and it must not require rounding.  The
4227 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4230
4231 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4232                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4233                               float_status *status)
4234 {
4235     int8_t roundingMode;
4236     bool roundNearestEven, increment, isTiny;
4237     int64_t roundIncrement, roundMask, roundBits;
4238
4239     roundingMode = status->float_rounding_mode;
4240     roundNearestEven = ( roundingMode == float_round_nearest_even );
4241     if ( roundingPrecision == 80 ) goto precision80;
4242     if ( roundingPrecision == 64 ) {
4243         roundIncrement = UINT64_C(0x0000000000000400);
4244         roundMask = UINT64_C(0x00000000000007FF);
4245     }
4246     else if ( roundingPrecision == 32 ) {
4247         roundIncrement = UINT64_C(0x0000008000000000);
4248         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4249     }
4250     else {
4251         goto precision80;
4252     }
4253     zSig0 |= ( zSig1 != 0 );
4254     switch (roundingMode) {
4255     case float_round_nearest_even:
4256     case float_round_ties_away:
4257         break;
4258     case float_round_to_zero:
4259         roundIncrement = 0;
4260         break;
4261     case float_round_up:
4262         roundIncrement = zSign ? 0 : roundMask;
4263         break;
4264     case float_round_down:
4265         roundIncrement = zSign ? roundMask : 0;
4266         break;
4267     default:
4268         abort();
4269     }
4270     roundBits = zSig0 & roundMask;
4271     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4272         if (    ( 0x7FFE < zExp )
4273              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4274            ) {
4275             goto overflow;
4276         }
4277         if ( zExp <= 0 ) {
4278             if (status->flush_to_zero) {
4279                 float_raise(float_flag_output_denormal, status);
4280                 return packFloatx80(zSign, 0, 0);
4281             }
4282             isTiny = status->tininess_before_rounding
4283                   || (zExp < 0 )
4284                   || (zSig0 <= zSig0 + roundIncrement);
4285             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4286             zExp = 0;
4287             roundBits = zSig0 & roundMask;
4288             if (isTiny && roundBits) {
4289                 float_raise(float_flag_underflow, status);
4290             }
4291             if (roundBits) {
4292                 float_raise(float_flag_inexact, status);
4293             }
4294             zSig0 += roundIncrement;
4295             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4296             roundIncrement = roundMask + 1;
4297             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4298                 roundMask |= roundIncrement;
4299             }
4300             zSig0 &= ~ roundMask;
4301             return packFloatx80( zSign, zExp, zSig0 );
4302         }
4303     }
4304     if (roundBits) {
4305         float_raise(float_flag_inexact, status);
4306     }
4307     zSig0 += roundIncrement;
4308     if ( zSig0 < roundIncrement ) {
4309         ++zExp;
4310         zSig0 = UINT64_C(0x8000000000000000);
4311     }
4312     roundIncrement = roundMask + 1;
4313     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4314         roundMask |= roundIncrement;
4315     }
4316     zSig0 &= ~ roundMask;
4317     if ( zSig0 == 0 ) zExp = 0;
4318     return packFloatx80( zSign, zExp, zSig0 );
4319  precision80:
4320     switch (roundingMode) {
4321     case float_round_nearest_even:
4322     case float_round_ties_away:
4323         increment = ((int64_t)zSig1 < 0);
4324         break;
4325     case float_round_to_zero:
4326         increment = 0;
4327         break;
4328     case float_round_up:
4329         increment = !zSign && zSig1;
4330         break;
4331     case float_round_down:
4332         increment = zSign && zSig1;
4333         break;
4334     default:
4335         abort();
4336     }
4337     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4338         if (    ( 0x7FFE < zExp )
4339              || (    ( zExp == 0x7FFE )
4340                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4341                   && increment
4342                 )
4343            ) {
4344             roundMask = 0;
4345  overflow:
4346             float_raise(float_flag_overflow | float_flag_inexact, status);
4347             if (    ( roundingMode == float_round_to_zero )
4348                  || ( zSign && ( roundingMode == float_round_up ) )
4349                  || ( ! zSign && ( roundingMode == float_round_down ) )
4350                ) {
4351                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4352             }
4353             return packFloatx80(zSign,
4354                                 floatx80_infinity_high,
4355                                 floatx80_infinity_low);
4356         }
4357         if ( zExp <= 0 ) {
4358             isTiny = status->tininess_before_rounding
4359                   || (zExp < 0)
4360                   || !increment
4361                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4362             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4363             zExp = 0;
4364             if (isTiny && zSig1) {
4365                 float_raise(float_flag_underflow, status);
4366             }
4367             if (zSig1) {
4368                 float_raise(float_flag_inexact, status);
4369             }
4370             switch (roundingMode) {
4371             case float_round_nearest_even:
4372             case float_round_ties_away:
4373                 increment = ((int64_t)zSig1 < 0);
4374                 break;
4375             case float_round_to_zero:
4376                 increment = 0;
4377                 break;
4378             case float_round_up:
4379                 increment = !zSign && zSig1;
4380                 break;
4381             case float_round_down:
4382                 increment = zSign && zSig1;
4383                 break;
4384             default:
4385                 abort();
4386             }
4387             if ( increment ) {
4388                 ++zSig0;
4389                 if (!(zSig1 << 1) && roundNearestEven) {
4390                     zSig0 &= ~1;
4391                 }
4392                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4393             }
4394             return packFloatx80( zSign, zExp, zSig0 );
4395         }
4396     }
4397     if (zSig1) {
4398         float_raise(float_flag_inexact, status);
4399     }
4400     if ( increment ) {
4401         ++zSig0;
4402         if ( zSig0 == 0 ) {
4403             ++zExp;
4404             zSig0 = UINT64_C(0x8000000000000000);
4405         }
4406         else {
4407             if (!(zSig1 << 1) && roundNearestEven) {
4408                 zSig0 &= ~1;
4409             }
4410         }
4411     }
4412     else {
4413         if ( zSig0 == 0 ) zExp = 0;
4414     }
4415     return packFloatx80( zSign, zExp, zSig0 );
4416
4417 }
4418
4419 /*----------------------------------------------------------------------------
4420 | Takes an abstract floating-point value having sign `zSign', exponent
4421 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4422 | and returns the proper extended double-precision floating-point value
4423 | corresponding to the abstract input.  This routine is just like
4424 | `roundAndPackFloatx80' except that the input significand does not have to be
4425 | normalized.
4426 *----------------------------------------------------------------------------*/
4427
4428 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4429                                        bool zSign, int32_t zExp,
4430                                        uint64_t zSig0, uint64_t zSig1,
4431                                        float_status *status)
4432 {
4433     int8_t shiftCount;
4434
4435     if ( zSig0 == 0 ) {
4436         zSig0 = zSig1;
4437         zSig1 = 0;
4438         zExp -= 64;
4439     }
4440     shiftCount = clz64(zSig0);
4441     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4442     zExp -= shiftCount;
4443     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4444                                 zSig0, zSig1, status);
4445
4446 }
4447
4448 /*----------------------------------------------------------------------------
4449 | Returns the least-significant 64 fraction bits of the quadruple-precision
4450 | floating-point value `a'.
4451 *----------------------------------------------------------------------------*/
4452
4453 static inline uint64_t extractFloat128Frac1( float128 a )
4454 {
4455
4456     return a.low;
4457
4458 }
4459
4460 /*----------------------------------------------------------------------------
4461 | Returns the most-significant 48 fraction bits of the quadruple-precision
4462 | floating-point value `a'.
4463 *----------------------------------------------------------------------------*/
4464
4465 static inline uint64_t extractFloat128Frac0( float128 a )
4466 {
4467
4468     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4469
4470 }
4471
4472 /*----------------------------------------------------------------------------
4473 | Returns the exponent bits of the quadruple-precision floating-point value
4474 | `a'.
4475 *----------------------------------------------------------------------------*/
4476
4477 static inline int32_t extractFloat128Exp( float128 a )
4478 {
4479
4480     return ( a.high>>48 ) & 0x7FFF;
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4486 *----------------------------------------------------------------------------*/
4487
4488 static inline bool extractFloat128Sign(float128 a)
4489 {
4490     return a.high >> 63;
4491 }
4492
4493 /*----------------------------------------------------------------------------
4494 | Normalizes the subnormal quadruple-precision floating-point value
4495 | represented by the denormalized significand formed by the concatenation of
4496 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4497 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4498 | significand are stored at the location pointed to by `zSig0Ptr', and the
4499 | least significant 64 bits of the normalized significand are stored at the
4500 | location pointed to by `zSig1Ptr'.
4501 *----------------------------------------------------------------------------*/
4502
4503 static void
4504  normalizeFloat128Subnormal(
4505      uint64_t aSig0,
4506      uint64_t aSig1,
4507      int32_t *zExpPtr,
4508      uint64_t *zSig0Ptr,
4509      uint64_t *zSig1Ptr
4510  )
4511 {
4512     int8_t shiftCount;
4513
4514     if ( aSig0 == 0 ) {
4515         shiftCount = clz64(aSig1) - 15;
4516         if ( shiftCount < 0 ) {
4517             *zSig0Ptr = aSig1>>( - shiftCount );
4518             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4519         }
4520         else {
4521             *zSig0Ptr = aSig1<<shiftCount;
4522             *zSig1Ptr = 0;
4523         }
4524         *zExpPtr = - shiftCount - 63;
4525     }
4526     else {
4527         shiftCount = clz64(aSig0) - 15;
4528         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4529         *zExpPtr = 1 - shiftCount;
4530     }
4531
4532 }
4533
4534 /*----------------------------------------------------------------------------
4535 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4536 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4537 | floating-point value, returning the result.  After being shifted into the
4538 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4539 | added together to form the most significant 32 bits of the result.  This
4540 | means that any integer portion of `zSig0' will be added into the exponent.
4541 | Since a properly normalized significand will have an integer portion equal
4542 | to 1, the `zExp' input should be 1 less than the desired result exponent
4543 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4544 | significand.
4545 *----------------------------------------------------------------------------*/
4546
4547 static inline float128
4548 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4549 {
4550     float128 z;
4551
4552     z.low = zSig1;
4553     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4554     return z;
4555 }
4556
4557 /*----------------------------------------------------------------------------
4558 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4559 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4560 | and `zSig2', and returns the proper quadruple-precision floating-point value
4561 | corresponding to the abstract input.  Ordinarily, the abstract value is
4562 | simply rounded and packed into the quadruple-precision format, with the
4563 | inexact exception raised if the abstract input cannot be represented
4564 | exactly.  However, if the abstract value is too large, the overflow and
4565 | inexact exceptions are raised and an infinity or maximal finite value is
4566 | returned.  If the abstract value is too small, the input value is rounded to
4567 | a subnormal number, and the underflow and inexact exceptions are raised if
4568 | the abstract input cannot be represented exactly as a subnormal quadruple-
4569 | precision floating-point number.
4570 |     The input significand must be normalized or smaller.  If the input
4571 | significand is not normalized, `zExp' must be 0; in that case, the result
4572 | returned is a subnormal number, and it must not require rounding.  In the
4573 | usual case that the input significand is normalized, `zExp' must be 1 less
4574 | than the ``true'' floating-point exponent.  The handling of underflow and
4575 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4576 *----------------------------------------------------------------------------*/
4577
4578 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4579                                      uint64_t zSig0, uint64_t zSig1,
4580                                      uint64_t zSig2, float_status *status)
4581 {
4582     int8_t roundingMode;
4583     bool roundNearestEven, increment, isTiny;
4584
4585     roundingMode = status->float_rounding_mode;
4586     roundNearestEven = ( roundingMode == float_round_nearest_even );
4587     switch (roundingMode) {
4588     case float_round_nearest_even:
4589     case float_round_ties_away:
4590         increment = ((int64_t)zSig2 < 0);
4591         break;
4592     case float_round_to_zero:
4593         increment = 0;
4594         break;
4595     case float_round_up:
4596         increment = !zSign && zSig2;
4597         break;
4598     case float_round_down:
4599         increment = zSign && zSig2;
4600         break;
4601     case float_round_to_odd:
4602         increment = !(zSig1 & 0x1) && zSig2;
4603         break;
4604     default:
4605         abort();
4606     }
4607     if ( 0x7FFD <= (uint32_t) zExp ) {
4608         if (    ( 0x7FFD < zExp )
4609              || (    ( zExp == 0x7FFD )
4610                   && eq128(
4611                          UINT64_C(0x0001FFFFFFFFFFFF),
4612                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4613                          zSig0,
4614                          zSig1
4615                      )
4616                   && increment
4617                 )
4618            ) {
4619             float_raise(float_flag_overflow | float_flag_inexact, status);
4620             if (    ( roundingMode == float_round_to_zero )
4621                  || ( zSign && ( roundingMode == float_round_up ) )
4622                  || ( ! zSign && ( roundingMode == float_round_down ) )
4623                  || (roundingMode == float_round_to_odd)
4624                ) {
4625                 return
4626                     packFloat128(
4627                         zSign,
4628                         0x7FFE,
4629                         UINT64_C(0x0000FFFFFFFFFFFF),
4630                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4631                     );
4632             }
4633             return packFloat128( zSign, 0x7FFF, 0, 0 );
4634         }
4635         if ( zExp < 0 ) {
4636             if (status->flush_to_zero) {
4637                 float_raise(float_flag_output_denormal, status);
4638                 return packFloat128(zSign, 0, 0, 0);
4639             }
4640             isTiny = status->tininess_before_rounding
4641                   || (zExp < -1)
4642                   || !increment
4643                   || lt128(zSig0, zSig1,
4644                            UINT64_C(0x0001FFFFFFFFFFFF),
4645                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4646             shift128ExtraRightJamming(
4647                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4648             zExp = 0;
4649             if (isTiny && zSig2) {
4650                 float_raise(float_flag_underflow, status);
4651             }
4652             switch (roundingMode) {
4653             case float_round_nearest_even:
4654             case float_round_ties_away:
4655                 increment = ((int64_t)zSig2 < 0);
4656                 break;
4657             case float_round_to_zero:
4658                 increment = 0;
4659                 break;
4660             case float_round_up:
4661                 increment = !zSign && zSig2;
4662                 break;
4663             case float_round_down:
4664                 increment = zSign && zSig2;
4665                 break;
4666             case float_round_to_odd:
4667                 increment = !(zSig1 & 0x1) && zSig2;
4668                 break;
4669             default:
4670                 abort();
4671             }
4672         }
4673     }
4674     if (zSig2) {
4675         float_raise(float_flag_inexact, status);
4676     }
4677     if ( increment ) {
4678         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4679         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4680             zSig1 &= ~1;
4681         }
4682     }
4683     else {
4684         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4685     }
4686     return packFloat128( zSign, zExp, zSig0, zSig1 );
4687
4688 }
4689
4690 /*----------------------------------------------------------------------------
4691 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4692 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4693 | returns the proper quadruple-precision floating-point value corresponding
4694 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4695 | except that the input significand has fewer bits and does not have to be
4696 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4697 | point exponent.
4698 *----------------------------------------------------------------------------*/
4699
4700 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4701                                               uint64_t zSig0, uint64_t zSig1,
4702                                               float_status *status)
4703 {
4704     int8_t shiftCount;
4705     uint64_t zSig2;
4706
4707     if ( zSig0 == 0 ) {
4708         zSig0 = zSig1;
4709         zSig1 = 0;
4710         zExp -= 64;
4711     }
4712     shiftCount = clz64(zSig0) - 15;
4713     if ( 0 <= shiftCount ) {
4714         zSig2 = 0;
4715         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4716     }
4717     else {
4718         shift128ExtraRightJamming(
4719             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4720     }
4721     zExp -= shiftCount;
4722     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4723
4724 }
4725
4726
4727 /*----------------------------------------------------------------------------
4728 | Returns the result of converting the 32-bit two's complement integer `a'
4729 | to the extended double-precision floating-point format.  The conversion
4730 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4731 | Arithmetic.
4732 *----------------------------------------------------------------------------*/
4733
4734 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4735 {
4736     bool zSign;
4737     uint32_t absA;
4738     int8_t shiftCount;
4739     uint64_t zSig;
4740
4741     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4742     zSign = ( a < 0 );
4743     absA = zSign ? - a : a;
4744     shiftCount = clz32(absA) + 32;
4745     zSig = absA;
4746     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4747
4748 }
4749
4750 /*----------------------------------------------------------------------------
4751 | Returns the result of converting the 32-bit two's complement integer `a' to
4752 | the quadruple-precision floating-point format.  The conversion is performed
4753 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755
4756 float128 int32_to_float128(int32_t a, float_status *status)
4757 {
4758     bool zSign;
4759     uint32_t absA;
4760     int8_t shiftCount;
4761     uint64_t zSig0;
4762
4763     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4764     zSign = ( a < 0 );
4765     absA = zSign ? - a : a;
4766     shiftCount = clz32(absA) + 17;
4767     zSig0 = absA;
4768     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4769
4770 }
4771
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of converting the 64-bit two's complement integer `a'
4774 | to the extended double-precision floating-point format.  The conversion
4775 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4776 | Arithmetic.
4777 *----------------------------------------------------------------------------*/
4778
4779 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4780 {
4781     bool zSign;
4782     uint64_t absA;
4783     int8_t shiftCount;
4784
4785     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4786     zSign = ( a < 0 );
4787     absA = zSign ? - a : a;
4788     shiftCount = clz64(absA);
4789     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4790
4791 }
4792
4793 /*----------------------------------------------------------------------------
4794 | Returns the result of converting the 64-bit two's complement integer `a' to
4795 | the quadruple-precision floating-point format.  The conversion is performed
4796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4798
4799 float128 int64_to_float128(int64_t a, float_status *status)
4800 {
4801     bool zSign;
4802     uint64_t absA;
4803     int8_t shiftCount;
4804     int32_t zExp;
4805     uint64_t zSig0, zSig1;
4806
4807     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4808     zSign = ( a < 0 );
4809     absA = zSign ? - a : a;
4810     shiftCount = clz64(absA) + 49;
4811     zExp = 0x406E - shiftCount;
4812     if ( 64 <= shiftCount ) {
4813         zSig1 = 0;
4814         zSig0 = absA;
4815         shiftCount -= 64;
4816     }
4817     else {
4818         zSig1 = absA;
4819         zSig0 = 0;
4820     }
4821     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4822     return packFloat128( zSign, zExp, zSig0, zSig1 );
4823
4824 }
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit unsigned integer `a'
4828 | to the quadruple-precision floating-point format.  The conversion is performed
4829 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4831
4832 float128 uint64_to_float128(uint64_t a, float_status *status)
4833 {
4834     if (a == 0) {
4835         return float128_zero;
4836     }
4837     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4838 }
4839
4840 /*----------------------------------------------------------------------------
4841 | Returns the result of converting the single-precision floating-point value
4842 | `a' to the extended double-precision floating-point format.  The conversion
4843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4844 | Arithmetic.
4845 *----------------------------------------------------------------------------*/
4846
4847 floatx80 float32_to_floatx80(float32 a, float_status *status)
4848 {
4849     bool aSign;
4850     int aExp;
4851     uint32_t aSig;
4852
4853     a = float32_squash_input_denormal(a, status);
4854     aSig = extractFloat32Frac( a );
4855     aExp = extractFloat32Exp( a );
4856     aSign = extractFloat32Sign( a );
4857     if ( aExp == 0xFF ) {
4858         if (aSig) {
4859             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4860                                                status);
4861             return floatx80_silence_nan(res, status);
4862         }
4863         return packFloatx80(aSign,
4864                             floatx80_infinity_high,
4865                             floatx80_infinity_low);
4866     }
4867     if ( aExp == 0 ) {
4868         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4869         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4870     }
4871     aSig |= 0x00800000;
4872     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4873
4874 }
4875
4876 /*----------------------------------------------------------------------------
4877 | Returns the result of converting the single-precision floating-point value
4878 | `a' to the double-precision floating-point format.  The conversion is
4879 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4880 | Arithmetic.
4881 *----------------------------------------------------------------------------*/
4882
4883 float128 float32_to_float128(float32 a, float_status *status)
4884 {
4885     bool aSign;
4886     int aExp;
4887     uint32_t aSig;
4888
4889     a = float32_squash_input_denormal(a, status);
4890     aSig = extractFloat32Frac( a );
4891     aExp = extractFloat32Exp( a );
4892     aSign = extractFloat32Sign( a );
4893     if ( aExp == 0xFF ) {
4894         if (aSig) {
4895             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4896         }
4897         return packFloat128( aSign, 0x7FFF, 0, 0 );
4898     }
4899     if ( aExp == 0 ) {
4900         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4901         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4902         --aExp;
4903     }
4904     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4905
4906 }
4907
4908 /*----------------------------------------------------------------------------
4909 | Returns the remainder of the single-precision floating-point value `a'
4910 | with respect to the corresponding value `b'.  The operation is performed
4911 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912 *----------------------------------------------------------------------------*/
4913
4914 float32 float32_rem(float32 a, float32 b, float_status *status)
4915 {
4916     bool aSign, zSign;
4917     int aExp, bExp, expDiff;
4918     uint32_t aSig, bSig;
4919     uint32_t q;
4920     uint64_t aSig64, bSig64, q64;
4921     uint32_t alternateASig;
4922     int32_t sigMean;
4923     a = float32_squash_input_denormal(a, status);
4924     b = float32_squash_input_denormal(b, status);
4925
4926     aSig = extractFloat32Frac( a );
4927     aExp = extractFloat32Exp( a );
4928     aSign = extractFloat32Sign( a );
4929     bSig = extractFloat32Frac( b );
4930     bExp = extractFloat32Exp( b );
4931     if ( aExp == 0xFF ) {
4932         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4933             return propagateFloat32NaN(a, b, status);
4934         }
4935         float_raise(float_flag_invalid, status);
4936         return float32_default_nan(status);
4937     }
4938     if ( bExp == 0xFF ) {
4939         if (bSig) {
4940             return propagateFloat32NaN(a, b, status);
4941         }
4942         return a;
4943     }
4944     if ( bExp == 0 ) {
4945         if ( bSig == 0 ) {
4946             float_raise(float_flag_invalid, status);
4947             return float32_default_nan(status);
4948         }
4949         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4950     }
4951     if ( aExp == 0 ) {
4952         if ( aSig == 0 ) return a;
4953         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4954     }
4955     expDiff = aExp - bExp;
4956     aSig |= 0x00800000;
4957     bSig |= 0x00800000;
4958     if ( expDiff < 32 ) {
4959         aSig <<= 8;
4960         bSig <<= 8;
4961         if ( expDiff < 0 ) {
4962             if ( expDiff < -1 ) return a;
4963             aSig >>= 1;
4964         }
4965         q = ( bSig <= aSig );
4966         if ( q ) aSig -= bSig;
4967         if ( 0 < expDiff ) {
4968             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4969             q >>= 32 - expDiff;
4970             bSig >>= 2;
4971             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4972         }
4973         else {
4974             aSig >>= 2;
4975             bSig >>= 2;
4976         }
4977     }
4978     else {
4979         if ( bSig <= aSig ) aSig -= bSig;
4980         aSig64 = ( (uint64_t) aSig )<<40;
4981         bSig64 = ( (uint64_t) bSig )<<40;
4982         expDiff -= 64;
4983         while ( 0 < expDiff ) {
4984             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4985             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4986             aSig64 = - ( ( bSig * q64 )<<38 );
4987             expDiff -= 62;
4988         }
4989         expDiff += 64;
4990         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4991         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4992         q = q64>>( 64 - expDiff );
4993         bSig <<= 6;
4994         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4995     }
4996     do {
4997         alternateASig = aSig;
4998         ++q;
4999         aSig -= bSig;
5000     } while ( 0 <= (int32_t) aSig );
5001     sigMean = aSig + alternateASig;
5002     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5003         aSig = alternateASig;
5004     }
5005     zSign = ( (int32_t) aSig < 0 );
5006     if ( zSign ) aSig = - aSig;
5007     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5008 }
5009
5010
5011
5012 /*----------------------------------------------------------------------------
5013 | Returns the binary exponential of the single-precision floating-point value
5014 | `a'. The operation is performed according to the IEC/IEEE Standard for
5015 | Binary Floating-Point Arithmetic.
5016 |
5017 | Uses the following identities:
5018 |
5019 | 1. -------------------------------------------------------------------------
5020 |      x    x*ln(2)
5021 |     2  = e
5022 |
5023 | 2. -------------------------------------------------------------------------
5024 |                      2     3     4     5           n
5025 |      x        x     x     x     x     x           x
5026 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5027 |               1!    2!    3!    4!    5!          n!
5028 *----------------------------------------------------------------------------*/
5029
5030 static const float64 float32_exp2_coefficients[15] =
5031 {
5032     const_float64( 0x3ff0000000000000ll ), /*  1 */
5033     const_float64( 0x3fe0000000000000ll ), /*  2 */
5034     const_float64( 0x3fc5555555555555ll ), /*  3 */
5035     const_float64( 0x3fa5555555555555ll ), /*  4 */
5036     const_float64( 0x3f81111111111111ll ), /*  5 */
5037     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5038     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5039     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5040     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5041     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5042     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5043     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5044     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5045     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5046     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5047 };
5048
5049 float32 float32_exp2(float32 a, float_status *status)
5050 {
5051     bool aSign;
5052     int aExp;
5053     uint32_t aSig;
5054     float64 r, x, xn;
5055     int i;
5056     a = float32_squash_input_denormal(a, status);
5057
5058     aSig = extractFloat32Frac( a );
5059     aExp = extractFloat32Exp( a );
5060     aSign = extractFloat32Sign( a );
5061
5062     if ( aExp == 0xFF) {
5063         if (aSig) {
5064             return propagateFloat32NaN(a, float32_zero, status);
5065         }
5066         return (aSign) ? float32_zero : a;
5067     }
5068     if (aExp == 0) {
5069         if (aSig == 0) return float32_one;
5070     }
5071
5072     float_raise(float_flag_inexact, status);
5073
5074     /* ******************************* */
5075     /* using float64 for approximation */
5076     /* ******************************* */
5077     x = float32_to_float64(a, status);
5078     x = float64_mul(x, float64_ln2, status);
5079
5080     xn = x;
5081     r = float64_one;
5082     for (i = 0 ; i < 15 ; i++) {
5083         float64 f;
5084
5085         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5086         r = float64_add(r, f, status);
5087
5088         xn = float64_mul(xn, x, status);
5089     }
5090
5091     return float64_to_float32(r, status);
5092 }
5093
5094 /*----------------------------------------------------------------------------
5095 | Returns the binary log of the single-precision floating-point value `a'.
5096 | The operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099 float32 float32_log2(float32 a, float_status *status)
5100 {
5101     bool aSign, zSign;
5102     int aExp;
5103     uint32_t aSig, zSig, i;
5104
5105     a = float32_squash_input_denormal(a, status);
5106     aSig = extractFloat32Frac( a );
5107     aExp = extractFloat32Exp( a );
5108     aSign = extractFloat32Sign( a );
5109
5110     if ( aExp == 0 ) {
5111         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5112         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5113     }
5114     if ( aSign ) {
5115         float_raise(float_flag_invalid, status);
5116         return float32_default_nan(status);
5117     }
5118     if ( aExp == 0xFF ) {
5119         if (aSig) {
5120             return propagateFloat32NaN(a, float32_zero, status);
5121         }
5122         return a;
5123     }
5124
5125     aExp -= 0x7F;
5126     aSig |= 0x00800000;
5127     zSign = aExp < 0;
5128     zSig = aExp << 23;
5129
5130     for (i = 1 << 22; i > 0; i >>= 1) {
5131         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5132         if ( aSig & 0x01000000 ) {
5133             aSig >>= 1;
5134             zSig |= i;
5135         }
5136     }
5137
5138     if ( zSign )
5139         zSig = -zSig;
5140
5141     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5142 }
5143
5144 /*----------------------------------------------------------------------------
5145 | Returns the result of converting the double-precision floating-point value
5146 | `a' to the extended double-precision floating-point format.  The conversion
5147 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5148 | Arithmetic.
5149 *----------------------------------------------------------------------------*/
5150
5151 floatx80 float64_to_floatx80(float64 a, float_status *status)
5152 {
5153     bool aSign;
5154     int aExp;
5155     uint64_t aSig;
5156
5157     a = float64_squash_input_denormal(a, status);
5158     aSig = extractFloat64Frac( a );
5159     aExp = extractFloat64Exp( a );
5160     aSign = extractFloat64Sign( a );
5161     if ( aExp == 0x7FF ) {
5162         if (aSig) {
5163             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5164                                                status);
5165             return floatx80_silence_nan(res, status);
5166         }
5167         return packFloatx80(aSign,
5168                             floatx80_infinity_high,
5169                             floatx80_infinity_low);
5170     }
5171     if ( aExp == 0 ) {
5172         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5173         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5174     }
5175     return
5176         packFloatx80(
5177             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5178
5179 }
5180
5181 /*----------------------------------------------------------------------------
5182 | Returns the result of converting the double-precision floating-point value
5183 | `a' to the quadruple-precision floating-point format.  The conversion is
5184 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5185 | Arithmetic.
5186 *----------------------------------------------------------------------------*/
5187
5188 float128 float64_to_float128(float64 a, float_status *status)
5189 {
5190     bool aSign;
5191     int aExp;
5192     uint64_t aSig, zSig0, zSig1;
5193
5194     a = float64_squash_input_denormal(a, status);
5195     aSig = extractFloat64Frac( a );
5196     aExp = extractFloat64Exp( a );
5197     aSign = extractFloat64Sign( a );
5198     if ( aExp == 0x7FF ) {
5199         if (aSig) {
5200             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5201         }
5202         return packFloat128( aSign, 0x7FFF, 0, 0 );
5203     }
5204     if ( aExp == 0 ) {
5205         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5206         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5207         --aExp;
5208     }
5209     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5210     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5211
5212 }
5213
5214
5215 /*----------------------------------------------------------------------------
5216 | Returns the remainder of the double-precision floating-point value `a'
5217 | with respect to the corresponding value `b'.  The operation is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220
5221 float64 float64_rem(float64 a, float64 b, float_status *status)
5222 {
5223     bool aSign, zSign;
5224     int aExp, bExp, expDiff;
5225     uint64_t aSig, bSig;
5226     uint64_t q, alternateASig;
5227     int64_t sigMean;
5228
5229     a = float64_squash_input_denormal(a, status);
5230     b = float64_squash_input_denormal(b, status);
5231     aSig = extractFloat64Frac( a );
5232     aExp = extractFloat64Exp( a );
5233     aSign = extractFloat64Sign( a );
5234     bSig = extractFloat64Frac( b );
5235     bExp = extractFloat64Exp( b );
5236     if ( aExp == 0x7FF ) {
5237         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5238             return propagateFloat64NaN(a, b, status);
5239         }
5240         float_raise(float_flag_invalid, status);
5241         return float64_default_nan(status);
5242     }
5243     if ( bExp == 0x7FF ) {
5244         if (bSig) {
5245             return propagateFloat64NaN(a, b, status);
5246         }
5247         return a;
5248     }
5249     if ( bExp == 0 ) {
5250         if ( bSig == 0 ) {
5251             float_raise(float_flag_invalid, status);
5252             return float64_default_nan(status);
5253         }
5254         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5255     }
5256     if ( aExp == 0 ) {
5257         if ( aSig == 0 ) return a;
5258         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5259     }
5260     expDiff = aExp - bExp;
5261     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5262     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5263     if ( expDiff < 0 ) {
5264         if ( expDiff < -1 ) return a;
5265         aSig >>= 1;
5266     }
5267     q = ( bSig <= aSig );
5268     if ( q ) aSig -= bSig;
5269     expDiff -= 64;
5270     while ( 0 < expDiff ) {
5271         q = estimateDiv128To64( aSig, 0, bSig );
5272         q = ( 2 < q ) ? q - 2 : 0;
5273         aSig = - ( ( bSig>>2 ) * q );
5274         expDiff -= 62;
5275     }
5276     expDiff += 64;
5277     if ( 0 < expDiff ) {
5278         q = estimateDiv128To64( aSig, 0, bSig );
5279         q = ( 2 < q ) ? q - 2 : 0;
5280         q >>= 64 - expDiff;
5281         bSig >>= 2;
5282         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5283     }
5284     else {
5285         aSig >>= 2;
5286         bSig >>= 2;
5287     }
5288     do {
5289         alternateASig = aSig;
5290         ++q;
5291         aSig -= bSig;
5292     } while ( 0 <= (int64_t) aSig );
5293     sigMean = aSig + alternateASig;
5294     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5295         aSig = alternateASig;
5296     }
5297     zSign = ( (int64_t) aSig < 0 );
5298     if ( zSign ) aSig = - aSig;
5299     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5300
5301 }
5302
5303 /*----------------------------------------------------------------------------
5304 | Returns the binary log of the double-precision floating-point value `a'.
5305 | The operation is performed according to the IEC/IEEE Standard for Binary
5306 | Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5308 float64 float64_log2(float64 a, float_status *status)
5309 {
5310     bool aSign, zSign;
5311     int aExp;
5312     uint64_t aSig, aSig0, aSig1, zSig, i;
5313     a = float64_squash_input_denormal(a, status);
5314
5315     aSig = extractFloat64Frac( a );
5316     aExp = extractFloat64Exp( a );
5317     aSign = extractFloat64Sign( a );
5318
5319     if ( aExp == 0 ) {
5320         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5321         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5322     }
5323     if ( aSign ) {
5324         float_raise(float_flag_invalid, status);
5325         return float64_default_nan(status);
5326     }
5327     if ( aExp == 0x7FF ) {
5328         if (aSig) {
5329             return propagateFloat64NaN(a, float64_zero, status);
5330         }
5331         return a;
5332     }
5333
5334     aExp -= 0x3FF;
5335     aSig |= UINT64_C(0x0010000000000000);
5336     zSign = aExp < 0;
5337     zSig = (uint64_t)aExp << 52;
5338     for (i = 1LL << 51; i > 0; i >>= 1) {
5339         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5340         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5341         if ( aSig & UINT64_C(0x0020000000000000) ) {
5342             aSig >>= 1;
5343             zSig |= i;
5344         }
5345     }
5346
5347     if ( zSign )
5348         zSig = -zSig;
5349     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5350 }
5351
5352 /*----------------------------------------------------------------------------
5353 | Returns the result of converting the extended double-precision floating-
5354 | point value `a' to the 32-bit two's complement integer format.  The
5355 | conversion is performed according to the IEC/IEEE Standard for Binary
5356 | Floating-Point Arithmetic---which means in particular that the conversion
5357 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5358 | largest positive integer is returned.  Otherwise, if the conversion
5359 | overflows, the largest integer with the same sign as `a' is returned.
5360 *----------------------------------------------------------------------------*/
5361
5362 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5363 {
5364     bool aSign;
5365     int32_t aExp, shiftCount;
5366     uint64_t aSig;
5367
5368     if (floatx80_invalid_encoding(a)) {
5369         float_raise(float_flag_invalid, status);
5370         return 1 << 31;
5371     }
5372     aSig = extractFloatx80Frac( a );
5373     aExp = extractFloatx80Exp( a );
5374     aSign = extractFloatx80Sign( a );
5375     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5376     shiftCount = 0x4037 - aExp;
5377     if ( shiftCount <= 0 ) shiftCount = 1;
5378     shift64RightJamming( aSig, shiftCount, &aSig );
5379     return roundAndPackInt32(aSign, aSig, status);
5380
5381 }
5382
5383 /*----------------------------------------------------------------------------
5384 | Returns the result of converting the extended double-precision floating-
5385 | point value `a' to the 32-bit two's complement integer format.  The
5386 | conversion is performed according to the IEC/IEEE Standard for Binary
5387 | Floating-Point Arithmetic, except that the conversion is always rounded
5388 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5389 | Otherwise, if the conversion overflows, the largest integer with the same
5390 | sign as `a' is returned.
5391 *----------------------------------------------------------------------------*/
5392
5393 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5394 {
5395     bool aSign;
5396     int32_t aExp, shiftCount;
5397     uint64_t aSig, savedASig;
5398     int32_t z;
5399
5400     if (floatx80_invalid_encoding(a)) {
5401         float_raise(float_flag_invalid, status);
5402         return 1 << 31;
5403     }
5404     aSig = extractFloatx80Frac( a );
5405     aExp = extractFloatx80Exp( a );
5406     aSign = extractFloatx80Sign( a );
5407     if ( 0x401E < aExp ) {
5408         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5409         goto invalid;
5410     }
5411     else if ( aExp < 0x3FFF ) {
5412         if (aExp || aSig) {
5413             float_raise(float_flag_inexact, status);
5414         }
5415         return 0;
5416     }
5417     shiftCount = 0x403E - aExp;
5418     savedASig = aSig;
5419     aSig >>= shiftCount;
5420     z = aSig;
5421     if ( aSign ) z = - z;
5422     if ( ( z < 0 ) ^ aSign ) {
5423  invalid:
5424         float_raise(float_flag_invalid, status);
5425         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5426     }
5427     if ( ( aSig<<shiftCount ) != savedASig ) {
5428         float_raise(float_flag_inexact, status);
5429     }
5430     return z;
5431
5432 }
5433
5434 /*----------------------------------------------------------------------------
5435 | Returns the result of converting the extended double-precision floating-
5436 | point value `a' to the 64-bit two's complement integer format.  The
5437 | conversion is performed according to the IEC/IEEE Standard for Binary
5438 | Floating-Point Arithmetic---which means in particular that the conversion
5439 | is rounded according to the current rounding mode.  If `a' is a NaN,
5440 | the largest positive integer is returned.  Otherwise, if the conversion
5441 | overflows, the largest integer with the same sign as `a' is returned.
5442 *----------------------------------------------------------------------------*/
5443
5444 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5445 {
5446     bool aSign;
5447     int32_t aExp, shiftCount;
5448     uint64_t aSig, aSigExtra;
5449
5450     if (floatx80_invalid_encoding(a)) {
5451         float_raise(float_flag_invalid, status);
5452         return 1ULL << 63;
5453     }
5454     aSig = extractFloatx80Frac( a );
5455     aExp = extractFloatx80Exp( a );
5456     aSign = extractFloatx80Sign( a );
5457     shiftCount = 0x403E - aExp;
5458     if ( shiftCount <= 0 ) {
5459         if ( shiftCount ) {
5460             float_raise(float_flag_invalid, status);
5461             if (!aSign || floatx80_is_any_nan(a)) {
5462                 return INT64_MAX;
5463             }
5464             return INT64_MIN;
5465         }
5466         aSigExtra = 0;
5467     }
5468     else {
5469         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5470     }
5471     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5472
5473 }
5474
5475 /*----------------------------------------------------------------------------
5476 | Returns the result of converting the extended double-precision floating-
5477 | point value `a' to the 64-bit two's complement integer format.  The
5478 | conversion is performed according to the IEC/IEEE Standard for Binary
5479 | Floating-Point Arithmetic, except that the conversion is always rounded
5480 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5481 | Otherwise, if the conversion overflows, the largest integer with the same
5482 | sign as `a' is returned.
5483 *----------------------------------------------------------------------------*/
5484
5485 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5486 {
5487     bool aSign;
5488     int32_t aExp, shiftCount;
5489     uint64_t aSig;
5490     int64_t z;
5491
5492     if (floatx80_invalid_encoding(a)) {
5493         float_raise(float_flag_invalid, status);
5494         return 1ULL << 63;
5495     }
5496     aSig = extractFloatx80Frac( a );
5497     aExp = extractFloatx80Exp( a );
5498     aSign = extractFloatx80Sign( a );
5499     shiftCount = aExp - 0x403E;
5500     if ( 0 <= shiftCount ) {
5501         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5502         if ( ( a.high != 0xC03E ) || aSig ) {
5503             float_raise(float_flag_invalid, status);
5504             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5505                 return INT64_MAX;
5506             }
5507         }
5508         return INT64_MIN;
5509     }
5510     else if ( aExp < 0x3FFF ) {
5511         if (aExp | aSig) {
5512             float_raise(float_flag_inexact, status);
5513         }
5514         return 0;
5515     }
5516     z = aSig>>( - shiftCount );
5517     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5518         float_raise(float_flag_inexact, status);
5519     }
5520     if ( aSign ) z = - z;
5521     return z;
5522
5523 }
5524
5525 /*----------------------------------------------------------------------------
5526 | Returns the result of converting the extended double-precision floating-
5527 | point value `a' to the single-precision floating-point format.  The
5528 | conversion is performed according to the IEC/IEEE Standard for Binary
5529 | Floating-Point Arithmetic.
5530 *----------------------------------------------------------------------------*/
5531
5532 float32 floatx80_to_float32(floatx80 a, float_status *status)
5533 {
5534     bool aSign;
5535     int32_t aExp;
5536     uint64_t aSig;
5537
5538     if (floatx80_invalid_encoding(a)) {
5539         float_raise(float_flag_invalid, status);
5540         return float32_default_nan(status);
5541     }
5542     aSig = extractFloatx80Frac( a );
5543     aExp = extractFloatx80Exp( a );
5544     aSign = extractFloatx80Sign( a );
5545     if ( aExp == 0x7FFF ) {
5546         if ( (uint64_t) ( aSig<<1 ) ) {
5547             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5548                                              status);
5549             return float32_silence_nan(res, status);
5550         }
5551         return packFloat32( aSign, 0xFF, 0 );
5552     }
5553     shift64RightJamming( aSig, 33, &aSig );
5554     if ( aExp || aSig ) aExp -= 0x3F81;
5555     return roundAndPackFloat32(aSign, aExp, aSig, status);
5556
5557 }
5558
5559 /*----------------------------------------------------------------------------
5560 | Returns the result of converting the extended double-precision floating-
5561 | point value `a' to the double-precision floating-point format.  The
5562 | conversion is performed according to the IEC/IEEE Standard for Binary
5563 | Floating-Point Arithmetic.
5564 *----------------------------------------------------------------------------*/
5565
5566 float64 floatx80_to_float64(floatx80 a, float_status *status)
5567 {
5568     bool aSign;
5569     int32_t aExp;
5570     uint64_t aSig, zSig;
5571
5572     if (floatx80_invalid_encoding(a)) {
5573         float_raise(float_flag_invalid, status);
5574         return float64_default_nan(status);
5575     }
5576     aSig = extractFloatx80Frac( a );
5577     aExp = extractFloatx80Exp( a );
5578     aSign = extractFloatx80Sign( a );
5579     if ( aExp == 0x7FFF ) {
5580         if ( (uint64_t) ( aSig<<1 ) ) {
5581             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5582                                              status);
5583             return float64_silence_nan(res, status);
5584         }
5585         return packFloat64( aSign, 0x7FF, 0 );
5586     }
5587     shift64RightJamming( aSig, 1, &zSig );
5588     if ( aExp || aSig ) aExp -= 0x3C01;
5589     return roundAndPackFloat64(aSign, aExp, zSig, status);
5590
5591 }
5592
5593 /*----------------------------------------------------------------------------
5594 | Returns the result of converting the extended double-precision floating-
5595 | point value `a' to the quadruple-precision floating-point format.  The
5596 | conversion is performed according to the IEC/IEEE Standard for Binary
5597 | Floating-Point Arithmetic.
5598 *----------------------------------------------------------------------------*/
5599
5600 float128 floatx80_to_float128(floatx80 a, float_status *status)
5601 {
5602     bool aSign;
5603     int aExp;
5604     uint64_t aSig, zSig0, zSig1;
5605
5606     if (floatx80_invalid_encoding(a)) {
5607         float_raise(float_flag_invalid, status);
5608         return float128_default_nan(status);
5609     }
5610     aSig = extractFloatx80Frac( a );
5611     aExp = extractFloatx80Exp( a );
5612     aSign = extractFloatx80Sign( a );
5613     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5614         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5615                                            status);
5616         return float128_silence_nan(res, status);
5617     }
5618     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5619     return packFloat128( aSign, aExp, zSig0, zSig1 );
5620
5621 }
5622
5623 /*----------------------------------------------------------------------------
5624 | Rounds the extended double-precision floating-point value `a'
5625 | to the precision provided by floatx80_rounding_precision and returns the
5626 | result as an extended double-precision floating-point value.
5627 | The operation is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic.
5629 *----------------------------------------------------------------------------*/
5630
5631 floatx80 floatx80_round(floatx80 a, float_status *status)
5632 {
5633     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5634                                 extractFloatx80Sign(a),
5635                                 extractFloatx80Exp(a),
5636                                 extractFloatx80Frac(a), 0, status);
5637 }
5638
5639 /*----------------------------------------------------------------------------
5640 | Rounds the extended double-precision floating-point value `a' to an integer,
5641 | and returns the result as an extended quadruple-precision floating-point
5642 | value.  The operation is performed according to the IEC/IEEE Standard for
5643 | Binary Floating-Point Arithmetic.
5644 *----------------------------------------------------------------------------*/
5645
5646 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5647 {
5648     bool aSign;
5649     int32_t aExp;
5650     uint64_t lastBitMask, roundBitsMask;
5651     floatx80 z;
5652
5653     if (floatx80_invalid_encoding(a)) {
5654         float_raise(float_flag_invalid, status);
5655         return floatx80_default_nan(status);
5656     }
5657     aExp = extractFloatx80Exp( a );
5658     if ( 0x403E <= aExp ) {
5659         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5660             return propagateFloatx80NaN(a, a, status);
5661         }
5662         return a;
5663     }
5664     if ( aExp < 0x3FFF ) {
5665         if (    ( aExp == 0 )
5666              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5667             return a;
5668         }
5669         float_raise(float_flag_inexact, status);
5670         aSign = extractFloatx80Sign( a );
5671         switch (status->float_rounding_mode) {
5672          case float_round_nearest_even:
5673             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5674                ) {
5675                 return
5676                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5677             }
5678             break;
5679         case float_round_ties_away:
5680             if (aExp == 0x3FFE) {
5681                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5682             }
5683             break;
5684          case float_round_down:
5685             return
5686                   aSign ?
5687                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5688                 : packFloatx80( 0, 0, 0 );
5689          case float_round_up:
5690             return
5691                   aSign ? packFloatx80( 1, 0, 0 )
5692                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5693
5694         case float_round_to_zero:
5695             break;
5696         default:
5697             g_assert_not_reached();
5698         }
5699         return packFloatx80( aSign, 0, 0 );
5700     }
5701     lastBitMask = 1;
5702     lastBitMask <<= 0x403E - aExp;
5703     roundBitsMask = lastBitMask - 1;
5704     z = a;
5705     switch (status->float_rounding_mode) {
5706     case float_round_nearest_even:
5707         z.low += lastBitMask>>1;
5708         if ((z.low & roundBitsMask) == 0) {
5709             z.low &= ~lastBitMask;
5710         }
5711         break;
5712     case float_round_ties_away:
5713         z.low += lastBitMask >> 1;
5714         break;
5715     case float_round_to_zero:
5716         break;
5717     case float_round_up:
5718         if (!extractFloatx80Sign(z)) {
5719             z.low += roundBitsMask;
5720         }
5721         break;
5722     case float_round_down:
5723         if (extractFloatx80Sign(z)) {
5724             z.low += roundBitsMask;
5725         }
5726         break;
5727     default:
5728         abort();
5729     }
5730     z.low &= ~ roundBitsMask;
5731     if ( z.low == 0 ) {
5732         ++z.high;
5733         z.low = UINT64_C(0x8000000000000000);
5734     }
5735     if (z.low != a.low) {
5736         float_raise(float_flag_inexact, status);
5737     }
5738     return z;
5739
5740 }
5741
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of adding the absolute values of the extended double-
5744 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5745 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5746 | The addition is performed according to the IEC/IEEE Standard for Binary
5747 | Floating-Point Arithmetic.
5748 *----------------------------------------------------------------------------*/
5749
5750 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5751                                 float_status *status)
5752 {
5753     int32_t aExp, bExp, zExp;
5754     uint64_t aSig, bSig, zSig0, zSig1;
5755     int32_t expDiff;
5756
5757     aSig = extractFloatx80Frac( a );
5758     aExp = extractFloatx80Exp( a );
5759     bSig = extractFloatx80Frac( b );
5760     bExp = extractFloatx80Exp( b );
5761     expDiff = aExp - bExp;
5762     if ( 0 < expDiff ) {
5763         if ( aExp == 0x7FFF ) {
5764             if ((uint64_t)(aSig << 1)) {
5765                 return propagateFloatx80NaN(a, b, status);
5766             }
5767             return a;
5768         }
5769         if ( bExp == 0 ) --expDiff;
5770         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5771         zExp = aExp;
5772     }
5773     else if ( expDiff < 0 ) {
5774         if ( bExp == 0x7FFF ) {
5775             if ((uint64_t)(bSig << 1)) {
5776                 return propagateFloatx80NaN(a, b, status);
5777             }
5778             return packFloatx80(zSign,
5779                                 floatx80_infinity_high,
5780                                 floatx80_infinity_low);
5781         }
5782         if ( aExp == 0 ) ++expDiff;
5783         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5784         zExp = bExp;
5785     }
5786     else {
5787         if ( aExp == 0x7FFF ) {
5788             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5789                 return propagateFloatx80NaN(a, b, status);
5790             }
5791             return a;
5792         }
5793         zSig1 = 0;
5794         zSig0 = aSig + bSig;
5795         if ( aExp == 0 ) {
5796             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5797                 /* At least one of the values is a pseudo-denormal,
5798                  * and there is a carry out of the result.  */
5799                 zExp = 1;
5800                 goto shiftRight1;
5801             }
5802             if (zSig0 == 0) {
5803                 return packFloatx80(zSign, 0, 0);
5804             }
5805             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5806             goto roundAndPack;
5807         }
5808         zExp = aExp;
5809         goto shiftRight1;
5810     }
5811     zSig0 = aSig + bSig;
5812     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5813  shiftRight1:
5814     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5815     zSig0 |= UINT64_C(0x8000000000000000);
5816     ++zExp;
5817  roundAndPack:
5818     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5819                                 zSign, zExp, zSig0, zSig1, status);
5820 }
5821
5822 /*----------------------------------------------------------------------------
5823 | Returns the result of subtracting the absolute values of the extended
5824 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5825 | difference is negated before being returned.  `zSign' is ignored if the
5826 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5827 | Standard for Binary Floating-Point Arithmetic.
5828 *----------------------------------------------------------------------------*/
5829
5830 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5831                                 float_status *status)
5832 {
5833     int32_t aExp, bExp, zExp;
5834     uint64_t aSig, bSig, zSig0, zSig1;
5835     int32_t expDiff;
5836
5837     aSig = extractFloatx80Frac( a );
5838     aExp = extractFloatx80Exp( a );
5839     bSig = extractFloatx80Frac( b );
5840     bExp = extractFloatx80Exp( b );
5841     expDiff = aExp - bExp;
5842     if ( 0 < expDiff ) goto aExpBigger;
5843     if ( expDiff < 0 ) goto bExpBigger;
5844     if ( aExp == 0x7FFF ) {
5845         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5846             return propagateFloatx80NaN(a, b, status);
5847         }
5848         float_raise(float_flag_invalid, status);
5849         return floatx80_default_nan(status);
5850     }
5851     if ( aExp == 0 ) {
5852         aExp = 1;
5853         bExp = 1;
5854     }
5855     zSig1 = 0;
5856     if ( bSig < aSig ) goto aBigger;
5857     if ( aSig < bSig ) goto bBigger;
5858     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5859  bExpBigger:
5860     if ( bExp == 0x7FFF ) {
5861         if ((uint64_t)(bSig << 1)) {
5862             return propagateFloatx80NaN(a, b, status);
5863         }
5864         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5865                             floatx80_infinity_low);
5866     }
5867     if ( aExp == 0 ) ++expDiff;
5868     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5869  bBigger:
5870     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5871     zExp = bExp;
5872     zSign ^= 1;
5873     goto normalizeRoundAndPack;
5874  aExpBigger:
5875     if ( aExp == 0x7FFF ) {
5876         if ((uint64_t)(aSig << 1)) {
5877             return propagateFloatx80NaN(a, b, status);
5878         }
5879         return a;
5880     }
5881     if ( bExp == 0 ) --expDiff;
5882     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5883  aBigger:
5884     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5885     zExp = aExp;
5886  normalizeRoundAndPack:
5887     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5888                                          zSign, zExp, zSig0, zSig1, status);
5889 }
5890
5891 /*----------------------------------------------------------------------------
5892 | Returns the result of adding the extended double-precision floating-point
5893 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5894 | Standard for Binary Floating-Point Arithmetic.
5895 *----------------------------------------------------------------------------*/
5896
5897 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5898 {
5899     bool aSign, bSign;
5900
5901     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5902         float_raise(float_flag_invalid, status);
5903         return floatx80_default_nan(status);
5904     }
5905     aSign = extractFloatx80Sign( a );
5906     bSign = extractFloatx80Sign( b );
5907     if ( aSign == bSign ) {
5908         return addFloatx80Sigs(a, b, aSign, status);
5909     }
5910     else {
5911         return subFloatx80Sigs(a, b, aSign, status);
5912     }
5913
5914 }
5915
5916 /*----------------------------------------------------------------------------
5917 | Returns the result of subtracting the extended double-precision floating-
5918 | point values `a' and `b'.  The operation is performed according to the
5919 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5920 *----------------------------------------------------------------------------*/
5921
5922 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5923 {
5924     bool aSign, bSign;
5925
5926     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5927         float_raise(float_flag_invalid, status);
5928         return floatx80_default_nan(status);
5929     }
5930     aSign = extractFloatx80Sign( a );
5931     bSign = extractFloatx80Sign( b );
5932     if ( aSign == bSign ) {
5933         return subFloatx80Sigs(a, b, aSign, status);
5934     }
5935     else {
5936         return addFloatx80Sigs(a, b, aSign, status);
5937     }
5938
5939 }
5940
5941 /*----------------------------------------------------------------------------
5942 | Returns the result of multiplying the extended double-precision floating-
5943 | point values `a' and `b'.  The operation is performed according to the
5944 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5945 *----------------------------------------------------------------------------*/
5946
5947 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5948 {
5949     bool aSign, bSign, zSign;
5950     int32_t aExp, bExp, zExp;
5951     uint64_t aSig, bSig, zSig0, zSig1;
5952
5953     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5954         float_raise(float_flag_invalid, status);
5955         return floatx80_default_nan(status);
5956     }
5957     aSig = extractFloatx80Frac( a );
5958     aExp = extractFloatx80Exp( a );
5959     aSign = extractFloatx80Sign( a );
5960     bSig = extractFloatx80Frac( b );
5961     bExp = extractFloatx80Exp( b );
5962     bSign = extractFloatx80Sign( b );
5963     zSign = aSign ^ bSign;
5964     if ( aExp == 0x7FFF ) {
5965         if (    (uint64_t) ( aSig<<1 )
5966              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5967             return propagateFloatx80NaN(a, b, status);
5968         }
5969         if ( ( bExp | bSig ) == 0 ) goto invalid;
5970         return packFloatx80(zSign, floatx80_infinity_high,
5971                                    floatx80_infinity_low);
5972     }
5973     if ( bExp == 0x7FFF ) {
5974         if ((uint64_t)(bSig << 1)) {
5975             return propagateFloatx80NaN(a, b, status);
5976         }
5977         if ( ( aExp | aSig ) == 0 ) {
5978  invalid:
5979             float_raise(float_flag_invalid, status);
5980             return floatx80_default_nan(status);
5981         }
5982         return packFloatx80(zSign, floatx80_infinity_high,
5983                                    floatx80_infinity_low);
5984     }
5985     if ( aExp == 0 ) {
5986         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5987         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5988     }
5989     if ( bExp == 0 ) {
5990         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5991         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5992     }
5993     zExp = aExp + bExp - 0x3FFE;
5994     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5995     if ( 0 < (int64_t) zSig0 ) {
5996         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5997         --zExp;
5998     }
5999     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6000                                 zSign, zExp, zSig0, zSig1, status);
6001 }
6002
6003 /*----------------------------------------------------------------------------
6004 | Returns the result of dividing the extended double-precision floating-point
6005 | value `a' by the corresponding value `b'.  The operation is performed
6006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6007 *----------------------------------------------------------------------------*/
6008
6009 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6010 {
6011     bool aSign, bSign, zSign;
6012     int32_t aExp, bExp, zExp;
6013     uint64_t aSig, bSig, zSig0, zSig1;
6014     uint64_t rem0, rem1, rem2, term0, term1, term2;
6015
6016     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6017         float_raise(float_flag_invalid, status);
6018         return floatx80_default_nan(status);
6019     }
6020     aSig = extractFloatx80Frac( a );
6021     aExp = extractFloatx80Exp( a );
6022     aSign = extractFloatx80Sign( a );
6023     bSig = extractFloatx80Frac( b );
6024     bExp = extractFloatx80Exp( b );
6025     bSign = extractFloatx80Sign( b );
6026     zSign = aSign ^ bSign;
6027     if ( aExp == 0x7FFF ) {
6028         if ((uint64_t)(aSig << 1)) {
6029             return propagateFloatx80NaN(a, b, status);
6030         }
6031         if ( bExp == 0x7FFF ) {
6032             if ((uint64_t)(bSig << 1)) {
6033                 return propagateFloatx80NaN(a, b, status);
6034             }
6035             goto invalid;
6036         }
6037         return packFloatx80(zSign, floatx80_infinity_high,
6038                                    floatx80_infinity_low);
6039     }
6040     if ( bExp == 0x7FFF ) {
6041         if ((uint64_t)(bSig << 1)) {
6042             return propagateFloatx80NaN(a, b, status);
6043         }
6044         return packFloatx80( zSign, 0, 0 );
6045     }
6046     if ( bExp == 0 ) {
6047         if ( bSig == 0 ) {
6048             if ( ( aExp | aSig ) == 0 ) {
6049  invalid:
6050                 float_raise(float_flag_invalid, status);
6051                 return floatx80_default_nan(status);
6052             }
6053             float_raise(float_flag_divbyzero, status);
6054             return packFloatx80(zSign, floatx80_infinity_high,
6055                                        floatx80_infinity_low);
6056         }
6057         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6058     }
6059     if ( aExp == 0 ) {
6060         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6061         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6062     }
6063     zExp = aExp - bExp + 0x3FFE;
6064     rem1 = 0;
6065     if ( bSig <= aSig ) {
6066         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6067         ++zExp;
6068     }
6069     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6070     mul64To128( bSig, zSig0, &term0, &term1 );
6071     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6072     while ( (int64_t) rem0 < 0 ) {
6073         --zSig0;
6074         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6075     }
6076     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6077     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6078         mul64To128( bSig, zSig1, &term1, &term2 );
6079         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6080         while ( (int64_t) rem1 < 0 ) {
6081             --zSig1;
6082             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6083         }
6084         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6085     }
6086     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6087                                 zSign, zExp, zSig0, zSig1, status);
6088 }
6089
6090 /*----------------------------------------------------------------------------
6091 | Returns the remainder of the extended double-precision floating-point value
6092 | `a' with respect to the corresponding value `b'.  The operation is performed
6093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6094 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6095 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6096 | the absolute value of the integer quotient.
6097 *----------------------------------------------------------------------------*/
6098
6099 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6100                          float_status *status)
6101 {
6102     bool aSign, zSign;
6103     int32_t aExp, bExp, expDiff, aExpOrig;
6104     uint64_t aSig0, aSig1, bSig;
6105     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6106
6107     *quotient = 0;
6108     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6109         float_raise(float_flag_invalid, status);
6110         return floatx80_default_nan(status);
6111     }
6112     aSig0 = extractFloatx80Frac( a );
6113     aExpOrig = aExp = extractFloatx80Exp( a );
6114     aSign = extractFloatx80Sign( a );
6115     bSig = extractFloatx80Frac( b );
6116     bExp = extractFloatx80Exp( b );
6117     if ( aExp == 0x7FFF ) {
6118         if (    (uint64_t) ( aSig0<<1 )
6119              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6120             return propagateFloatx80NaN(a, b, status);
6121         }
6122         goto invalid;
6123     }
6124     if ( bExp == 0x7FFF ) {
6125         if ((uint64_t)(bSig << 1)) {
6126             return propagateFloatx80NaN(a, b, status);
6127         }
6128         if (aExp == 0 && aSig0 >> 63) {
6129             /*
6130              * Pseudo-denormal argument must be returned in normalized
6131              * form.
6132              */
6133             return packFloatx80(aSign, 1, aSig0);
6134         }
6135         return a;
6136     }
6137     if ( bExp == 0 ) {
6138         if ( bSig == 0 ) {
6139  invalid:
6140             float_raise(float_flag_invalid, status);
6141             return floatx80_default_nan(status);
6142         }
6143         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6144     }
6145     if ( aExp == 0 ) {
6146         if ( aSig0 == 0 ) return a;
6147         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6148     }
6149     zSign = aSign;
6150     expDiff = aExp - bExp;
6151     aSig1 = 0;
6152     if ( expDiff < 0 ) {
6153         if ( mod || expDiff < -1 ) {
6154             if (aExp == 1 && aExpOrig == 0) {
6155                 /*
6156                  * Pseudo-denormal argument must be returned in
6157                  * normalized form.
6158                  */
6159                 return packFloatx80(aSign, aExp, aSig0);
6160             }
6161             return a;
6162         }
6163         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6164         expDiff = 0;
6165     }
6166     *quotient = q = ( bSig <= aSig0 );
6167     if ( q ) aSig0 -= bSig;
6168     expDiff -= 64;
6169     while ( 0 < expDiff ) {
6170         q = estimateDiv128To64( aSig0, aSig1, bSig );
6171         q = ( 2 < q ) ? q - 2 : 0;
6172         mul64To128( bSig, q, &term0, &term1 );
6173         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6174         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6175         expDiff -= 62;
6176         *quotient <<= 62;
6177         *quotient += q;
6178     }
6179     expDiff += 64;
6180     if ( 0 < expDiff ) {
6181         q = estimateDiv128To64( aSig0, aSig1, bSig );
6182         q = ( 2 < q ) ? q - 2 : 0;
6183         q >>= 64 - expDiff;
6184         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6185         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6186         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6187         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6188             ++q;
6189             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6190         }
6191         if (expDiff < 64) {
6192             *quotient <<= expDiff;
6193         } else {
6194             *quotient = 0;
6195         }
6196         *quotient += q;
6197     }
6198     else {
6199         term1 = 0;
6200         term0 = bSig;
6201     }
6202     if (!mod) {
6203         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6204         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6205                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6206                         && ( q & 1 ) )
6207             ) {
6208             aSig0 = alternateASig0;
6209             aSig1 = alternateASig1;
6210             zSign = ! zSign;
6211             ++*quotient;
6212         }
6213     }
6214     return
6215         normalizeRoundAndPackFloatx80(
6216             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6217
6218 }
6219
6220 /*----------------------------------------------------------------------------
6221 | Returns the remainder of the extended double-precision floating-point value
6222 | `a' with respect to the corresponding value `b'.  The operation is performed
6223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6225
6226 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6227 {
6228     uint64_t quotient;
6229     return floatx80_modrem(a, b, false, &quotient, status);
6230 }
6231
6232 /*----------------------------------------------------------------------------
6233 | Returns the remainder of the extended double-precision floating-point value
6234 | `a' with respect to the corresponding value `b', with the quotient truncated
6235 | toward zero.
6236 *----------------------------------------------------------------------------*/
6237
6238 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6239 {
6240     uint64_t quotient;
6241     return floatx80_modrem(a, b, true, &quotient, status);
6242 }
6243
6244 /*----------------------------------------------------------------------------
6245 | Returns the square root of the extended double-precision floating-point
6246 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6247 | for Binary Floating-Point Arithmetic.
6248 *----------------------------------------------------------------------------*/
6249
6250 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6251 {
6252     bool aSign;
6253     int32_t aExp, zExp;
6254     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6255     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6256
6257     if (floatx80_invalid_encoding(a)) {
6258         float_raise(float_flag_invalid, status);
6259         return floatx80_default_nan(status);
6260     }
6261     aSig0 = extractFloatx80Frac( a );
6262     aExp = extractFloatx80Exp( a );
6263     aSign = extractFloatx80Sign( a );
6264     if ( aExp == 0x7FFF ) {
6265         if ((uint64_t)(aSig0 << 1)) {
6266             return propagateFloatx80NaN(a, a, status);
6267         }
6268         if ( ! aSign ) return a;
6269         goto invalid;
6270     }
6271     if ( aSign ) {
6272         if ( ( aExp | aSig0 ) == 0 ) return a;
6273  invalid:
6274         float_raise(float_flag_invalid, status);
6275         return floatx80_default_nan(status);
6276     }
6277     if ( aExp == 0 ) {
6278         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6279         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6280     }
6281     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6282     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6283     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6284     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6285     doubleZSig0 = zSig0<<1;
6286     mul64To128( zSig0, zSig0, &term0, &term1 );
6287     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6288     while ( (int64_t) rem0 < 0 ) {
6289         --zSig0;
6290         doubleZSig0 -= 2;
6291         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6292     }
6293     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6294     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6295         if ( zSig1 == 0 ) zSig1 = 1;
6296         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6297         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6298         mul64To128( zSig1, zSig1, &term2, &term3 );
6299         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6300         while ( (int64_t) rem1 < 0 ) {
6301             --zSig1;
6302             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6303             term3 |= 1;
6304             term2 |= doubleZSig0;
6305             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6306         }
6307         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6308     }
6309     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6310     zSig0 |= doubleZSig0;
6311     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6312                                 0, zExp, zSig0, zSig1, status);
6313 }
6314
6315 /*----------------------------------------------------------------------------
6316 | Returns the result of converting the quadruple-precision floating-point
6317 | value `a' to the 32-bit two's complement integer format.  The conversion
6318 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6319 | Arithmetic---which means in particular that the conversion is rounded
6320 | according to the current rounding mode.  If `a' is a NaN, the largest
6321 | positive integer is returned.  Otherwise, if the conversion overflows, the
6322 | largest integer with the same sign as `a' is returned.
6323 *----------------------------------------------------------------------------*/
6324
6325 int32_t float128_to_int32(float128 a, float_status *status)
6326 {
6327     bool aSign;
6328     int32_t aExp, shiftCount;
6329     uint64_t aSig0, aSig1;
6330
6331     aSig1 = extractFloat128Frac1( a );
6332     aSig0 = extractFloat128Frac0( a );
6333     aExp = extractFloat128Exp( a );
6334     aSign = extractFloat128Sign( a );
6335     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6336     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6337     aSig0 |= ( aSig1 != 0 );
6338     shiftCount = 0x4028 - aExp;
6339     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6340     return roundAndPackInt32(aSign, aSig0, status);
6341
6342 }
6343
6344 /*----------------------------------------------------------------------------
6345 | Returns the result of converting the quadruple-precision floating-point
6346 | value `a' to the 32-bit two's complement integer format.  The conversion
6347 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6348 | Arithmetic, except that the conversion is always rounded toward zero.  If
6349 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6350 | conversion overflows, the largest integer with the same sign as `a' is
6351 | returned.
6352 *----------------------------------------------------------------------------*/
6353
6354 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6355 {
6356     bool aSign;
6357     int32_t aExp, shiftCount;
6358     uint64_t aSig0, aSig1, savedASig;
6359     int32_t z;
6360
6361     aSig1 = extractFloat128Frac1( a );
6362     aSig0 = extractFloat128Frac0( a );
6363     aExp = extractFloat128Exp( a );
6364     aSign = extractFloat128Sign( a );
6365     aSig0 |= ( aSig1 != 0 );
6366     if ( 0x401E < aExp ) {
6367         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6368         goto invalid;
6369     }
6370     else if ( aExp < 0x3FFF ) {
6371         if (aExp || aSig0) {
6372             float_raise(float_flag_inexact, status);
6373         }
6374         return 0;
6375     }
6376     aSig0 |= UINT64_C(0x0001000000000000);
6377     shiftCount = 0x402F - aExp;
6378     savedASig = aSig0;
6379     aSig0 >>= shiftCount;
6380     z = aSig0;
6381     if ( aSign ) z = - z;
6382     if ( ( z < 0 ) ^ aSign ) {
6383  invalid:
6384         float_raise(float_flag_invalid, status);
6385         return aSign ? INT32_MIN : INT32_MAX;
6386     }
6387     if ( ( aSig0<<shiftCount ) != savedASig ) {
6388         float_raise(float_flag_inexact, status);
6389     }
6390     return z;
6391
6392 }
6393
6394 /*----------------------------------------------------------------------------
6395 | Returns the result of converting the quadruple-precision floating-point
6396 | value `a' to the 64-bit two's complement integer format.  The conversion
6397 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6398 | Arithmetic---which means in particular that the conversion is rounded
6399 | according to the current rounding mode.  If `a' is a NaN, the largest
6400 | positive integer is returned.  Otherwise, if the conversion overflows, the
6401 | largest integer with the same sign as `a' is returned.
6402 *----------------------------------------------------------------------------*/
6403
6404 int64_t float128_to_int64(float128 a, float_status *status)
6405 {
6406     bool aSign;
6407     int32_t aExp, shiftCount;
6408     uint64_t aSig0, aSig1;
6409
6410     aSig1 = extractFloat128Frac1( a );
6411     aSig0 = extractFloat128Frac0( a );
6412     aExp = extractFloat128Exp( a );
6413     aSign = extractFloat128Sign( a );
6414     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6415     shiftCount = 0x402F - aExp;
6416     if ( shiftCount <= 0 ) {
6417         if ( 0x403E < aExp ) {
6418             float_raise(float_flag_invalid, status);
6419             if (    ! aSign
6420                  || (    ( aExp == 0x7FFF )
6421                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6422                     )
6423                ) {
6424                 return INT64_MAX;
6425             }
6426             return INT64_MIN;
6427         }
6428         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6429     }
6430     else {
6431         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6432     }
6433     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6434
6435 }
6436
6437 /*----------------------------------------------------------------------------
6438 | Returns the result of converting the quadruple-precision floating-point
6439 | value `a' to the 64-bit two's complement integer format.  The conversion
6440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6441 | Arithmetic, except that the conversion is always rounded toward zero.
6442 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6443 | the conversion overflows, the largest integer with the same sign as `a' is
6444 | returned.
6445 *----------------------------------------------------------------------------*/
6446
6447 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6448 {
6449     bool aSign;
6450     int32_t aExp, shiftCount;
6451     uint64_t aSig0, aSig1;
6452     int64_t z;
6453
6454     aSig1 = extractFloat128Frac1( a );
6455     aSig0 = extractFloat128Frac0( a );
6456     aExp = extractFloat128Exp( a );
6457     aSign = extractFloat128Sign( a );
6458     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6459     shiftCount = aExp - 0x402F;
6460     if ( 0 < shiftCount ) {
6461         if ( 0x403E <= aExp ) {
6462             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6463             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6464                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6465                 if (aSig1) {
6466                     float_raise(float_flag_inexact, status);
6467                 }
6468             }
6469             else {
6470                 float_raise(float_flag_invalid, status);
6471                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6472                     return INT64_MAX;
6473                 }
6474             }
6475             return INT64_MIN;
6476         }
6477         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6478         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6479             float_raise(float_flag_inexact, status);
6480         }
6481     }
6482     else {
6483         if ( aExp < 0x3FFF ) {
6484             if ( aExp | aSig0 | aSig1 ) {
6485                 float_raise(float_flag_inexact, status);
6486             }
6487             return 0;
6488         }
6489         z = aSig0>>( - shiftCount );
6490         if (    aSig1
6491              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6492             float_raise(float_flag_inexact, status);
6493         }
6494     }
6495     if ( aSign ) z = - z;
6496     return z;
6497
6498 }
6499
6500 /*----------------------------------------------------------------------------
6501 | Returns the result of converting the quadruple-precision floating-point value
6502 | `a' to the 64-bit unsigned integer format.  The conversion is
6503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6504 | Arithmetic---which means in particular that the conversion is rounded
6505 | according to the current rounding mode.  If `a' is a NaN, the largest
6506 | positive integer is returned.  If the conversion overflows, the
6507 | largest unsigned integer is returned.  If 'a' is negative, the value is
6508 | rounded and zero is returned; negative values that do not round to zero
6509 | will raise the inexact exception.
6510 *----------------------------------------------------------------------------*/
6511
6512 uint64_t float128_to_uint64(float128 a, float_status *status)
6513 {
6514     bool aSign;
6515     int aExp;
6516     int shiftCount;
6517     uint64_t aSig0, aSig1;
6518
6519     aSig0 = extractFloat128Frac0(a);
6520     aSig1 = extractFloat128Frac1(a);
6521     aExp = extractFloat128Exp(a);
6522     aSign = extractFloat128Sign(a);
6523     if (aSign && (aExp > 0x3FFE)) {
6524         float_raise(float_flag_invalid, status);
6525         if (float128_is_any_nan(a)) {
6526             return UINT64_MAX;
6527         } else {
6528             return 0;
6529         }
6530     }
6531     if (aExp) {
6532         aSig0 |= UINT64_C(0x0001000000000000);
6533     }
6534     shiftCount = 0x402F - aExp;
6535     if (shiftCount <= 0) {
6536         if (0x403E < aExp) {
6537             float_raise(float_flag_invalid, status);
6538             return UINT64_MAX;
6539         }
6540         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6541     } else {
6542         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6543     }
6544     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6545 }
6546
6547 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6548 {
6549     uint64_t v;
6550     signed char current_rounding_mode = status->float_rounding_mode;
6551
6552     set_float_rounding_mode(float_round_to_zero, status);
6553     v = float128_to_uint64(a, status);
6554     set_float_rounding_mode(current_rounding_mode, status);
6555
6556     return v;
6557 }
6558
6559 /*----------------------------------------------------------------------------
6560 | Returns the result of converting the quadruple-precision floating-point
6561 | value `a' to the 32-bit unsigned integer format.  The conversion
6562 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6563 | Arithmetic except that the conversion is always rounded toward zero.
6564 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6565 | if the conversion overflows, the largest unsigned integer is returned.
6566 | If 'a' is negative, the value is rounded and zero is returned; negative
6567 | values that do not round to zero will raise the inexact exception.
6568 *----------------------------------------------------------------------------*/
6569
6570 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6571 {
6572     uint64_t v;
6573     uint32_t res;
6574     int old_exc_flags = get_float_exception_flags(status);
6575
6576     v = float128_to_uint64_round_to_zero(a, status);
6577     if (v > 0xffffffff) {
6578         res = 0xffffffff;
6579     } else {
6580         return v;
6581     }
6582     set_float_exception_flags(old_exc_flags, status);
6583     float_raise(float_flag_invalid, status);
6584     return res;
6585 }
6586
6587 /*----------------------------------------------------------------------------
6588 | Returns the result of converting the quadruple-precision floating-point value
6589 | `a' to the 32-bit unsigned integer format.  The conversion is
6590 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6591 | Arithmetic---which means in particular that the conversion is rounded
6592 | according to the current rounding mode.  If `a' is a NaN, the largest
6593 | positive integer is returned.  If the conversion overflows, the
6594 | largest unsigned integer is returned.  If 'a' is negative, the value is
6595 | rounded and zero is returned; negative values that do not round to zero
6596 | will raise the inexact exception.
6597 *----------------------------------------------------------------------------*/
6598
6599 uint32_t float128_to_uint32(float128 a, float_status *status)
6600 {
6601     uint64_t v;
6602     uint32_t res;
6603     int old_exc_flags = get_float_exception_flags(status);
6604
6605     v = float128_to_uint64(a, status);
6606     if (v > 0xffffffff) {
6607         res = 0xffffffff;
6608     } else {
6609         return v;
6610     }
6611     set_float_exception_flags(old_exc_flags, status);
6612     float_raise(float_flag_invalid, status);
6613     return res;
6614 }
6615
6616 /*----------------------------------------------------------------------------
6617 | Returns the result of converting the quadruple-precision floating-point
6618 | value `a' to the single-precision floating-point format.  The conversion
6619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6620 | Arithmetic.
6621 *----------------------------------------------------------------------------*/
6622
6623 float32 float128_to_float32(float128 a, float_status *status)
6624 {
6625     bool aSign;
6626     int32_t aExp;
6627     uint64_t aSig0, aSig1;
6628     uint32_t zSig;
6629
6630     aSig1 = extractFloat128Frac1( a );
6631     aSig0 = extractFloat128Frac0( a );
6632     aExp = extractFloat128Exp( a );
6633     aSign = extractFloat128Sign( a );
6634     if ( aExp == 0x7FFF ) {
6635         if ( aSig0 | aSig1 ) {
6636             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6637         }
6638         return packFloat32( aSign, 0xFF, 0 );
6639     }
6640     aSig0 |= ( aSig1 != 0 );
6641     shift64RightJamming( aSig0, 18, &aSig0 );
6642     zSig = aSig0;
6643     if ( aExp || zSig ) {
6644         zSig |= 0x40000000;
6645         aExp -= 0x3F81;
6646     }
6647     return roundAndPackFloat32(aSign, aExp, zSig, status);
6648
6649 }
6650
6651 /*----------------------------------------------------------------------------
6652 | Returns the result of converting the quadruple-precision floating-point
6653 | value `a' to the double-precision floating-point format.  The conversion
6654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6655 | Arithmetic.
6656 *----------------------------------------------------------------------------*/
6657
6658 float64 float128_to_float64(float128 a, float_status *status)
6659 {
6660     bool aSign;
6661     int32_t aExp;
6662     uint64_t aSig0, aSig1;
6663
6664     aSig1 = extractFloat128Frac1( a );
6665     aSig0 = extractFloat128Frac0( a );
6666     aExp = extractFloat128Exp( a );
6667     aSign = extractFloat128Sign( a );
6668     if ( aExp == 0x7FFF ) {
6669         if ( aSig0 | aSig1 ) {
6670             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6671         }
6672         return packFloat64( aSign, 0x7FF, 0 );
6673     }
6674     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6675     aSig0 |= ( aSig1 != 0 );
6676     if ( aExp || aSig0 ) {
6677         aSig0 |= UINT64_C(0x4000000000000000);
6678         aExp -= 0x3C01;
6679     }
6680     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6681
6682 }
6683
6684 /*----------------------------------------------------------------------------
6685 | Returns the result of converting the quadruple-precision floating-point
6686 | value `a' to the extended double-precision floating-point format.  The
6687 | conversion is performed according to the IEC/IEEE Standard for Binary
6688 | Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6690
6691 floatx80 float128_to_floatx80(float128 a, float_status *status)
6692 {
6693     bool aSign;
6694     int32_t aExp;
6695     uint64_t aSig0, aSig1;
6696
6697     aSig1 = extractFloat128Frac1( a );
6698     aSig0 = extractFloat128Frac0( a );
6699     aExp = extractFloat128Exp( a );
6700     aSign = extractFloat128Sign( a );
6701     if ( aExp == 0x7FFF ) {
6702         if ( aSig0 | aSig1 ) {
6703             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6704                                                status);
6705             return floatx80_silence_nan(res, status);
6706         }
6707         return packFloatx80(aSign, floatx80_infinity_high,
6708                                    floatx80_infinity_low);
6709     }
6710     if ( aExp == 0 ) {
6711         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6712         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6713     }
6714     else {
6715         aSig0 |= UINT64_C(0x0001000000000000);
6716     }
6717     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6718     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6719
6720 }
6721
6722 /*----------------------------------------------------------------------------
6723 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6724 | returns the result as a quadruple-precision floating-point value.  The
6725 | operation is performed according to the IEC/IEEE Standard for Binary
6726 | Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6728
6729 float128 float128_round_to_int(float128 a, float_status *status)
6730 {
6731     bool aSign;
6732     int32_t aExp;
6733     uint64_t lastBitMask, roundBitsMask;
6734     float128 z;
6735
6736     aExp = extractFloat128Exp( a );
6737     if ( 0x402F <= aExp ) {
6738         if ( 0x406F <= aExp ) {
6739             if (    ( aExp == 0x7FFF )
6740                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6741                ) {
6742                 return propagateFloat128NaN(a, a, status);
6743             }
6744             return a;
6745         }
6746         lastBitMask = 1;
6747         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6748         roundBitsMask = lastBitMask - 1;
6749         z = a;
6750         switch (status->float_rounding_mode) {
6751         case float_round_nearest_even:
6752             if ( lastBitMask ) {
6753                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6754                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6755             }
6756             else {
6757                 if ( (int64_t) z.low < 0 ) {
6758                     ++z.high;
6759                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6760                 }
6761             }
6762             break;
6763         case float_round_ties_away:
6764             if (lastBitMask) {
6765                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6766             } else {
6767                 if ((int64_t) z.low < 0) {
6768                     ++z.high;
6769                 }
6770             }
6771             break;
6772         case float_round_to_zero:
6773             break;
6774         case float_round_up:
6775             if (!extractFloat128Sign(z)) {
6776                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6777             }
6778             break;
6779         case float_round_down:
6780             if (extractFloat128Sign(z)) {
6781                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6782             }
6783             break;
6784         case float_round_to_odd:
6785             /*
6786              * Note that if lastBitMask == 0, the last bit is the lsb
6787              * of high, and roundBitsMask == -1.
6788              */
6789             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6790                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6791             }
6792             break;
6793         default:
6794             abort();
6795         }
6796         z.low &= ~ roundBitsMask;
6797     }
6798     else {
6799         if ( aExp < 0x3FFF ) {
6800             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6801             float_raise(float_flag_inexact, status);
6802             aSign = extractFloat128Sign( a );
6803             switch (status->float_rounding_mode) {
6804             case float_round_nearest_even:
6805                 if (    ( aExp == 0x3FFE )
6806                      && (   extractFloat128Frac0( a )
6807                           | extractFloat128Frac1( a ) )
6808                    ) {
6809                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6810                 }
6811                 break;
6812             case float_round_ties_away:
6813                 if (aExp == 0x3FFE) {
6814                     return packFloat128(aSign, 0x3FFF, 0, 0);
6815                 }
6816                 break;
6817             case float_round_down:
6818                 return
6819                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6820                     : packFloat128( 0, 0, 0, 0 );
6821             case float_round_up:
6822                 return
6823                       aSign ? packFloat128( 1, 0, 0, 0 )
6824                     : packFloat128( 0, 0x3FFF, 0, 0 );
6825
6826             case float_round_to_odd:
6827                 return packFloat128(aSign, 0x3FFF, 0, 0);
6828
6829             case float_round_to_zero:
6830                 break;
6831             }
6832             return packFloat128( aSign, 0, 0, 0 );
6833         }
6834         lastBitMask = 1;
6835         lastBitMask <<= 0x402F - aExp;
6836         roundBitsMask = lastBitMask - 1;
6837         z.low = 0;
6838         z.high = a.high;
6839         switch (status->float_rounding_mode) {
6840         case float_round_nearest_even:
6841             z.high += lastBitMask>>1;
6842             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6843                 z.high &= ~ lastBitMask;
6844             }
6845             break;
6846         case float_round_ties_away:
6847             z.high += lastBitMask>>1;
6848             break;
6849         case float_round_to_zero:
6850             break;
6851         case float_round_up:
6852             if (!extractFloat128Sign(z)) {
6853                 z.high |= ( a.low != 0 );
6854                 z.high += roundBitsMask;
6855             }
6856             break;
6857         case float_round_down:
6858             if (extractFloat128Sign(z)) {
6859                 z.high |= (a.low != 0);
6860                 z.high += roundBitsMask;
6861             }
6862             break;
6863         case float_round_to_odd:
6864             if ((z.high & lastBitMask) == 0) {
6865                 z.high |= (a.low != 0);
6866                 z.high += roundBitsMask;
6867             }
6868             break;
6869         default:
6870             abort();
6871         }
6872         z.high &= ~ roundBitsMask;
6873     }
6874     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6875         float_raise(float_flag_inexact, status);
6876     }
6877     return z;
6878
6879 }
6880
6881 /*----------------------------------------------------------------------------
6882 | Returns the result of adding the absolute values of the quadruple-precision
6883 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6884 | before being returned.  `zSign' is ignored if the result is a NaN.
6885 | The addition is performed according to the IEC/IEEE Standard for Binary
6886 | Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6888
6889 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6890                                 float_status *status)
6891 {
6892     int32_t aExp, bExp, zExp;
6893     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6894     int32_t expDiff;
6895
6896     aSig1 = extractFloat128Frac1( a );
6897     aSig0 = extractFloat128Frac0( a );
6898     aExp = extractFloat128Exp( a );
6899     bSig1 = extractFloat128Frac1( b );
6900     bSig0 = extractFloat128Frac0( b );
6901     bExp = extractFloat128Exp( b );
6902     expDiff = aExp - bExp;
6903     if ( 0 < expDiff ) {
6904         if ( aExp == 0x7FFF ) {
6905             if (aSig0 | aSig1) {
6906                 return propagateFloat128NaN(a, b, status);
6907             }
6908             return a;
6909         }
6910         if ( bExp == 0 ) {
6911             --expDiff;
6912         }
6913         else {
6914             bSig0 |= UINT64_C(0x0001000000000000);
6915         }
6916         shift128ExtraRightJamming(
6917             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6918         zExp = aExp;
6919     }
6920     else if ( expDiff < 0 ) {
6921         if ( bExp == 0x7FFF ) {
6922             if (bSig0 | bSig1) {
6923                 return propagateFloat128NaN(a, b, status);
6924             }
6925             return packFloat128( zSign, 0x7FFF, 0, 0 );
6926         }
6927         if ( aExp == 0 ) {
6928             ++expDiff;
6929         }
6930         else {
6931             aSig0 |= UINT64_C(0x0001000000000000);
6932         }
6933         shift128ExtraRightJamming(
6934             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6935         zExp = bExp;
6936     }
6937     else {
6938         if ( aExp == 0x7FFF ) {
6939             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6940                 return propagateFloat128NaN(a, b, status);
6941             }
6942             return a;
6943         }
6944         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6945         if ( aExp == 0 ) {
6946             if (status->flush_to_zero) {
6947                 if (zSig0 | zSig1) {
6948                     float_raise(float_flag_output_denormal, status);
6949                 }
6950                 return packFloat128(zSign, 0, 0, 0);
6951             }
6952             return packFloat128( zSign, 0, zSig0, zSig1 );
6953         }
6954         zSig2 = 0;
6955         zSig0 |= UINT64_C(0x0002000000000000);
6956         zExp = aExp;
6957         goto shiftRight1;
6958     }
6959     aSig0 |= UINT64_C(0x0001000000000000);
6960     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6961     --zExp;
6962     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6963     ++zExp;
6964  shiftRight1:
6965     shift128ExtraRightJamming(
6966         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6967  roundAndPack:
6968     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6969
6970 }
6971
6972 /*----------------------------------------------------------------------------
6973 | Returns the result of subtracting the absolute values of the quadruple-
6974 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6975 | difference is negated before being returned.  `zSign' is ignored if the
6976 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6977 | Standard for Binary Floating-Point Arithmetic.
6978 *----------------------------------------------------------------------------*/
6979
6980 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
6981                                 float_status *status)
6982 {
6983     int32_t aExp, bExp, zExp;
6984     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6985     int32_t expDiff;
6986
6987     aSig1 = extractFloat128Frac1( a );
6988     aSig0 = extractFloat128Frac0( a );
6989     aExp = extractFloat128Exp( a );
6990     bSig1 = extractFloat128Frac1( b );
6991     bSig0 = extractFloat128Frac0( b );
6992     bExp = extractFloat128Exp( b );
6993     expDiff = aExp - bExp;
6994     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6995     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6996     if ( 0 < expDiff ) goto aExpBigger;
6997     if ( expDiff < 0 ) goto bExpBigger;
6998     if ( aExp == 0x7FFF ) {
6999         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7000             return propagateFloat128NaN(a, b, status);
7001         }
7002         float_raise(float_flag_invalid, status);
7003         return float128_default_nan(status);
7004     }
7005     if ( aExp == 0 ) {
7006         aExp = 1;
7007         bExp = 1;
7008     }
7009     if ( bSig0 < aSig0 ) goto aBigger;
7010     if ( aSig0 < bSig0 ) goto bBigger;
7011     if ( bSig1 < aSig1 ) goto aBigger;
7012     if ( aSig1 < bSig1 ) goto bBigger;
7013     return packFloat128(status->float_rounding_mode == float_round_down,
7014                         0, 0, 0);
7015  bExpBigger:
7016     if ( bExp == 0x7FFF ) {
7017         if (bSig0 | bSig1) {
7018             return propagateFloat128NaN(a, b, status);
7019         }
7020         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7021     }
7022     if ( aExp == 0 ) {
7023         ++expDiff;
7024     }
7025     else {
7026         aSig0 |= UINT64_C(0x4000000000000000);
7027     }
7028     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7029     bSig0 |= UINT64_C(0x4000000000000000);
7030  bBigger:
7031     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7032     zExp = bExp;
7033     zSign ^= 1;
7034     goto normalizeRoundAndPack;
7035  aExpBigger:
7036     if ( aExp == 0x7FFF ) {
7037         if (aSig0 | aSig1) {
7038             return propagateFloat128NaN(a, b, status);
7039         }
7040         return a;
7041     }
7042     if ( bExp == 0 ) {
7043         --expDiff;
7044     }
7045     else {
7046         bSig0 |= UINT64_C(0x4000000000000000);
7047     }
7048     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7049     aSig0 |= UINT64_C(0x4000000000000000);
7050  aBigger:
7051     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7052     zExp = aExp;
7053  normalizeRoundAndPack:
7054     --zExp;
7055     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7056                                          status);
7057
7058 }
7059
7060 /*----------------------------------------------------------------------------
7061 | Returns the result of adding the quadruple-precision floating-point values
7062 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7063 | for Binary Floating-Point Arithmetic.
7064 *----------------------------------------------------------------------------*/
7065
7066 float128 float128_add(float128 a, float128 b, float_status *status)
7067 {
7068     bool aSign, bSign;
7069
7070     aSign = extractFloat128Sign( a );
7071     bSign = extractFloat128Sign( b );
7072     if ( aSign == bSign ) {
7073         return addFloat128Sigs(a, b, aSign, status);
7074     }
7075     else {
7076         return subFloat128Sigs(a, b, aSign, status);
7077     }
7078
7079 }
7080
7081 /*----------------------------------------------------------------------------
7082 | Returns the result of subtracting the quadruple-precision floating-point
7083 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7084 | Standard for Binary Floating-Point Arithmetic.
7085 *----------------------------------------------------------------------------*/
7086
7087 float128 float128_sub(float128 a, float128 b, float_status *status)
7088 {
7089     bool aSign, bSign;
7090
7091     aSign = extractFloat128Sign( a );
7092     bSign = extractFloat128Sign( b );
7093     if ( aSign == bSign ) {
7094         return subFloat128Sigs(a, b, aSign, status);
7095     }
7096     else {
7097         return addFloat128Sigs(a, b, aSign, status);
7098     }
7099
7100 }
7101
7102 /*----------------------------------------------------------------------------
7103 | Returns the result of multiplying the quadruple-precision floating-point
7104 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7105 | Standard for Binary Floating-Point Arithmetic.
7106 *----------------------------------------------------------------------------*/
7107
7108 float128 float128_mul(float128 a, float128 b, float_status *status)
7109 {
7110     bool aSign, bSign, zSign;
7111     int32_t aExp, bExp, zExp;
7112     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7113
7114     aSig1 = extractFloat128Frac1( a );
7115     aSig0 = extractFloat128Frac0( a );
7116     aExp = extractFloat128Exp( a );
7117     aSign = extractFloat128Sign( a );
7118     bSig1 = extractFloat128Frac1( b );
7119     bSig0 = extractFloat128Frac0( b );
7120     bExp = extractFloat128Exp( b );
7121     bSign = extractFloat128Sign( b );
7122     zSign = aSign ^ bSign;
7123     if ( aExp == 0x7FFF ) {
7124         if (    ( aSig0 | aSig1 )
7125              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7126             return propagateFloat128NaN(a, b, status);
7127         }
7128         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7129         return packFloat128( zSign, 0x7FFF, 0, 0 );
7130     }
7131     if ( bExp == 0x7FFF ) {
7132         if (bSig0 | bSig1) {
7133             return propagateFloat128NaN(a, b, status);
7134         }
7135         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7136  invalid:
7137             float_raise(float_flag_invalid, status);
7138             return float128_default_nan(status);
7139         }
7140         return packFloat128( zSign, 0x7FFF, 0, 0 );
7141     }
7142     if ( aExp == 0 ) {
7143         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7144         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7145     }
7146     if ( bExp == 0 ) {
7147         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7148         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7149     }
7150     zExp = aExp + bExp - 0x4000;
7151     aSig0 |= UINT64_C(0x0001000000000000);
7152     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7153     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7154     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7155     zSig2 |= ( zSig3 != 0 );
7156     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7157         shift128ExtraRightJamming(
7158             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7159         ++zExp;
7160     }
7161     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7162
7163 }
7164
7165 /*----------------------------------------------------------------------------
7166 | Returns the result of dividing the quadruple-precision floating-point value
7167 | `a' by the corresponding value `b'.  The operation is performed according to
7168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7169 *----------------------------------------------------------------------------*/
7170
7171 float128 float128_div(float128 a, float128 b, float_status *status)
7172 {
7173     bool aSign, bSign, zSign;
7174     int32_t aExp, bExp, zExp;
7175     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7176     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7177
7178     aSig1 = extractFloat128Frac1( a );
7179     aSig0 = extractFloat128Frac0( a );
7180     aExp = extractFloat128Exp( a );
7181     aSign = extractFloat128Sign( a );
7182     bSig1 = extractFloat128Frac1( b );
7183     bSig0 = extractFloat128Frac0( b );
7184     bExp = extractFloat128Exp( b );
7185     bSign = extractFloat128Sign( b );
7186     zSign = aSign ^ bSign;
7187     if ( aExp == 0x7FFF ) {
7188         if (aSig0 | aSig1) {
7189             return propagateFloat128NaN(a, b, status);
7190         }
7191         if ( bExp == 0x7FFF ) {
7192             if (bSig0 | bSig1) {
7193                 return propagateFloat128NaN(a, b, status);
7194             }
7195             goto invalid;
7196         }
7197         return packFloat128( zSign, 0x7FFF, 0, 0 );
7198     }
7199     if ( bExp == 0x7FFF ) {
7200         if (bSig0 | bSig1) {
7201             return propagateFloat128NaN(a, b, status);
7202         }
7203         return packFloat128( zSign, 0, 0, 0 );
7204     }
7205     if ( bExp == 0 ) {
7206         if ( ( bSig0 | bSig1 ) == 0 ) {
7207             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7208  invalid:
7209                 float_raise(float_flag_invalid, status);
7210                 return float128_default_nan(status);
7211             }
7212             float_raise(float_flag_divbyzero, status);
7213             return packFloat128( zSign, 0x7FFF, 0, 0 );
7214         }
7215         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7216     }
7217     if ( aExp == 0 ) {
7218         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7219         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7220     }
7221     zExp = aExp - bExp + 0x3FFD;
7222     shortShift128Left(
7223         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7224     shortShift128Left(
7225         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7226     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7227         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7228         ++zExp;
7229     }
7230     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7231     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7232     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7233     while ( (int64_t) rem0 < 0 ) {
7234         --zSig0;
7235         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7236     }
7237     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7238     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7239         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7240         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7241         while ( (int64_t) rem1 < 0 ) {
7242             --zSig1;
7243             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7244         }
7245         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7246     }
7247     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7248     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7249
7250 }
7251
7252 /*----------------------------------------------------------------------------
7253 | Returns the remainder of the quadruple-precision floating-point value `a'
7254 | with respect to the corresponding value `b'.  The operation is performed
7255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7256 *----------------------------------------------------------------------------*/
7257
7258 float128 float128_rem(float128 a, float128 b, float_status *status)
7259 {
7260     bool aSign, zSign;
7261     int32_t aExp, bExp, expDiff;
7262     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7263     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7264     int64_t sigMean0;
7265
7266     aSig1 = extractFloat128Frac1( a );
7267     aSig0 = extractFloat128Frac0( a );
7268     aExp = extractFloat128Exp( a );
7269     aSign = extractFloat128Sign( a );
7270     bSig1 = extractFloat128Frac1( b );
7271     bSig0 = extractFloat128Frac0( b );
7272     bExp = extractFloat128Exp( b );
7273     if ( aExp == 0x7FFF ) {
7274         if (    ( aSig0 | aSig1 )
7275              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7276             return propagateFloat128NaN(a, b, status);
7277         }
7278         goto invalid;
7279     }
7280     if ( bExp == 0x7FFF ) {
7281         if (bSig0 | bSig1) {
7282             return propagateFloat128NaN(a, b, status);
7283         }
7284         return a;
7285     }
7286     if ( bExp == 0 ) {
7287         if ( ( bSig0 | bSig1 ) == 0 ) {
7288  invalid:
7289             float_raise(float_flag_invalid, status);
7290             return float128_default_nan(status);
7291         }
7292         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7293     }
7294     if ( aExp == 0 ) {
7295         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7296         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7297     }
7298     expDiff = aExp - bExp;
7299     if ( expDiff < -1 ) return a;
7300     shortShift128Left(
7301         aSig0 | UINT64_C(0x0001000000000000),
7302         aSig1,
7303         15 - ( expDiff < 0 ),
7304         &aSig0,
7305         &aSig1
7306     );
7307     shortShift128Left(
7308         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7309     q = le128( bSig0, bSig1, aSig0, aSig1 );
7310     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7311     expDiff -= 64;
7312     while ( 0 < expDiff ) {
7313         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7314         q = ( 4 < q ) ? q - 4 : 0;
7315         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7316         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7317         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7318         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7319         expDiff -= 61;
7320     }
7321     if ( -64 < expDiff ) {
7322         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7323         q = ( 4 < q ) ? q - 4 : 0;
7324         q >>= - expDiff;
7325         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7326         expDiff += 52;
7327         if ( expDiff < 0 ) {
7328             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7329         }
7330         else {
7331             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7332         }
7333         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7334         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7335     }
7336     else {
7337         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7338         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7339     }
7340     do {
7341         alternateASig0 = aSig0;
7342         alternateASig1 = aSig1;
7343         ++q;
7344         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7345     } while ( 0 <= (int64_t) aSig0 );
7346     add128(
7347         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7348     if (    ( sigMean0 < 0 )
7349          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7350         aSig0 = alternateASig0;
7351         aSig1 = alternateASig1;
7352     }
7353     zSign = ( (int64_t) aSig0 < 0 );
7354     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7355     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7356                                          status);
7357 }
7358
7359 /*----------------------------------------------------------------------------
7360 | Returns the square root of the quadruple-precision floating-point value `a'.
7361 | The operation is performed according to the IEC/IEEE Standard for Binary
7362 | Floating-Point Arithmetic.
7363 *----------------------------------------------------------------------------*/
7364
7365 float128 float128_sqrt(float128 a, float_status *status)
7366 {
7367     bool aSign;
7368     int32_t aExp, zExp;
7369     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7370     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7371
7372     aSig1 = extractFloat128Frac1( a );
7373     aSig0 = extractFloat128Frac0( a );
7374     aExp = extractFloat128Exp( a );
7375     aSign = extractFloat128Sign( a );
7376     if ( aExp == 0x7FFF ) {
7377         if (aSig0 | aSig1) {
7378             return propagateFloat128NaN(a, a, status);
7379         }
7380         if ( ! aSign ) return a;
7381         goto invalid;
7382     }
7383     if ( aSign ) {
7384         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7385  invalid:
7386         float_raise(float_flag_invalid, status);
7387         return float128_default_nan(status);
7388     }
7389     if ( aExp == 0 ) {
7390         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7391         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7392     }
7393     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7394     aSig0 |= UINT64_C(0x0001000000000000);
7395     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7396     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7397     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7398     doubleZSig0 = zSig0<<1;
7399     mul64To128( zSig0, zSig0, &term0, &term1 );
7400     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7401     while ( (int64_t) rem0 < 0 ) {
7402         --zSig0;
7403         doubleZSig0 -= 2;
7404         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7405     }
7406     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7407     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7408         if ( zSig1 == 0 ) zSig1 = 1;
7409         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7410         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7411         mul64To128( zSig1, zSig1, &term2, &term3 );
7412         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7413         while ( (int64_t) rem1 < 0 ) {
7414             --zSig1;
7415             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7416             term3 |= 1;
7417             term2 |= doubleZSig0;
7418             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7419         }
7420         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7421     }
7422     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7423     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7424
7425 }
7426
7427 static inline FloatRelation
7428 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7429                           float_status *status)
7430 {
7431     bool aSign, bSign;
7432
7433     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7434         float_raise(float_flag_invalid, status);
7435         return float_relation_unordered;
7436     }
7437     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7438           ( extractFloatx80Frac( a )<<1 ) ) ||
7439         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7440           ( extractFloatx80Frac( b )<<1 ) )) {
7441         if (!is_quiet ||
7442             floatx80_is_signaling_nan(a, status) ||
7443             floatx80_is_signaling_nan(b, status)) {
7444             float_raise(float_flag_invalid, status);
7445         }
7446         return float_relation_unordered;
7447     }
7448     aSign = extractFloatx80Sign( a );
7449     bSign = extractFloatx80Sign( b );
7450     if ( aSign != bSign ) {
7451
7452         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7453              ( ( a.low | b.low ) == 0 ) ) {
7454             /* zero case */
7455             return float_relation_equal;
7456         } else {
7457             return 1 - (2 * aSign);
7458         }
7459     } else {
7460         /* Normalize pseudo-denormals before comparison.  */
7461         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7462             ++a.high;
7463         }
7464         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7465             ++b.high;
7466         }
7467         if (a.low == b.low && a.high == b.high) {
7468             return float_relation_equal;
7469         } else {
7470             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7471         }
7472     }
7473 }
7474
7475 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7476 {
7477     return floatx80_compare_internal(a, b, 0, status);
7478 }
7479
7480 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7481                                      float_status *status)
7482 {
7483     return floatx80_compare_internal(a, b, 1, status);
7484 }
7485
7486 static inline FloatRelation
7487 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7488                           float_status *status)
7489 {
7490     bool aSign, bSign;
7491
7492     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7493           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7494         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7495           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7496         if (!is_quiet ||
7497             float128_is_signaling_nan(a, status) ||
7498             float128_is_signaling_nan(b, status)) {
7499             float_raise(float_flag_invalid, status);
7500         }
7501         return float_relation_unordered;
7502     }
7503     aSign = extractFloat128Sign( a );
7504     bSign = extractFloat128Sign( b );
7505     if ( aSign != bSign ) {
7506         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7507             /* zero case */
7508             return float_relation_equal;
7509         } else {
7510             return 1 - (2 * aSign);
7511         }
7512     } else {
7513         if (a.low == b.low && a.high == b.high) {
7514             return float_relation_equal;
7515         } else {
7516             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7517         }
7518     }
7519 }
7520
7521 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7522 {
7523     return float128_compare_internal(a, b, 0, status);
7524 }
7525
7526 FloatRelation float128_compare_quiet(float128 a, float128 b,
7527                                      float_status *status)
7528 {
7529     return float128_compare_internal(a, b, 1, status);
7530 }
7531
7532 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7533 {
7534     bool aSign;
7535     int32_t aExp;
7536     uint64_t aSig;
7537
7538     if (floatx80_invalid_encoding(a)) {
7539         float_raise(float_flag_invalid, status);
7540         return floatx80_default_nan(status);
7541     }
7542     aSig = extractFloatx80Frac( a );
7543     aExp = extractFloatx80Exp( a );
7544     aSign = extractFloatx80Sign( a );
7545
7546     if ( aExp == 0x7FFF ) {
7547         if ( aSig<<1 ) {
7548             return propagateFloatx80NaN(a, a, status);
7549         }
7550         return a;
7551     }
7552
7553     if (aExp == 0) {
7554         if (aSig == 0) {
7555             return a;
7556         }
7557         aExp++;
7558     }
7559
7560     if (n > 0x10000) {
7561         n = 0x10000;
7562     } else if (n < -0x10000) {
7563         n = -0x10000;
7564     }
7565
7566     aExp += n;
7567     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7568                                          aSign, aExp, aSig, 0, status);
7569 }
7570
7571 float128 float128_scalbn(float128 a, int n, float_status *status)
7572 {
7573     bool aSign;
7574     int32_t aExp;
7575     uint64_t aSig0, aSig1;
7576
7577     aSig1 = extractFloat128Frac1( a );
7578     aSig0 = extractFloat128Frac0( a );
7579     aExp = extractFloat128Exp( a );
7580     aSign = extractFloat128Sign( a );
7581     if ( aExp == 0x7FFF ) {
7582         if ( aSig0 | aSig1 ) {
7583             return propagateFloat128NaN(a, a, status);
7584         }
7585         return a;
7586     }
7587     if (aExp != 0) {
7588         aSig0 |= UINT64_C(0x0001000000000000);
7589     } else if (aSig0 == 0 && aSig1 == 0) {
7590         return a;
7591     } else {
7592         aExp++;
7593     }
7594
7595     if (n > 0x10000) {
7596         n = 0x10000;
7597     } else if (n < -0x10000) {
7598         n = -0x10000;
7599     }
7600
7601     aExp += n - 1;
7602     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7603                                          , status);
7604
7605 }
7606
7607 static void __attribute__((constructor)) softfloat_init(void)
7608 {
7609     union_float64 ua, ub, uc, ur;
7610
7611     if (QEMU_NO_HARDFLOAT) {
7612         return;
7613     }
7614     /*
7615      * Test that the host's FMA is not obviously broken. For example,
7616      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7617      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7618      */
7619     ua.s = 0x0020000000000001ULL;
7620     ub.s = 0x3ca0000000000000ULL;
7621     uc.s = 0x0020000000000000ULL;
7622     ur.h = fma(ua.h, ub.h, uc.h);
7623     if (ur.s != 0x0020000000000001ULL) {
7624         force_soft_fma = true;
7625     }
7626 }