fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             float_raise(float_flag_input_denormal, s);                  \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 static inline float32
 343 float32_gen2(float32 xa, float32 xb, float_status *s,
 344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 345              f32_check_fn pre, f32_check_fn post)
 346 {
 347     union_float32 ua, ub, ur;
 348
 349     ua.s = xa;
 350     ub.s = xb;
 351
 352     if (unlikely(!can_use_fpu(s))) {
 353         goto soft;
 354     }
 355
 356     float32_input_flush2(&ua.s, &ub.s, s);
 357     if (unlikely(!pre(ua, ub))) {
 358         goto soft;
 359     }
 360
 361     ur.h = hard(ua.h, ub.h);
 362     if (unlikely(f32_is_inf(ur))) {
 363         float_raise(float_flag_overflow, s);
 364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
 365         goto soft;
 366     }
 367     return ur.s;
 368
 369  soft:
 370     return soft(ua.s, ub.s, s);
 371 }
 372
 373 static inline float64
 374 float64_gen2(float64 xa, float64 xb, float_status *s,
 375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 376              f64_check_fn pre, f64_check_fn post)
 377 {
 378     union_float64 ua, ub, ur;
 379
 380     ua.s = xa;
 381     ub.s = xb;
 382
 383     if (unlikely(!can_use_fpu(s))) {
 384         goto soft;
 385     }
 386
 387     float64_input_flush2(&ua.s, &ub.s, s);
 388     if (unlikely(!pre(ua, ub))) {
 389         goto soft;
 390     }
 391
 392     ur.h = hard(ua.h, ub.h);
 393     if (unlikely(f64_is_inf(ur))) {
 394         float_raise(float_flag_overflow, s);
 395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
 396         goto soft;
 397     }
 398     return ur.s;
 399
 400  soft:
 401     return soft(ua.s, ub.s, s);
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the fraction bits of the single-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline uint32_t extractFloat32Frac(float32 a)
 409 {
 410     return float32_val(a) & 0x007FFFFF;
 411 }
 412
 413 /*----------------------------------------------------------------------------
 414 | Returns the exponent bits of the single-precision floating-point value `a'.
 415 *----------------------------------------------------------------------------*/
 416
 417 static inline int extractFloat32Exp(float32 a)
 418 {
 419     return (float32_val(a) >> 23) & 0xFF;
 420 }
 421
 422 /*----------------------------------------------------------------------------
 423 | Returns the sign bit of the single-precision floating-point value `a'.
 424 *----------------------------------------------------------------------------*/
 425
 426 static inline bool extractFloat32Sign(float32 a)
 427 {
 428     return float32_val(a) >> 31;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the fraction bits of the double-precision floating-point value `a'.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline uint64_t extractFloat64Frac(float64 a)
 436 {
 437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 438 }
 439
 440 /*----------------------------------------------------------------------------
 441 | Returns the exponent bits of the double-precision floating-point value `a'.
 442 *----------------------------------------------------------------------------*/
 443
 444 static inline int extractFloat64Exp(float64 a)
 445 {
 446     return (float64_val(a) >> 52) & 0x7FF;
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Returns the sign bit of the double-precision floating-point value `a'.
 451 *----------------------------------------------------------------------------*/
 452
 453 static inline bool extractFloat64Sign(float64 a)
 454 {
 455     return float64_val(a) >> 63;
 456 }
 457
 458 /*
 459  * Classify a floating point number. Everything above float_class_qnan
 460  * is a NaN so cls >= float_class_qnan is any NaN.
 461  */
 462
 463 typedef enum __attribute__ ((__packed__)) {
 464     float_class_unclassified,
 465     float_class_zero,
 466     float_class_normal,
 467     float_class_inf,
 468     float_class_qnan,  /* all NaNs from here */
 469     float_class_snan,
 470 } FloatClass;
 471
 472 #define float_cmask(bit)  (1u << (bit))
 473
 474 enum {
 475     float_cmask_zero    = float_cmask(float_class_zero),
 476     float_cmask_normal  = float_cmask(float_class_normal),
 477     float_cmask_inf     = float_cmask(float_class_inf),
 478     float_cmask_qnan    = float_cmask(float_class_qnan),
 479     float_cmask_snan    = float_cmask(float_class_snan),
 480
 481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
 482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
 483 };
 484
 485
 486 /* Simple helpers for checking if, or what kind of, NaN we have */
 487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 488 {
 489     return unlikely(c >= float_class_qnan);
 490 }
 491
 492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 493 {
 494     return c == float_class_snan;
 495 }
 496
 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 498 {
 499     return c == float_class_qnan;
 500 }
 501
 502 /*
 503  * Structure holding all of the decomposed parts of a float. The
 504  * exponent is unbiased and the fraction is normalized. All
 505  * calculations are done with a 64 bit fraction and then rounded as
 506  * appropriate for the final format.
 507  *
 508  * Thanks to the packed FloatClass a decent compiler should be able to
 509  * fit the whole structure into registers and avoid using the stack
 510  * for parameter passing.
 511  */
 512
 513 typedef struct {
 514     uint64_t frac;
 515     int32_t  exp;
 516     FloatClass cls;
 517     bool sign;
 518 } FloatParts64;
 519
 520 #define DECOMPOSED_BINARY_POINT    63
 521 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 522
 523 /* Structure holding all of the relevant parameters for a format.
 524  *   exp_size: the size of the exponent field
 525  *   exp_bias: the offset applied to the exponent field
 526  *   exp_max: the maximum normalised exponent
 527  *   frac_size: the size of the fraction field
 528  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 529  * The following are computed based the size of fraction
 530  *   frac_lsb: least significant bit of fraction
 531  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 532  *   round_mask/roundeven_mask: masks used for rounding
 533  * The following optional modifiers are available:
 534  *   arm_althp: handle ARM Alternative Half Precision
 535  */
 536 typedef struct {
 537     int exp_size;
 538     int exp_bias;
 539     int exp_max;
 540     int frac_size;
 541     int frac_shift;
 542     uint64_t frac_lsb;
 543     uint64_t frac_lsbm1;
 544     uint64_t round_mask;
 545     uint64_t roundeven_mask;
 546     bool arm_althp;
 547 } FloatFmt;
 548
 549 /* Expand fields based on the size of exponent and fraction */
 550 #define FLOAT_PARAMS(E, F)                                           \
 551     .exp_size       = E,                                             \
 552     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 553     .exp_max        = (1 << E) - 1,                                  \
 554     .frac_size      = F,                                             \
 555     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 556     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 557     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 558     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 559     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 560
 561 static const FloatFmt float16_params = {
 562     FLOAT_PARAMS(5, 10)
 563 };
 564
 565 static const FloatFmt float16_params_ahp = {
 566     FLOAT_PARAMS(5, 10),
 567     .arm_althp = true
 568 };
 569
 570 static const FloatFmt bfloat16_params = {
 571     FLOAT_PARAMS(8, 7)
 572 };
 573
 574 static const FloatFmt float32_params = {
 575     FLOAT_PARAMS(8, 23)
 576 };
 577
 578 static const FloatFmt float64_params = {
 579     FLOAT_PARAMS(11, 52)
 580 };
 581
 582 /* Unpack a float to parts, but do not canonicalize.  */
 583 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
 584 {
 585     const int f_size = fmt->frac_size;
 586     const int e_size = fmt->exp_size;
 587
 588     *r = (FloatParts64) {
 589         .cls = float_class_unclassified,
 590         .sign = extract64(raw, f_size + e_size, 1),
 591         .exp = extract64(raw, f_size, e_size),
 592         .frac = extract64(raw, 0, f_size)
 593     };
 594 }
 595
 596 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
 597 {
 598     unpack_raw64(p, &float16_params, f);
 599 }
 600
 601 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
 602 {
 603     unpack_raw64(p, &bfloat16_params, f);
 604 }
 605
 606 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
 607 {
 608     unpack_raw64(p, &float32_params, f);
 609 }
 610
 611 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
 612 {
 613     unpack_raw64(p, &float64_params, f);
 614 }
 615
 616 /* Pack a float from parts, but do not canonicalize.  */
 617 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
 618 {
 619     const int f_size = fmt->frac_size;
 620     const int e_size = fmt->exp_size;
 621     uint64_t ret;
 622
 623     ret = (uint64_t)p->sign << (f_size + e_size);
 624     ret = deposit64(ret, f_size, e_size, p->exp);
 625     ret = deposit64(ret, 0, f_size, p->frac);
 626     return ret;
 627 }
 628
 629 static inline float16 float16_pack_raw(FloatParts64 p)
 630 {
 631     return make_float16(pack_raw64(&p, &float16_params));
 632 }
 633
 634 static inline bfloat16 bfloat16_pack_raw(FloatParts64 p)
 635 {
 636     return pack_raw64(&p, &bfloat16_params);
 637 }
 638
 639 static inline float32 float32_pack_raw(FloatParts64 p)
 640 {
 641     return make_float32(pack_raw64(&p, &float32_params));
 642 }
 643
 644 static inline float64 float64_pack_raw(FloatParts64 p)
 645 {
 646     return make_float64(pack_raw64(&p, &float64_params));
 647 }
 648
 649 /*----------------------------------------------------------------------------
 650 | Functions and definitions to determine:  (1) whether tininess for underflow
 651 | is detected before or after rounding by default, (2) what (if anything)
 652 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 653 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 654 | are propagated from function inputs to output.  These details are target-
 655 | specific.
 656 *----------------------------------------------------------------------------*/
 657 #include "softfloat-specialize.c.inc"
 658
 659 #define parts_default_nan  parts64_default_nan
 660
 661 /* Canonicalize EXP and FRAC, setting CLS.  */
 662 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
 663                                   float_status *status)
 664 {
 665     if (part.exp == parm->exp_max && !parm->arm_althp) {
 666         if (part.frac == 0) {
 667             part.cls = float_class_inf;
 668         } else {
 669             part.frac <<= parm->frac_shift;
 670             part.cls = (parts_is_snan_frac(part.frac, status)
 671                         ? float_class_snan : float_class_qnan);
 672         }
 673     } else if (part.exp == 0) {
 674         if (likely(part.frac == 0)) {
 675             part.cls = float_class_zero;
 676         } else if (status->flush_inputs_to_zero) {
 677             float_raise(float_flag_input_denormal, status);
 678             part.cls = float_class_zero;
 679             part.frac = 0;
 680         } else {
 681             int shift = clz64(part.frac);
 682             part.cls = float_class_normal;
 683             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 684             part.frac <<= shift;
 685         }
 686     } else {
 687         part.cls = float_class_normal;
 688         part.exp -= parm->exp_bias;
 689         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 690     }
 691     return part;
 692 }
 693
 694 /* Round and uncanonicalize a floating-point number by parts. There
 695  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 696  * fraction; these bits will be removed. The exponent will be biased
 697  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 698  */
 699
 700 static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
 701                                   const FloatFmt *parm)
 702 {
 703     const uint64_t frac_lsb = parm->frac_lsb;
 704     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 705     const uint64_t round_mask = parm->round_mask;
 706     const uint64_t roundeven_mask = parm->roundeven_mask;
 707     const int exp_max = parm->exp_max;
 708     const int frac_shift = parm->frac_shift;
 709     uint64_t frac, inc;
 710     int exp, flags = 0;
 711     bool overflow_norm;
 712
 713     frac = p.frac;
 714     exp = p.exp;
 715
 716     switch (p.cls) {
 717     case float_class_normal:
 718         switch (s->float_rounding_mode) {
 719         case float_round_nearest_even:
 720             overflow_norm = false;
 721             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 722             break;
 723         case float_round_ties_away:
 724             overflow_norm = false;
 725             inc = frac_lsbm1;
 726             break;
 727         case float_round_to_zero:
 728             overflow_norm = true;
 729             inc = 0;
 730             break;
 731         case float_round_up:
 732             inc = p.sign ? 0 : round_mask;
 733             overflow_norm = p.sign;
 734             break;
 735         case float_round_down:
 736             inc = p.sign ? round_mask : 0;
 737             overflow_norm = !p.sign;
 738             break;
 739         case float_round_to_odd:
 740             overflow_norm = true;
 741             inc = frac & frac_lsb ? 0 : round_mask;
 742             break;
 743         default:
 744             g_assert_not_reached();
 745         }
 746
 747         exp += parm->exp_bias;
 748         if (likely(exp > 0)) {
 749             if (frac & round_mask) {
 750                 flags |= float_flag_inexact;
 751                 if (uadd64_overflow(frac, inc, &frac)) {
 752                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
 753                     exp++;
 754                 }
 755             }
 756             frac >>= frac_shift;
 757
 758             if (parm->arm_althp) {
 759                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 760                 if (unlikely(exp > exp_max)) {
 761                     /* Overflow.  Return the maximum normal.  */
 762                     flags = float_flag_invalid;
 763                     exp = exp_max;
 764                     frac = -1;
 765                 }
 766             } else if (unlikely(exp >= exp_max)) {
 767                 flags |= float_flag_overflow | float_flag_inexact;
 768                 if (overflow_norm) {
 769                     exp = exp_max - 1;
 770                     frac = -1;
 771                 } else {
 772                     p.cls = float_class_inf;
 773                     goto do_inf;
 774                 }
 775             }
 776         } else if (s->flush_to_zero) {
 777             flags |= float_flag_output_denormal;
 778             p.cls = float_class_zero;
 779             goto do_zero;
 780         } else {
 781             bool is_tiny = s->tininess_before_rounding || (exp < 0);
 782
 783             if (!is_tiny) {
 784                 uint64_t discard;
 785                 is_tiny = !uadd64_overflow(frac, inc, &discard);
 786             }
 787
 788             shift64RightJamming(frac, 1 - exp, &frac);
 789             if (frac & round_mask) {
 790                 /* Need to recompute round-to-even.  */
 791                 switch (s->float_rounding_mode) {
 792                 case float_round_nearest_even:
 793                     inc = ((frac & roundeven_mask) != frac_lsbm1
 794                            ? frac_lsbm1 : 0);
 795                     break;
 796                 case float_round_to_odd:
 797                     inc = frac & frac_lsb ? 0 : round_mask;
 798                     break;
 799                 default:
 800                     break;
 801                 }
 802                 flags |= float_flag_inexact;
 803                 frac += inc;
 804             }
 805
 806             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 807             frac >>= frac_shift;
 808
 809             if (is_tiny && (flags & float_flag_inexact)) {
 810                 flags |= float_flag_underflow;
 811             }
 812             if (exp == 0 && frac == 0) {
 813                 p.cls = float_class_zero;
 814             }
 815         }
 816         break;
 817
 818     case float_class_zero:
 819     do_zero:
 820         exp = 0;
 821         frac = 0;
 822         break;
 823
 824     case float_class_inf:
 825     do_inf:
 826         assert(!parm->arm_althp);
 827         exp = exp_max;
 828         frac = 0;
 829         break;
 830
 831     case float_class_qnan:
 832     case float_class_snan:
 833         assert(!parm->arm_althp);
 834         exp = exp_max;
 835         frac >>= parm->frac_shift;
 836         break;
 837
 838     default:
 839         g_assert_not_reached();
 840     }
 841
 842     float_raise(flags, s);
 843     p.exp = exp;
 844     p.frac = frac;
 845     return p;
 846 }
 847
 848 static FloatParts64 return_nan(FloatParts64 a, float_status *s)
 849 {
 850     g_assert(is_nan(a.cls));
 851     if (is_snan(a.cls)) {
 852         float_raise(float_flag_invalid, s);
 853         if (!s->default_nan_mode) {
 854             return parts_silence_nan(a, s);
 855         }
 856     } else if (!s->default_nan_mode) {
 857         return a;
 858     }
 859     parts_default_nan(&a, s);
 860     return a;
 861 }
 862
 863 static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
 864 {
 865     if (is_snan(a.cls) || is_snan(b.cls)) {
 866         float_raise(float_flag_invalid, s);
 867     }
 868
 869     if (s->default_nan_mode) {
 870         parts_default_nan(&a, s);
 871     } else {
 872         if (pickNaN(a.cls, b.cls,
 873                     a.frac > b.frac ||
 874                     (a.frac == b.frac && a.sign < b.sign), s)) {
 875             a = b;
 876         }
 877         if (is_snan(a.cls)) {
 878             return parts_silence_nan(a, s);
 879         }
 880     }
 881     return a;
 882 }
 883
 884 static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
 885                                   bool inf_zero, float_status *s)
 886 {
 887     int which;
 888
 889     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 890         float_raise(float_flag_invalid, s);
 891     }
 892
 893     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 894
 895     if (s->default_nan_mode) {
 896         /* Note that this check is after pickNaNMulAdd so that function
 897          * has an opportunity to set the Invalid flag.
 898          */
 899         which = 3;
 900     }
 901
 902     switch (which) {
 903     case 0:
 904         break;
 905     case 1:
 906         a = b;
 907         break;
 908     case 2:
 909         a = c;
 910         break;
 911     case 3:
 912         parts_default_nan(&a, s);
 913         break;
 914     default:
 915         g_assert_not_reached();
 916     }
 917
 918     if (is_snan(a.cls)) {
 919         return parts_silence_nan(a, s);
 920     }
 921     return a;
 922 }
 923
 924 /*
 925  * Pack/unpack routines with a specific FloatFmt.
 926  */
 927
 928 static FloatParts64 float16a_unpack_canonical(float16 f, float_status *s,
 929                                             const FloatFmt *params)
 930 {
 931     FloatParts64 p;
 932
 933     float16_unpack_raw(&p, f);
 934     return sf_canonicalize(p, params, s);
 935 }
 936
 937 static FloatParts64 float16_unpack_canonical(float16 f, float_status *s)
 938 {
 939     return float16a_unpack_canonical(f, s, &float16_params);
 940 }
 941
 942 static FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s)
 943 {
 944     FloatParts64 p;
 945
 946     bfloat16_unpack_raw(&p, f);
 947     return sf_canonicalize(p, &bfloat16_params, s);
 948 }
 949
 950 static float16 float16a_round_pack_canonical(FloatParts64 p, float_status *s,
 951                                              const FloatFmt *params)
 952 {
 953     return float16_pack_raw(round_canonical(p, s, params));
 954 }
 955
 956 static float16 float16_round_pack_canonical(FloatParts64 p, float_status *s)
 957 {
 958     return float16a_round_pack_canonical(p, s, &float16_params);
 959 }
 960
 961 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 p, float_status *s)
 962 {
 963     return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
 964 }
 965
 966 static FloatParts64 float32_unpack_canonical(float32 f, float_status *s)
 967 {
 968     FloatParts64 p;
 969
 970     float32_unpack_raw(&p, f);
 971     return sf_canonicalize(p, &float32_params, s);
 972 }
 973
 974 static float32 float32_round_pack_canonical(FloatParts64 p, float_status *s)
 975 {
 976     return float32_pack_raw(round_canonical(p, s, &float32_params));
 977 }
 978
 979 static FloatParts64 float64_unpack_canonical(float64 f, float_status *s)
 980 {
 981     FloatParts64 p;
 982
 983     float64_unpack_raw(&p, f);
 984     return sf_canonicalize(p, &float64_params, s);
 985 }
 986
 987 static float64 float64_round_pack_canonical(FloatParts64 p, float_status *s)
 988 {
 989     return float64_pack_raw(round_canonical(p, s, &float64_params));
 990 }
 991
 992 /*
 993  * Returns the result of adding or subtracting the values of the
 994  * floating-point values `a' and `b'. The operation is performed
 995  * according to the IEC/IEEE Standard for Binary Floating-Point
 996  * Arithmetic.
 997  */
 998
 999 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
1000                                 float_status *s)
1001 {
1002     bool a_sign = a.sign;
1003     bool b_sign = b.sign ^ subtract;
1004
1005     if (a_sign != b_sign) {
1006         /* Subtraction */
1007
1008         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1009             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
1010                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1011                 a.frac = a.frac - b.frac;
1012             } else {
1013                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1014                 a.frac = b.frac - a.frac;
1015                 a.exp = b.exp;
1016                 a_sign ^= 1;
1017             }
1018
1019             if (a.frac == 0) {
1020                 a.cls = float_class_zero;
1021                 a.sign = s->float_rounding_mode == float_round_down;
1022             } else {
1023                 int shift = clz64(a.frac);
1024                 a.frac = a.frac << shift;
1025                 a.exp = a.exp - shift;
1026                 a.sign = a_sign;
1027             }
1028             return a;
1029         }
1030         if (is_nan(a.cls) || is_nan(b.cls)) {
1031             return pick_nan(a, b, s);
1032         }
1033         if (a.cls == float_class_inf) {
1034             if (b.cls == float_class_inf) {
1035                 float_raise(float_flag_invalid, s);
1036                 parts_default_nan(&a, s);
1037             }
1038             return a;
1039         }
1040         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1041             a.sign = s->float_rounding_mode == float_round_down;
1042             return a;
1043         }
1044         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1045             b.sign = a_sign ^ 1;
1046             return b;
1047         }
1048         if (b.cls == float_class_zero) {
1049             return a;
1050         }
1051     } else {
1052         /* Addition */
1053         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1054             if (a.exp > b.exp) {
1055                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1056             } else if (a.exp < b.exp) {
1057                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1058                 a.exp = b.exp;
1059             }
1060
1061             if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1062                 shift64RightJamming(a.frac, 1, &a.frac);
1063                 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1064                 a.exp += 1;
1065             }
1066             return a;
1067         }
1068         if (is_nan(a.cls) || is_nan(b.cls)) {
1069             return pick_nan(a, b, s);
1070         }
1071         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1072             return a;
1073         }
1074         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1075             b.sign = b_sign;
1076             return b;
1077         }
1078     }
1079     g_assert_not_reached();
1080 }
1081
1082 /*
1083  * Returns the result of adding or subtracting the floating-point
1084  * values `a' and `b'. The operation is performed according to the
1085  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1086  */
1087
1088 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1089 {
1090     FloatParts64 pa = float16_unpack_canonical(a, status);
1091     FloatParts64 pb = float16_unpack_canonical(b, status);
1092     FloatParts64 pr = addsub_floats(pa, pb, false, status);
1093
1094     return float16_round_pack_canonical(pr, status);
1095 }
1096
1097 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1098 {
1099     FloatParts64 pa = float16_unpack_canonical(a, status);
1100     FloatParts64 pb = float16_unpack_canonical(b, status);
1101     FloatParts64 pr = addsub_floats(pa, pb, true, status);
1102
1103     return float16_round_pack_canonical(pr, status);
1104 }
1105
1106 static float32 QEMU_SOFTFLOAT_ATTR
1107 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1108 {
1109     FloatParts64 pa = float32_unpack_canonical(a, status);
1110     FloatParts64 pb = float32_unpack_canonical(b, status);
1111     FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1112
1113     return float32_round_pack_canonical(pr, status);
1114 }
1115
1116 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1117 {
1118     return soft_f32_addsub(a, b, false, status);
1119 }
1120
1121 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1122 {
1123     return soft_f32_addsub(a, b, true, status);
1124 }
1125
1126 static float64 QEMU_SOFTFLOAT_ATTR
1127 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1128 {
1129     FloatParts64 pa = float64_unpack_canonical(a, status);
1130     FloatParts64 pb = float64_unpack_canonical(b, status);
1131     FloatParts64 pr = addsub_floats(pa, pb, subtract, status);
1132
1133     return float64_round_pack_canonical(pr, status);
1134 }
1135
1136 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1137 {
1138     return soft_f64_addsub(a, b, false, status);
1139 }
1140
1141 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1142 {
1143     return soft_f64_addsub(a, b, true, status);
1144 }
1145
1146 static float hard_f32_add(float a, float b)
1147 {
1148     return a + b;
1149 }
1150
1151 static float hard_f32_sub(float a, float b)
1152 {
1153     return a - b;
1154 }
1155
1156 static double hard_f64_add(double a, double b)
1157 {
1158     return a + b;
1159 }
1160
1161 static double hard_f64_sub(double a, double b)
1162 {
1163     return a - b;
1164 }
1165
1166 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1167 {
1168     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1169         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1170     }
1171     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1172 }
1173
1174 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1175 {
1176     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1177         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1178     } else {
1179         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1180     }
1181 }
1182
1183 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1184                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1185 {
1186     return float32_gen2(a, b, s, hard, soft,
1187                         f32_is_zon2, f32_addsubmul_post);
1188 }
1189
1190 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1191                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1192 {
1193     return float64_gen2(a, b, s, hard, soft,
1194                         f64_is_zon2, f64_addsubmul_post);
1195 }
1196
1197 float32 QEMU_FLATTEN
1198 float32_add(float32 a, float32 b, float_status *s)
1199 {
1200     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1201 }
1202
1203 float32 QEMU_FLATTEN
1204 float32_sub(float32 a, float32 b, float_status *s)
1205 {
1206     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1207 }
1208
1209 float64 QEMU_FLATTEN
1210 float64_add(float64 a, float64 b, float_status *s)
1211 {
1212     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1213 }
1214
1215 float64 QEMU_FLATTEN
1216 float64_sub(float64 a, float64 b, float_status *s)
1217 {
1218     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1219 }
1220
1221 /*
1222  * Returns the result of adding or subtracting the bfloat16
1223  * values `a' and `b'.
1224  */
1225 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1226 {
1227     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1228     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1229     FloatParts64 pr = addsub_floats(pa, pb, false, status);
1230
1231     return bfloat16_round_pack_canonical(pr, status);
1232 }
1233
1234 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1235 {
1236     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1237     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1238     FloatParts64 pr = addsub_floats(pa, pb, true, status);
1239
1240     return bfloat16_round_pack_canonical(pr, status);
1241 }
1242
1243 /*
1244  * Returns the result of multiplying the floating-point values `a' and
1245  * `b'. The operation is performed according to the IEC/IEEE Standard
1246  * for Binary Floating-Point Arithmetic.
1247  */
1248
1249 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1250 {
1251     bool sign = a.sign ^ b.sign;
1252
1253     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1254         uint64_t hi, lo;
1255         int exp = a.exp + b.exp;
1256
1257         mul64To128(a.frac, b.frac, &hi, &lo);
1258         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1259             exp += 1;
1260         } else {
1261             hi <<= 1;
1262         }
1263         hi |= (lo != 0);
1264
1265         /* Re-use a */
1266         a.exp = exp;
1267         a.sign = sign;
1268         a.frac = hi;
1269         return a;
1270     }
1271     /* handle all the NaN cases */
1272     if (is_nan(a.cls) || is_nan(b.cls)) {
1273         return pick_nan(a, b, s);
1274     }
1275     /* Inf * Zero == NaN */
1276     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1277         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1278         float_raise(float_flag_invalid, s);
1279         parts_default_nan(&a, s);
1280         return a;
1281     }
1282     /* Multiply by 0 or Inf */
1283     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1284         a.sign = sign;
1285         return a;
1286     }
1287     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1288         b.sign = sign;
1289         return b;
1290     }
1291     g_assert_not_reached();
1292 }
1293
1294 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1295 {
1296     FloatParts64 pa = float16_unpack_canonical(a, status);
1297     FloatParts64 pb = float16_unpack_canonical(b, status);
1298     FloatParts64 pr = mul_floats(pa, pb, status);
1299
1300     return float16_round_pack_canonical(pr, status);
1301 }
1302
1303 static float32 QEMU_SOFTFLOAT_ATTR
1304 soft_f32_mul(float32 a, float32 b, float_status *status)
1305 {
1306     FloatParts64 pa = float32_unpack_canonical(a, status);
1307     FloatParts64 pb = float32_unpack_canonical(b, status);
1308     FloatParts64 pr = mul_floats(pa, pb, status);
1309
1310     return float32_round_pack_canonical(pr, status);
1311 }
1312
1313 static float64 QEMU_SOFTFLOAT_ATTR
1314 soft_f64_mul(float64 a, float64 b, float_status *status)
1315 {
1316     FloatParts64 pa = float64_unpack_canonical(a, status);
1317     FloatParts64 pb = float64_unpack_canonical(b, status);
1318     FloatParts64 pr = mul_floats(pa, pb, status);
1319
1320     return float64_round_pack_canonical(pr, status);
1321 }
1322
1323 static float hard_f32_mul(float a, float b)
1324 {
1325     return a * b;
1326 }
1327
1328 static double hard_f64_mul(double a, double b)
1329 {
1330     return a * b;
1331 }
1332
1333 float32 QEMU_FLATTEN
1334 float32_mul(float32 a, float32 b, float_status *s)
1335 {
1336     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1337                         f32_is_zon2, f32_addsubmul_post);
1338 }
1339
1340 float64 QEMU_FLATTEN
1341 float64_mul(float64 a, float64 b, float_status *s)
1342 {
1343     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1344                         f64_is_zon2, f64_addsubmul_post);
1345 }
1346
1347 /*
1348  * Returns the result of multiplying the bfloat16
1349  * values `a' and `b'.
1350  */
1351
1352 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1353 {
1354     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1355     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1356     FloatParts64 pr = mul_floats(pa, pb, status);
1357
1358     return bfloat16_round_pack_canonical(pr, status);
1359 }
1360
1361 /*
1362  * Returns the result of multiplying the floating-point values `a' and
1363  * `b' then adding 'c', with no intermediate rounding step after the
1364  * multiplication. The operation is performed according to the
1365  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1366  * The flags argument allows the caller to select negation of the
1367  * addend, the intermediate product, or the final result. (The
1368  * difference between this and having the caller do a separate
1369  * negation is that negating externally will flip the sign bit on
1370  * NaNs.)
1371  */
1372
1373 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
1374                                 int flags, float_status *s)
1375 {
1376     bool inf_zero, p_sign;
1377     bool sign_flip = flags & float_muladd_negate_result;
1378     FloatClass p_class;
1379     uint64_t hi, lo;
1380     int p_exp;
1381     int ab_mask, abc_mask;
1382
1383     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
1384     abc_mask = float_cmask(c.cls) | ab_mask;
1385     inf_zero = ab_mask == float_cmask_infzero;
1386
1387     /* It is implementation-defined whether the cases of (0,inf,qnan)
1388      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1389      * they return if they do), so we have to hand this information
1390      * off to the target-specific pick-a-NaN routine.
1391      */
1392     if (unlikely(abc_mask & float_cmask_anynan)) {
1393         return pick_nan_muladd(a, b, c, inf_zero, s);
1394     }
1395
1396     if (inf_zero) {
1397         float_raise(float_flag_invalid, s);
1398         parts_default_nan(&a, s);
1399         return a;
1400     }
1401
1402     if (flags & float_muladd_negate_c) {
1403         c.sign ^= 1;
1404     }
1405
1406     p_sign = a.sign ^ b.sign;
1407
1408     if (flags & float_muladd_negate_product) {
1409         p_sign ^= 1;
1410     }
1411
1412     if (ab_mask & float_cmask_inf) {
1413         p_class = float_class_inf;
1414     } else if (ab_mask & float_cmask_zero) {
1415         p_class = float_class_zero;
1416     } else {
1417         p_class = float_class_normal;
1418     }
1419
1420     if (c.cls == float_class_inf) {
1421         if (p_class == float_class_inf && p_sign != c.sign) {
1422             float_raise(float_flag_invalid, s);
1423             parts_default_nan(&c, s);
1424         } else {
1425             c.sign ^= sign_flip;
1426         }
1427         return c;
1428     }
1429
1430     if (p_class == float_class_inf) {
1431         a.cls = float_class_inf;
1432         a.sign = p_sign ^ sign_flip;
1433         return a;
1434     }
1435
1436     if (p_class == float_class_zero) {
1437         if (c.cls == float_class_zero) {
1438             if (p_sign != c.sign) {
1439                 p_sign = s->float_rounding_mode == float_round_down;
1440             }
1441             c.sign = p_sign;
1442         } else if (flags & float_muladd_halve_result) {
1443             c.exp -= 1;
1444         }
1445         c.sign ^= sign_flip;
1446         return c;
1447     }
1448
1449     /* a & b should be normals now... */
1450     assert(a.cls == float_class_normal &&
1451            b.cls == float_class_normal);
1452
1453     p_exp = a.exp + b.exp;
1454
1455     mul64To128(a.frac, b.frac, &hi, &lo);
1456
1457     /* Renormalize to the msb. */
1458     if (hi & DECOMPOSED_IMPLICIT_BIT) {
1459         p_exp += 1;
1460     } else {
1461         shortShift128Left(hi, lo, 1, &hi, &lo);
1462     }
1463
1464     /* + add/sub */
1465     if (c.cls != float_class_zero) {
1466         int exp_diff = p_exp - c.exp;
1467         if (p_sign == c.sign) {
1468             /* Addition */
1469             if (exp_diff <= 0) {
1470                 shift64RightJamming(hi, -exp_diff, &hi);
1471                 p_exp = c.exp;
1472                 if (uadd64_overflow(hi, c.frac, &hi)) {
1473                     shift64RightJamming(hi, 1, &hi);
1474                     hi |= DECOMPOSED_IMPLICIT_BIT;
1475                     p_exp += 1;
1476                 }
1477             } else {
1478                 uint64_t c_hi, c_lo, over;
1479                 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
1480                 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
1481                 if (over) {
1482                     shift64RightJamming(hi, 1, &hi);
1483                     hi |= DECOMPOSED_IMPLICIT_BIT;
1484                     p_exp += 1;
1485                 }
1486             }
1487         } else {
1488             /* Subtraction */
1489             uint64_t c_hi = c.frac, c_lo = 0;
1490
1491             if (exp_diff <= 0) {
1492                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1493                 if (exp_diff == 0
1494                     &&
1495                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1496                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1497                 } else {
1498                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1499                     p_sign ^= 1;
1500                     p_exp = c.exp;
1501                 }
1502             } else {
1503                 shift128RightJamming(c_hi, c_lo,
1504                                      exp_diff,
1505                                      &c_hi, &c_lo);
1506                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1507             }
1508
1509             if (hi == 0 && lo == 0) {
1510                 a.cls = float_class_zero;
1511                 a.sign = s->float_rounding_mode == float_round_down;
1512                 a.sign ^= sign_flip;
1513                 return a;
1514             } else {
1515                 int shift;
1516                 if (hi != 0) {
1517                     shift = clz64(hi);
1518                 } else {
1519                     shift = clz64(lo) + 64;
1520                 }
1521                 /* Normalizing to a binary point of 124 is the
1522                    correct adjust for the exponent.  However since we're
1523                    shifting, we might as well put the binary point back
1524                    at 63 where we really want it.  Therefore shift as
1525                    if we're leaving 1 bit at the top of the word, but
1526                    adjust the exponent as if we're leaving 3 bits.  */
1527                 shift128Left(hi, lo, shift, &hi, &lo);
1528                 p_exp -= shift;
1529             }
1530         }
1531     }
1532     hi |= (lo != 0);
1533
1534     if (flags & float_muladd_halve_result) {
1535         p_exp -= 1;
1536     }
1537
1538     /* finally prepare our result */
1539     a.cls = float_class_normal;
1540     a.sign = p_sign ^ sign_flip;
1541     a.exp = p_exp;
1542     a.frac = hi;
1543
1544     return a;
1545 }
1546
1547 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1548                                                 int flags, float_status *status)
1549 {
1550     FloatParts64 pa = float16_unpack_canonical(a, status);
1551     FloatParts64 pb = float16_unpack_canonical(b, status);
1552     FloatParts64 pc = float16_unpack_canonical(c, status);
1553     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1554
1555     return float16_round_pack_canonical(pr, status);
1556 }
1557
1558 static float32 QEMU_SOFTFLOAT_ATTR
1559 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1560                 float_status *status)
1561 {
1562     FloatParts64 pa = float32_unpack_canonical(a, status);
1563     FloatParts64 pb = float32_unpack_canonical(b, status);
1564     FloatParts64 pc = float32_unpack_canonical(c, status);
1565     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1566
1567     return float32_round_pack_canonical(pr, status);
1568 }
1569
1570 static float64 QEMU_SOFTFLOAT_ATTR
1571 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1572                 float_status *status)
1573 {
1574     FloatParts64 pa = float64_unpack_canonical(a, status);
1575     FloatParts64 pb = float64_unpack_canonical(b, status);
1576     FloatParts64 pc = float64_unpack_canonical(c, status);
1577     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1578
1579     return float64_round_pack_canonical(pr, status);
1580 }
1581
1582 static bool force_soft_fma;
1583
1584 float32 QEMU_FLATTEN
1585 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1586 {
1587     union_float32 ua, ub, uc, ur;
1588
1589     ua.s = xa;
1590     ub.s = xb;
1591     uc.s = xc;
1592
1593     if (unlikely(!can_use_fpu(s))) {
1594         goto soft;
1595     }
1596     if (unlikely(flags & float_muladd_halve_result)) {
1597         goto soft;
1598     }
1599
1600     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1601     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1602         goto soft;
1603     }
1604
1605     if (unlikely(force_soft_fma)) {
1606         goto soft;
1607     }
1608
1609     /*
1610      * When (a || b) == 0, there's no need to check for under/over flow,
1611      * since we know the addend is (normal || 0) and the product is 0.
1612      */
1613     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1614         union_float32 up;
1615         bool prod_sign;
1616
1617         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1618         prod_sign ^= !!(flags & float_muladd_negate_product);
1619         up.s = float32_set_sign(float32_zero, prod_sign);
1620
1621         if (flags & float_muladd_negate_c) {
1622             uc.h = -uc.h;
1623         }
1624         ur.h = up.h + uc.h;
1625     } else {
1626         union_float32 ua_orig = ua;
1627         union_float32 uc_orig = uc;
1628
1629         if (flags & float_muladd_negate_product) {
1630             ua.h = -ua.h;
1631         }
1632         if (flags & float_muladd_negate_c) {
1633             uc.h = -uc.h;
1634         }
1635
1636         ur.h = fmaf(ua.h, ub.h, uc.h);
1637
1638         if (unlikely(f32_is_inf(ur))) {
1639             float_raise(float_flag_overflow, s);
1640         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1641             ua = ua_orig;
1642             uc = uc_orig;
1643             goto soft;
1644         }
1645     }
1646     if (flags & float_muladd_negate_result) {
1647         return float32_chs(ur.s);
1648     }
1649     return ur.s;
1650
1651  soft:
1652     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1653 }
1654
1655 float64 QEMU_FLATTEN
1656 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1657 {
1658     union_float64 ua, ub, uc, ur;
1659
1660     ua.s = xa;
1661     ub.s = xb;
1662     uc.s = xc;
1663
1664     if (unlikely(!can_use_fpu(s))) {
1665         goto soft;
1666     }
1667     if (unlikely(flags & float_muladd_halve_result)) {
1668         goto soft;
1669     }
1670
1671     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1672     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1673         goto soft;
1674     }
1675
1676     if (unlikely(force_soft_fma)) {
1677         goto soft;
1678     }
1679
1680     /*
1681      * When (a || b) == 0, there's no need to check for under/over flow,
1682      * since we know the addend is (normal || 0) and the product is 0.
1683      */
1684     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1685         union_float64 up;
1686         bool prod_sign;
1687
1688         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1689         prod_sign ^= !!(flags & float_muladd_negate_product);
1690         up.s = float64_set_sign(float64_zero, prod_sign);
1691
1692         if (flags & float_muladd_negate_c) {
1693             uc.h = -uc.h;
1694         }
1695         ur.h = up.h + uc.h;
1696     } else {
1697         union_float64 ua_orig = ua;
1698         union_float64 uc_orig = uc;
1699
1700         if (flags & float_muladd_negate_product) {
1701             ua.h = -ua.h;
1702         }
1703         if (flags & float_muladd_negate_c) {
1704             uc.h = -uc.h;
1705         }
1706
1707         ur.h = fma(ua.h, ub.h, uc.h);
1708
1709         if (unlikely(f64_is_inf(ur))) {
1710             float_raise(float_flag_overflow, s);
1711         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1712             ua = ua_orig;
1713             uc = uc_orig;
1714             goto soft;
1715         }
1716     }
1717     if (flags & float_muladd_negate_result) {
1718         return float64_chs(ur.s);
1719     }
1720     return ur.s;
1721
1722  soft:
1723     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1724 }
1725
1726 /*
1727  * Returns the result of multiplying the bfloat16 values `a'
1728  * and `b' then adding 'c', with no intermediate rounding step after the
1729  * multiplication.
1730  */
1731
1732 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1733                                       int flags, float_status *status)
1734 {
1735     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1736     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1737     FloatParts64 pc = bfloat16_unpack_canonical(c, status);
1738     FloatParts64 pr = muladd_floats(pa, pb, pc, flags, status);
1739
1740     return bfloat16_round_pack_canonical(pr, status);
1741 }
1742
1743 /*
1744  * Returns the result of dividing the floating-point value `a' by the
1745  * corresponding value `b'. The operation is performed according to
1746  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1747  */
1748
1749 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1750 {
1751     bool sign = a.sign ^ b.sign;
1752
1753     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1754         uint64_t n0, n1, q, r;
1755         int exp = a.exp - b.exp;
1756
1757         /*
1758          * We want a 2*N / N-bit division to produce exactly an N-bit
1759          * result, so that we do not lose any precision and so that we
1760          * do not have to renormalize afterward.  If A.frac < B.frac,
1761          * then division would produce an (N-1)-bit result; shift A left
1762          * by one to produce the an N-bit result, and decrement the
1763          * exponent to match.
1764          *
1765          * The udiv_qrnnd algorithm that we're using requires normalization,
1766          * i.e. the msb of the denominator must be set, which is already true.
1767          */
1768         if (a.frac < b.frac) {
1769             exp -= 1;
1770             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1771         } else {
1772             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1773         }
1774         q = udiv_qrnnd(&r, n1, n0, b.frac);
1775
1776         /* Set lsb if there is a remainder, to set inexact. */
1777         a.frac = q | (r != 0);
1778         a.sign = sign;
1779         a.exp = exp;
1780         return a;
1781     }
1782     /* handle all the NaN cases */
1783     if (is_nan(a.cls) || is_nan(b.cls)) {
1784         return pick_nan(a, b, s);
1785     }
1786     /* 0/0 or Inf/Inf */
1787     if (a.cls == b.cls
1788         &&
1789         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1790         float_raise(float_flag_invalid, s);
1791         parts_default_nan(&a, s);
1792         return a;
1793     }
1794     /* Inf / x or 0 / x */
1795     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1796         a.sign = sign;
1797         return a;
1798     }
1799     /* Div 0 => Inf */
1800     if (b.cls == float_class_zero) {
1801         float_raise(float_flag_divbyzero, s);
1802         a.cls = float_class_inf;
1803         a.sign = sign;
1804         return a;
1805     }
1806     /* Div by Inf */
1807     if (b.cls == float_class_inf) {
1808         a.cls = float_class_zero;
1809         a.sign = sign;
1810         return a;
1811     }
1812     g_assert_not_reached();
1813 }
1814
1815 float16 float16_div(float16 a, float16 b, float_status *status)
1816 {
1817     FloatParts64 pa = float16_unpack_canonical(a, status);
1818     FloatParts64 pb = float16_unpack_canonical(b, status);
1819     FloatParts64 pr = div_floats(pa, pb, status);
1820
1821     return float16_round_pack_canonical(pr, status);
1822 }
1823
1824 static float32 QEMU_SOFTFLOAT_ATTR
1825 soft_f32_div(float32 a, float32 b, float_status *status)
1826 {
1827     FloatParts64 pa = float32_unpack_canonical(a, status);
1828     FloatParts64 pb = float32_unpack_canonical(b, status);
1829     FloatParts64 pr = div_floats(pa, pb, status);
1830
1831     return float32_round_pack_canonical(pr, status);
1832 }
1833
1834 static float64 QEMU_SOFTFLOAT_ATTR
1835 soft_f64_div(float64 a, float64 b, float_status *status)
1836 {
1837     FloatParts64 pa = float64_unpack_canonical(a, status);
1838     FloatParts64 pb = float64_unpack_canonical(b, status);
1839     FloatParts64 pr = div_floats(pa, pb, status);
1840
1841     return float64_round_pack_canonical(pr, status);
1842 }
1843
1844 static float hard_f32_div(float a, float b)
1845 {
1846     return a / b;
1847 }
1848
1849 static double hard_f64_div(double a, double b)
1850 {
1851     return a / b;
1852 }
1853
1854 static bool f32_div_pre(union_float32 a, union_float32 b)
1855 {
1856     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1857         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1858                fpclassify(b.h) == FP_NORMAL;
1859     }
1860     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1861 }
1862
1863 static bool f64_div_pre(union_float64 a, union_float64 b)
1864 {
1865     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1866         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1867                fpclassify(b.h) == FP_NORMAL;
1868     }
1869     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1870 }
1871
1872 static bool f32_div_post(union_float32 a, union_float32 b)
1873 {
1874     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1875         return fpclassify(a.h) != FP_ZERO;
1876     }
1877     return !float32_is_zero(a.s);
1878 }
1879
1880 static bool f64_div_post(union_float64 a, union_float64 b)
1881 {
1882     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1883         return fpclassify(a.h) != FP_ZERO;
1884     }
1885     return !float64_is_zero(a.s);
1886 }
1887
1888 float32 QEMU_FLATTEN
1889 float32_div(float32 a, float32 b, float_status *s)
1890 {
1891     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1892                         f32_div_pre, f32_div_post);
1893 }
1894
1895 float64 QEMU_FLATTEN
1896 float64_div(float64 a, float64 b, float_status *s)
1897 {
1898     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1899                         f64_div_pre, f64_div_post);
1900 }
1901
1902 /*
1903  * Returns the result of dividing the bfloat16
1904  * value `a' by the corresponding value `b'.
1905  */
1906
1907 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1908 {
1909     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
1910     FloatParts64 pb = bfloat16_unpack_canonical(b, status);
1911     FloatParts64 pr = div_floats(pa, pb, status);
1912
1913     return bfloat16_round_pack_canonical(pr, status);
1914 }
1915
1916 /*
1917  * Float to Float conversions
1918  *
1919  * Returns the result of converting one float format to another. The
1920  * conversion is performed according to the IEC/IEEE Standard for
1921  * Binary Floating-Point Arithmetic.
1922  *
1923  * The float_to_float helper only needs to take care of raising
1924  * invalid exceptions and handling the conversion on NaNs.
1925  */
1926
1927 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
1928                                  float_status *s)
1929 {
1930     if (dstf->arm_althp) {
1931         switch (a.cls) {
1932         case float_class_qnan:
1933         case float_class_snan:
1934             /* There is no NaN in the destination format.  Raise Invalid
1935              * and return a zero with the sign of the input NaN.
1936              */
1937             float_raise(float_flag_invalid, s);
1938             a.cls = float_class_zero;
1939             a.frac = 0;
1940             a.exp = 0;
1941             break;
1942
1943         case float_class_inf:
1944             /* There is no Inf in the destination format.  Raise Invalid
1945              * and return the maximum normal with the correct sign.
1946              */
1947             float_raise(float_flag_invalid, s);
1948             a.cls = float_class_normal;
1949             a.exp = dstf->exp_max;
1950             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1951             break;
1952
1953         default:
1954             break;
1955         }
1956     } else if (is_nan(a.cls)) {
1957         return return_nan(a, s);
1958     }
1959     return a;
1960 }
1961
1962 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1963 {
1964     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1965     FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1966     FloatParts64 pr = float_to_float(p, &float32_params, s);
1967     return float32_round_pack_canonical(pr, s);
1968 }
1969
1970 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1971 {
1972     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1973     FloatParts64 p = float16a_unpack_canonical(a, s, fmt16);
1974     FloatParts64 pr = float_to_float(p, &float64_params, s);
1975     return float64_round_pack_canonical(pr, s);
1976 }
1977
1978 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1979 {
1980     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1981     FloatParts64 p = float32_unpack_canonical(a, s);
1982     FloatParts64 pr = float_to_float(p, fmt16, s);
1983     return float16a_round_pack_canonical(pr, s, fmt16);
1984 }
1985
1986 static float64 QEMU_SOFTFLOAT_ATTR
1987 soft_float32_to_float64(float32 a, float_status *s)
1988 {
1989     FloatParts64 p = float32_unpack_canonical(a, s);
1990     FloatParts64 pr = float_to_float(p, &float64_params, s);
1991     return float64_round_pack_canonical(pr, s);
1992 }
1993
1994 float64 float32_to_float64(float32 a, float_status *s)
1995 {
1996     if (likely(float32_is_normal(a))) {
1997         /* Widening conversion can never produce inexact results.  */
1998         union_float32 uf;
1999         union_float64 ud;
2000         uf.s = a;
2001         ud.h = uf.h;
2002         return ud.s;
2003     } else if (float32_is_zero(a)) {
2004         return float64_set_sign(float64_zero, float32_is_neg(a));
2005     } else {
2006         return soft_float32_to_float64(a, s);
2007     }
2008 }
2009
2010 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2011 {
2012     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2013     FloatParts64 p = float64_unpack_canonical(a, s);
2014     FloatParts64 pr = float_to_float(p, fmt16, s);
2015     return float16a_round_pack_canonical(pr, s, fmt16);
2016 }
2017
2018 float32 float64_to_float32(float64 a, float_status *s)
2019 {
2020     FloatParts64 p = float64_unpack_canonical(a, s);
2021     FloatParts64 pr = float_to_float(p, &float32_params, s);
2022     return float32_round_pack_canonical(pr, s);
2023 }
2024
2025 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2026 {
2027     FloatParts64 p = bfloat16_unpack_canonical(a, s);
2028     FloatParts64 pr = float_to_float(p, &float32_params, s);
2029     return float32_round_pack_canonical(pr, s);
2030 }
2031
2032 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2033 {
2034     FloatParts64 p = bfloat16_unpack_canonical(a, s);
2035     FloatParts64 pr = float_to_float(p, &float64_params, s);
2036     return float64_round_pack_canonical(pr, s);
2037 }
2038
2039 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2040 {
2041     FloatParts64 p = float32_unpack_canonical(a, s);
2042     FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2043     return bfloat16_round_pack_canonical(pr, s);
2044 }
2045
2046 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2047 {
2048     FloatParts64 p = float64_unpack_canonical(a, s);
2049     FloatParts64 pr = float_to_float(p, &bfloat16_params, s);
2050     return bfloat16_round_pack_canonical(pr, s);
2051 }
2052
2053 /*
2054  * Rounds the floating-point value `a' to an integer, and returns the
2055  * result as a floating-point value. The operation is performed
2056  * according to the IEC/IEEE Standard for Binary Floating-Point
2057  * Arithmetic.
2058  */
2059
2060 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2061                                int scale, float_status *s)
2062 {
2063     switch (a.cls) {
2064     case float_class_qnan:
2065     case float_class_snan:
2066         return return_nan(a, s);
2067
2068     case float_class_zero:
2069     case float_class_inf:
2070         /* already "integral" */
2071         break;
2072
2073     case float_class_normal:
2074         scale = MIN(MAX(scale, -0x10000), 0x10000);
2075         a.exp += scale;
2076
2077         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2078             /* already integral */
2079             break;
2080         }
2081         if (a.exp < 0) {
2082             bool one;
2083             /* all fractional */
2084             float_raise(float_flag_inexact, s);
2085             switch (rmode) {
2086             case float_round_nearest_even:
2087                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2088                 break;
2089             case float_round_ties_away:
2090                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2091                 break;
2092             case float_round_to_zero:
2093                 one = false;
2094                 break;
2095             case float_round_up:
2096                 one = !a.sign;
2097                 break;
2098             case float_round_down:
2099                 one = a.sign;
2100                 break;
2101             case float_round_to_odd:
2102                 one = true;
2103                 break;
2104             default:
2105                 g_assert_not_reached();
2106             }
2107
2108             if (one) {
2109                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2110                 a.exp = 0;
2111             } else {
2112                 a.cls = float_class_zero;
2113             }
2114         } else {
2115             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2116             uint64_t frac_lsbm1 = frac_lsb >> 1;
2117             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2118             uint64_t rnd_mask = rnd_even_mask >> 1;
2119             uint64_t inc;
2120
2121             switch (rmode) {
2122             case float_round_nearest_even:
2123                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2124                 break;
2125             case float_round_ties_away:
2126                 inc = frac_lsbm1;
2127                 break;
2128             case float_round_to_zero:
2129                 inc = 0;
2130                 break;
2131             case float_round_up:
2132                 inc = a.sign ? 0 : rnd_mask;
2133                 break;
2134             case float_round_down:
2135                 inc = a.sign ? rnd_mask : 0;
2136                 break;
2137             case float_round_to_odd:
2138                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2139                 break;
2140             default:
2141                 g_assert_not_reached();
2142             }
2143
2144             if (a.frac & rnd_mask) {
2145                 float_raise(float_flag_inexact, s);
2146                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2147                     a.frac >>= 1;
2148                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2149                     a.exp++;
2150                 }
2151                 a.frac &= ~rnd_mask;
2152             }
2153         }
2154         break;
2155     default:
2156         g_assert_not_reached();
2157     }
2158     return a;
2159 }
2160
2161 float16 float16_round_to_int(float16 a, float_status *s)
2162 {
2163     FloatParts64 pa = float16_unpack_canonical(a, s);
2164     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2165     return float16_round_pack_canonical(pr, s);
2166 }
2167
2168 float32 float32_round_to_int(float32 a, float_status *s)
2169 {
2170     FloatParts64 pa = float32_unpack_canonical(a, s);
2171     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2172     return float32_round_pack_canonical(pr, s);
2173 }
2174
2175 float64 float64_round_to_int(float64 a, float_status *s)
2176 {
2177     FloatParts64 pa = float64_unpack_canonical(a, s);
2178     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2179     return float64_round_pack_canonical(pr, s);
2180 }
2181
2182 /*
2183  * Rounds the bfloat16 value `a' to an integer, and returns the
2184  * result as a bfloat16 value.
2185  */
2186
2187 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2188 {
2189     FloatParts64 pa = bfloat16_unpack_canonical(a, s);
2190     FloatParts64 pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2191     return bfloat16_round_pack_canonical(pr, s);
2192 }
2193
2194 /*
2195  * Returns the result of converting the floating-point value `a' to
2196  * the two's complement integer format. The conversion is performed
2197  * according to the IEC/IEEE Standard for Binary Floating-Point
2198  * Arithmetic---which means in particular that the conversion is
2199  * rounded according to the current rounding mode. If `a' is a NaN,
2200  * the largest positive integer is returned. Otherwise, if the
2201  * conversion overflows, the largest integer with the same sign as `a'
2202  * is returned.
2203 */
2204
2205 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2206                                      int scale, int64_t min, int64_t max,
2207                                      float_status *s)
2208 {
2209     uint64_t r;
2210     int orig_flags = get_float_exception_flags(s);
2211     FloatParts64 p = round_to_int(in, rmode, scale, s);
2212
2213     switch (p.cls) {
2214     case float_class_snan:
2215     case float_class_qnan:
2216         s->float_exception_flags = orig_flags | float_flag_invalid;
2217         return max;
2218     case float_class_inf:
2219         s->float_exception_flags = orig_flags | float_flag_invalid;
2220         return p.sign ? min : max;
2221     case float_class_zero:
2222         return 0;
2223     case float_class_normal:
2224         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2225             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2226         } else {
2227             r = UINT64_MAX;
2228         }
2229         if (p.sign) {
2230             if (r <= -(uint64_t) min) {
2231                 return -r;
2232             } else {
2233                 s->float_exception_flags = orig_flags | float_flag_invalid;
2234                 return min;
2235             }
2236         } else {
2237             if (r <= max) {
2238                 return r;
2239             } else {
2240                 s->float_exception_flags = orig_flags | float_flag_invalid;
2241                 return max;
2242             }
2243         }
2244     default:
2245         g_assert_not_reached();
2246     }
2247 }
2248
2249 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2250                               float_status *s)
2251 {
2252     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2253                                  rmode, scale, INT8_MIN, INT8_MAX, s);
2254 }
2255
2256 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2257                                 float_status *s)
2258 {
2259     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2260                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2261 }
2262
2263 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2264                                 float_status *s)
2265 {
2266     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2267                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2268 }
2269
2270 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2271                                 float_status *s)
2272 {
2273     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2274                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2275 }
2276
2277 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2278                                 float_status *s)
2279 {
2280     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2281                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2282 }
2283
2284 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2285                                 float_status *s)
2286 {
2287     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2288                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2289 }
2290
2291 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2292                                 float_status *s)
2293 {
2294     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2295                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2296 }
2297
2298 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2299                                 float_status *s)
2300 {
2301     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2302                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2303 }
2304
2305 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2306                                 float_status *s)
2307 {
2308     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2309                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2310 }
2311
2312 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2313                                 float_status *s)
2314 {
2315     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2316                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2317 }
2318
2319 int8_t float16_to_int8(float16 a, float_status *s)
2320 {
2321     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2322 }
2323
2324 int16_t float16_to_int16(float16 a, float_status *s)
2325 {
2326     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2327 }
2328
2329 int32_t float16_to_int32(float16 a, float_status *s)
2330 {
2331     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2332 }
2333
2334 int64_t float16_to_int64(float16 a, float_status *s)
2335 {
2336     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2337 }
2338
2339 int16_t float32_to_int16(float32 a, float_status *s)
2340 {
2341     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2342 }
2343
2344 int32_t float32_to_int32(float32 a, float_status *s)
2345 {
2346     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2347 }
2348
2349 int64_t float32_to_int64(float32 a, float_status *s)
2350 {
2351     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2352 }
2353
2354 int16_t float64_to_int16(float64 a, float_status *s)
2355 {
2356     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2357 }
2358
2359 int32_t float64_to_int32(float64 a, float_status *s)
2360 {
2361     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2362 }
2363
2364 int64_t float64_to_int64(float64 a, float_status *s)
2365 {
2366     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2367 }
2368
2369 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2370 {
2371     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2372 }
2373
2374 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2375 {
2376     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2377 }
2378
2379 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2380 {
2381     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2382 }
2383
2384 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2385 {
2386     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2387 }
2388
2389 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2390 {
2391     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2392 }
2393
2394 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2395 {
2396     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2397 }
2398
2399 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2400 {
2401     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2402 }
2403
2404 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2405 {
2406     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2407 }
2408
2409 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2410 {
2411     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2412 }
2413
2414 /*
2415  * Returns the result of converting the floating-point value `a' to
2416  * the two's complement integer format.
2417  */
2418
2419 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2420                                  float_status *s)
2421 {
2422     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2423                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2424 }
2425
2426 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2427                                  float_status *s)
2428 {
2429     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2430                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2431 }
2432
2433 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2434                                  float_status *s)
2435 {
2436     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2437                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2438 }
2439
2440 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2441 {
2442     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2443 }
2444
2445 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2446 {
2447     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2448 }
2449
2450 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2451 {
2452     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2453 }
2454
2455 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2456 {
2457     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2458 }
2459
2460 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2461 {
2462     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2463 }
2464
2465 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2466 {
2467     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2468 }
2469
2470 /*
2471  *  Returns the result of converting the floating-point value `a' to
2472  *  the unsigned integer format. The conversion is performed according
2473  *  to the IEC/IEEE Standard for Binary Floating-Point
2474  *  Arithmetic---which means in particular that the conversion is
2475  *  rounded according to the current rounding mode. If `a' is a NaN,
2476  *  the largest unsigned integer is returned. Otherwise, if the
2477  *  conversion overflows, the largest unsigned integer is returned. If
2478  *  the 'a' is negative, the result is rounded and zero is returned;
2479  *  values that do not round to zero will raise the inexact exception
2480  *  flag.
2481  */
2482
2483 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2484                                        int scale, uint64_t max,
2485                                        float_status *s)
2486 {
2487     int orig_flags = get_float_exception_flags(s);
2488     FloatParts64 p = round_to_int(in, rmode, scale, s);
2489     uint64_t r;
2490
2491     switch (p.cls) {
2492     case float_class_snan:
2493     case float_class_qnan:
2494         s->float_exception_flags = orig_flags | float_flag_invalid;
2495         return max;
2496     case float_class_inf:
2497         s->float_exception_flags = orig_flags | float_flag_invalid;
2498         return p.sign ? 0 : max;
2499     case float_class_zero:
2500         return 0;
2501     case float_class_normal:
2502         if (p.sign) {
2503             s->float_exception_flags = orig_flags | float_flag_invalid;
2504             return 0;
2505         }
2506
2507         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2508             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2509         } else {
2510             s->float_exception_flags = orig_flags | float_flag_invalid;
2511             return max;
2512         }
2513
2514         /* For uint64 this will never trip, but if p.exp is too large
2515          * to shift a decomposed fraction we shall have exited via the
2516          * 3rd leg above.
2517          */
2518         if (r > max) {
2519             s->float_exception_flags = orig_flags | float_flag_invalid;
2520             return max;
2521         }
2522         return r;
2523     default:
2524         g_assert_not_reached();
2525     }
2526 }
2527
2528 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2529                                 float_status *s)
2530 {
2531     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2532                                   rmode, scale, UINT8_MAX, s);
2533 }
2534
2535 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2536                                   float_status *s)
2537 {
2538     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2539                                   rmode, scale, UINT16_MAX, s);
2540 }
2541
2542 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2543                                   float_status *s)
2544 {
2545     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2546                                   rmode, scale, UINT32_MAX, s);
2547 }
2548
2549 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2550                                   float_status *s)
2551 {
2552     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2553                                   rmode, scale, UINT64_MAX, s);
2554 }
2555
2556 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2557                                   float_status *s)
2558 {
2559     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2560                                   rmode, scale, UINT16_MAX, s);
2561 }
2562
2563 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2564                                   float_status *s)
2565 {
2566     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2567                                   rmode, scale, UINT32_MAX, s);
2568 }
2569
2570 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2571                                   float_status *s)
2572 {
2573     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2574                                   rmode, scale, UINT64_MAX, s);
2575 }
2576
2577 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2578                                   float_status *s)
2579 {
2580     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2581                                   rmode, scale, UINT16_MAX, s);
2582 }
2583
2584 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2585                                   float_status *s)
2586 {
2587     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2588                                   rmode, scale, UINT32_MAX, s);
2589 }
2590
2591 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2592                                   float_status *s)
2593 {
2594     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2595                                   rmode, scale, UINT64_MAX, s);
2596 }
2597
2598 uint8_t float16_to_uint8(float16 a, float_status *s)
2599 {
2600     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2601 }
2602
2603 uint16_t float16_to_uint16(float16 a, float_status *s)
2604 {
2605     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2606 }
2607
2608 uint32_t float16_to_uint32(float16 a, float_status *s)
2609 {
2610     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2611 }
2612
2613 uint64_t float16_to_uint64(float16 a, float_status *s)
2614 {
2615     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2616 }
2617
2618 uint16_t float32_to_uint16(float32 a, float_status *s)
2619 {
2620     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2621 }
2622
2623 uint32_t float32_to_uint32(float32 a, float_status *s)
2624 {
2625     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2626 }
2627
2628 uint64_t float32_to_uint64(float32 a, float_status *s)
2629 {
2630     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2631 }
2632
2633 uint16_t float64_to_uint16(float64 a, float_status *s)
2634 {
2635     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2636 }
2637
2638 uint32_t float64_to_uint32(float64 a, float_status *s)
2639 {
2640     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2641 }
2642
2643 uint64_t float64_to_uint64(float64 a, float_status *s)
2644 {
2645     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2646 }
2647
2648 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2649 {
2650     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2651 }
2652
2653 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2654 {
2655     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2656 }
2657
2658 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2659 {
2660     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2661 }
2662
2663 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2664 {
2665     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2666 }
2667
2668 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2669 {
2670     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2671 }
2672
2673 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2674 {
2675     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2676 }
2677
2678 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2679 {
2680     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2681 }
2682
2683 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2684 {
2685     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2686 }
2687
2688 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2689 {
2690     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2691 }
2692
2693 /*
2694  *  Returns the result of converting the bfloat16 value `a' to
2695  *  the unsigned integer format.
2696  */
2697
2698 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2699                                    int scale, float_status *s)
2700 {
2701     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2702                                   rmode, scale, UINT16_MAX, s);
2703 }
2704
2705 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2706                                    int scale, float_status *s)
2707 {
2708     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2709                                   rmode, scale, UINT32_MAX, s);
2710 }
2711
2712 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2713                                    int scale, float_status *s)
2714 {
2715     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2716                                   rmode, scale, UINT64_MAX, s);
2717 }
2718
2719 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2720 {
2721     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2722 }
2723
2724 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2725 {
2726     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2727 }
2728
2729 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2730 {
2731     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2732 }
2733
2734 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2735 {
2736     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2737 }
2738
2739 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2740 {
2741     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2742 }
2743
2744 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2745 {
2746     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2747 }
2748
2749 /*
2750  * Integer to float conversions
2751  *
2752  * Returns the result of converting the two's complement integer `a'
2753  * to the floating-point format. The conversion is performed according
2754  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2755  */
2756
2757 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2758 {
2759     FloatParts64 r = { .sign = false };
2760
2761     if (a == 0) {
2762         r.cls = float_class_zero;
2763     } else {
2764         uint64_t f = a;
2765         int shift;
2766
2767         r.cls = float_class_normal;
2768         if (a < 0) {
2769             f = -f;
2770             r.sign = true;
2771         }
2772         shift = clz64(f);
2773         scale = MIN(MAX(scale, -0x10000), 0x10000);
2774
2775         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2776         r.frac = f << shift;
2777     }
2778
2779     return r;
2780 }
2781
2782 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2783 {
2784     FloatParts64 pa = int_to_float(a, scale, status);
2785     return float16_round_pack_canonical(pa, status);
2786 }
2787
2788 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2789 {
2790     return int64_to_float16_scalbn(a, scale, status);
2791 }
2792
2793 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2794 {
2795     return int64_to_float16_scalbn(a, scale, status);
2796 }
2797
2798 float16 int64_to_float16(int64_t a, float_status *status)
2799 {
2800     return int64_to_float16_scalbn(a, 0, status);
2801 }
2802
2803 float16 int32_to_float16(int32_t a, float_status *status)
2804 {
2805     return int64_to_float16_scalbn(a, 0, status);
2806 }
2807
2808 float16 int16_to_float16(int16_t a, float_status *status)
2809 {
2810     return int64_to_float16_scalbn(a, 0, status);
2811 }
2812
2813 float16 int8_to_float16(int8_t a, float_status *status)
2814 {
2815     return int64_to_float16_scalbn(a, 0, status);
2816 }
2817
2818 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2819 {
2820     FloatParts64 pa = int_to_float(a, scale, status);
2821     return float32_round_pack_canonical(pa, status);
2822 }
2823
2824 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2825 {
2826     return int64_to_float32_scalbn(a, scale, status);
2827 }
2828
2829 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2830 {
2831     return int64_to_float32_scalbn(a, scale, status);
2832 }
2833
2834 float32 int64_to_float32(int64_t a, float_status *status)
2835 {
2836     return int64_to_float32_scalbn(a, 0, status);
2837 }
2838
2839 float32 int32_to_float32(int32_t a, float_status *status)
2840 {
2841     return int64_to_float32_scalbn(a, 0, status);
2842 }
2843
2844 float32 int16_to_float32(int16_t a, float_status *status)
2845 {
2846     return int64_to_float32_scalbn(a, 0, status);
2847 }
2848
2849 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2850 {
2851     FloatParts64 pa = int_to_float(a, scale, status);
2852     return float64_round_pack_canonical(pa, status);
2853 }
2854
2855 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2856 {
2857     return int64_to_float64_scalbn(a, scale, status);
2858 }
2859
2860 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2861 {
2862     return int64_to_float64_scalbn(a, scale, status);
2863 }
2864
2865 float64 int64_to_float64(int64_t a, float_status *status)
2866 {
2867     return int64_to_float64_scalbn(a, 0, status);
2868 }
2869
2870 float64 int32_to_float64(int32_t a, float_status *status)
2871 {
2872     return int64_to_float64_scalbn(a, 0, status);
2873 }
2874
2875 float64 int16_to_float64(int16_t a, float_status *status)
2876 {
2877     return int64_to_float64_scalbn(a, 0, status);
2878 }
2879
2880 /*
2881  * Returns the result of converting the two's complement integer `a'
2882  * to the bfloat16 format.
2883  */
2884
2885 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2886 {
2887     FloatParts64 pa = int_to_float(a, scale, status);
2888     return bfloat16_round_pack_canonical(pa, status);
2889 }
2890
2891 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2892 {
2893     return int64_to_bfloat16_scalbn(a, scale, status);
2894 }
2895
2896 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2897 {
2898     return int64_to_bfloat16_scalbn(a, scale, status);
2899 }
2900
2901 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2902 {
2903     return int64_to_bfloat16_scalbn(a, 0, status);
2904 }
2905
2906 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2907 {
2908     return int64_to_bfloat16_scalbn(a, 0, status);
2909 }
2910
2911 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2912 {
2913     return int64_to_bfloat16_scalbn(a, 0, status);
2914 }
2915
2916 /*
2917  * Unsigned Integer to float conversions
2918  *
2919  * Returns the result of converting the unsigned integer `a' to the
2920  * floating-point format. The conversion is performed according to the
2921  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2922  */
2923
2924 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
2925 {
2926     FloatParts64 r = { .sign = false };
2927     int shift;
2928
2929     if (a == 0) {
2930         r.cls = float_class_zero;
2931     } else {
2932         scale = MIN(MAX(scale, -0x10000), 0x10000);
2933         shift = clz64(a);
2934         r.cls = float_class_normal;
2935         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2936         r.frac = a << shift;
2937     }
2938
2939     return r;
2940 }
2941
2942 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2943 {
2944     FloatParts64 pa = uint_to_float(a, scale, status);
2945     return float16_round_pack_canonical(pa, status);
2946 }
2947
2948 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2949 {
2950     return uint64_to_float16_scalbn(a, scale, status);
2951 }
2952
2953 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2954 {
2955     return uint64_to_float16_scalbn(a, scale, status);
2956 }
2957
2958 float16 uint64_to_float16(uint64_t a, float_status *status)
2959 {
2960     return uint64_to_float16_scalbn(a, 0, status);
2961 }
2962
2963 float16 uint32_to_float16(uint32_t a, float_status *status)
2964 {
2965     return uint64_to_float16_scalbn(a, 0, status);
2966 }
2967
2968 float16 uint16_to_float16(uint16_t a, float_status *status)
2969 {
2970     return uint64_to_float16_scalbn(a, 0, status);
2971 }
2972
2973 float16 uint8_to_float16(uint8_t a, float_status *status)
2974 {
2975     return uint64_to_float16_scalbn(a, 0, status);
2976 }
2977
2978 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2979 {
2980     FloatParts64 pa = uint_to_float(a, scale, status);
2981     return float32_round_pack_canonical(pa, status);
2982 }
2983
2984 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2985 {
2986     return uint64_to_float32_scalbn(a, scale, status);
2987 }
2988
2989 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2990 {
2991     return uint64_to_float32_scalbn(a, scale, status);
2992 }
2993
2994 float32 uint64_to_float32(uint64_t a, float_status *status)
2995 {
2996     return uint64_to_float32_scalbn(a, 0, status);
2997 }
2998
2999 float32 uint32_to_float32(uint32_t a, float_status *status)
3000 {
3001     return uint64_to_float32_scalbn(a, 0, status);
3002 }
3003
3004 float32 uint16_to_float32(uint16_t a, float_status *status)
3005 {
3006     return uint64_to_float32_scalbn(a, 0, status);
3007 }
3008
3009 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3010 {
3011     FloatParts64 pa = uint_to_float(a, scale, status);
3012     return float64_round_pack_canonical(pa, status);
3013 }
3014
3015 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3016 {
3017     return uint64_to_float64_scalbn(a, scale, status);
3018 }
3019
3020 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3021 {
3022     return uint64_to_float64_scalbn(a, scale, status);
3023 }
3024
3025 float64 uint64_to_float64(uint64_t a, float_status *status)
3026 {
3027     return uint64_to_float64_scalbn(a, 0, status);
3028 }
3029
3030 float64 uint32_to_float64(uint32_t a, float_status *status)
3031 {
3032     return uint64_to_float64_scalbn(a, 0, status);
3033 }
3034
3035 float64 uint16_to_float64(uint16_t a, float_status *status)
3036 {
3037     return uint64_to_float64_scalbn(a, 0, status);
3038 }
3039
3040 /*
3041  * Returns the result of converting the unsigned integer `a' to the
3042  * bfloat16 format.
3043  */
3044
3045 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3046 {
3047     FloatParts64 pa = uint_to_float(a, scale, status);
3048     return bfloat16_round_pack_canonical(pa, status);
3049 }
3050
3051 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3052 {
3053     return uint64_to_bfloat16_scalbn(a, scale, status);
3054 }
3055
3056 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3057 {
3058     return uint64_to_bfloat16_scalbn(a, scale, status);
3059 }
3060
3061 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3062 {
3063     return uint64_to_bfloat16_scalbn(a, 0, status);
3064 }
3065
3066 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3067 {
3068     return uint64_to_bfloat16_scalbn(a, 0, status);
3069 }
3070
3071 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3072 {
3073     return uint64_to_bfloat16_scalbn(a, 0, status);
3074 }
3075
3076 /* Float Min/Max */
3077 /* min() and max() functions. These can't be implemented as
3078  * 'compare and pick one input' because that would mishandle
3079  * NaNs and +0 vs -0.
3080  *
3081  * minnum() and maxnum() functions. These are similar to the min()
3082  * and max() functions but if one of the arguments is a QNaN and
3083  * the other is numerical then the numerical argument is returned.
3084  * SNaNs will get quietened before being returned.
3085  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3086  * and maxNum() operations. min() and max() are the typical min/max
3087  * semantics provided by many CPUs which predate that specification.
3088  *
3089  * minnummag() and maxnummag() functions correspond to minNumMag()
3090  * and minNumMag() from the IEEE-754 2008.
3091  */
3092 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3093                                 bool ieee, bool ismag, float_status *s)
3094 {
3095     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3096         if (ieee) {
3097             /* Takes two floating-point values `a' and `b', one of
3098              * which is a NaN, and returns the appropriate NaN
3099              * result. If either `a' or `b' is a signaling NaN,
3100              * the invalid exception is raised.
3101              */
3102             if (is_snan(a.cls) || is_snan(b.cls)) {
3103                 return pick_nan(a, b, s);
3104             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3105                 return b;
3106             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3107                 return a;
3108             }
3109         }
3110         return pick_nan(a, b, s);
3111     } else {
3112         int a_exp, b_exp;
3113
3114         switch (a.cls) {
3115         case float_class_normal:
3116             a_exp = a.exp;
3117             break;
3118         case float_class_inf:
3119             a_exp = INT_MAX;
3120             break;
3121         case float_class_zero:
3122             a_exp = INT_MIN;
3123             break;
3124         default:
3125             g_assert_not_reached();
3126             break;
3127         }
3128         switch (b.cls) {
3129         case float_class_normal:
3130             b_exp = b.exp;
3131             break;
3132         case float_class_inf:
3133             b_exp = INT_MAX;
3134             break;
3135         case float_class_zero:
3136             b_exp = INT_MIN;
3137             break;
3138         default:
3139             g_assert_not_reached();
3140             break;
3141         }
3142
3143         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3144             bool a_less = a_exp < b_exp;
3145             if (a_exp == b_exp) {
3146                 a_less = a.frac < b.frac;
3147             }
3148             return a_less ^ ismin ? b : a;
3149         }
3150
3151         if (a.sign == b.sign) {
3152             bool a_less = a_exp < b_exp;
3153             if (a_exp == b_exp) {
3154                 a_less = a.frac < b.frac;
3155             }
3156             return a.sign ^ a_less ^ ismin ? b : a;
3157         } else {
3158             return a.sign ^ ismin ? b : a;
3159         }
3160     }
3161 }
3162
3163 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3164 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3165                                      float_status *s)                   \
3166 {                                                                       \
3167     FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);             \
3168     FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);             \
3169     FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3170                                                                         \
3171     return float ## sz ## _round_pack_canonical(pr, s);                 \
3172 }
3173
3174 MINMAX(16, min, true, false, false)
3175 MINMAX(16, minnum, true, true, false)
3176 MINMAX(16, minnummag, true, true, true)
3177 MINMAX(16, max, false, false, false)
3178 MINMAX(16, maxnum, false, true, false)
3179 MINMAX(16, maxnummag, false, true, true)
3180
3181 MINMAX(32, min, true, false, false)
3182 MINMAX(32, minnum, true, true, false)
3183 MINMAX(32, minnummag, true, true, true)
3184 MINMAX(32, max, false, false, false)
3185 MINMAX(32, maxnum, false, true, false)
3186 MINMAX(32, maxnummag, false, true, true)
3187
3188 MINMAX(64, min, true, false, false)
3189 MINMAX(64, minnum, true, true, false)
3190 MINMAX(64, minnummag, true, true, true)
3191 MINMAX(64, max, false, false, false)
3192 MINMAX(64, maxnum, false, true, false)
3193 MINMAX(64, maxnummag, false, true, true)
3194
3195 #undef MINMAX
3196
3197 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3198 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3199 {                                                                       \
3200     FloatParts64 pa = bfloat16_unpack_canonical(a, s);                    \
3201     FloatParts64 pb = bfloat16_unpack_canonical(b, s);                    \
3202     FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3203                                                                         \
3204     return bfloat16_round_pack_canonical(pr, s);                        \
3205 }
3206
3207 BF16_MINMAX(min, true, false, false)
3208 BF16_MINMAX(minnum, true, true, false)
3209 BF16_MINMAX(minnummag, true, true, true)
3210 BF16_MINMAX(max, false, false, false)
3211 BF16_MINMAX(maxnum, false, true, false)
3212 BF16_MINMAX(maxnummag, false, true, true)
3213
3214 #undef BF16_MINMAX
3215
3216 /* Floating point compare */
3217 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3218                                     float_status *s)
3219 {
3220     if (is_nan(a.cls) || is_nan(b.cls)) {
3221         if (!is_quiet ||
3222             a.cls == float_class_snan ||
3223             b.cls == float_class_snan) {
3224             float_raise(float_flag_invalid, s);
3225         }
3226         return float_relation_unordered;
3227     }
3228
3229     if (a.cls == float_class_zero) {
3230         if (b.cls == float_class_zero) {
3231             return float_relation_equal;
3232         }
3233         return b.sign ? float_relation_greater : float_relation_less;
3234     } else if (b.cls == float_class_zero) {
3235         return a.sign ? float_relation_less : float_relation_greater;
3236     }
3237
3238     /* The only really important thing about infinity is its sign. If
3239      * both are infinities the sign marks the smallest of the two.
3240      */
3241     if (a.cls == float_class_inf) {
3242         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3243             return float_relation_equal;
3244         }
3245         return a.sign ? float_relation_less : float_relation_greater;
3246     } else if (b.cls == float_class_inf) {
3247         return b.sign ? float_relation_greater : float_relation_less;
3248     }
3249
3250     if (a.sign != b.sign) {
3251         return a.sign ? float_relation_less : float_relation_greater;
3252     }
3253
3254     if (a.exp == b.exp) {
3255         if (a.frac == b.frac) {
3256             return float_relation_equal;
3257         }
3258         if (a.sign) {
3259             return a.frac > b.frac ?
3260                 float_relation_less : float_relation_greater;
3261         } else {
3262             return a.frac > b.frac ?
3263                 float_relation_greater : float_relation_less;
3264         }
3265     } else {
3266         if (a.sign) {
3267             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3268         } else {
3269             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3270         }
3271     }
3272 }
3273
3274 #define COMPARE(name, attr, sz)                                         \
3275 static int attr                                                         \
3276 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3277 {                                                                       \
3278     FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);             \
3279     FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);             \
3280     return compare_floats(pa, pb, is_quiet, s);                         \
3281 }
3282
3283 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3284 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3285 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3286
3287 #undef COMPARE
3288
3289 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3290 {
3291     return soft_f16_compare(a, b, false, s);
3292 }
3293
3294 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3295 {
3296     return soft_f16_compare(a, b, true, s);
3297 }
3298
3299 static FloatRelation QEMU_FLATTEN
3300 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3301 {
3302     union_float32 ua, ub;
3303
3304     ua.s = xa;
3305     ub.s = xb;
3306
3307     if (QEMU_NO_HARDFLOAT) {
3308         goto soft;
3309     }
3310
3311     float32_input_flush2(&ua.s, &ub.s, s);
3312     if (isgreaterequal(ua.h, ub.h)) {
3313         if (isgreater(ua.h, ub.h)) {
3314             return float_relation_greater;
3315         }
3316         return float_relation_equal;
3317     }
3318     if (likely(isless(ua.h, ub.h))) {
3319         return float_relation_less;
3320     }
3321     /* The only condition remaining is unordered.
3322      * Fall through to set flags.
3323      */
3324  soft:
3325     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3326 }
3327
3328 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3329 {
3330     return f32_compare(a, b, false, s);
3331 }
3332
3333 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3334 {
3335     return f32_compare(a, b, true, s);
3336 }
3337
3338 static FloatRelation QEMU_FLATTEN
3339 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3340 {
3341     union_float64 ua, ub;
3342
3343     ua.s = xa;
3344     ub.s = xb;
3345
3346     if (QEMU_NO_HARDFLOAT) {
3347         goto soft;
3348     }
3349
3350     float64_input_flush2(&ua.s, &ub.s, s);
3351     if (isgreaterequal(ua.h, ub.h)) {
3352         if (isgreater(ua.h, ub.h)) {
3353             return float_relation_greater;
3354         }
3355         return float_relation_equal;
3356     }
3357     if (likely(isless(ua.h, ub.h))) {
3358         return float_relation_less;
3359     }
3360     /* The only condition remaining is unordered.
3361      * Fall through to set flags.
3362      */
3363  soft:
3364     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3365 }
3366
3367 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3368 {
3369     return f64_compare(a, b, false, s);
3370 }
3371
3372 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3373 {
3374     return f64_compare(a, b, true, s);
3375 }
3376
3377 static FloatRelation QEMU_FLATTEN
3378 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3379 {
3380     FloatParts64 pa = bfloat16_unpack_canonical(a, s);
3381     FloatParts64 pb = bfloat16_unpack_canonical(b, s);
3382     return compare_floats(pa, pb, is_quiet, s);
3383 }
3384
3385 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3386 {
3387     return soft_bf16_compare(a, b, false, s);
3388 }
3389
3390 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3391 {
3392     return soft_bf16_compare(a, b, true, s);
3393 }
3394
3395 /* Multiply A by 2 raised to the power N.  */
3396 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3397 {
3398     if (unlikely(is_nan(a.cls))) {
3399         return return_nan(a, s);
3400     }
3401     if (a.cls == float_class_normal) {
3402         /* The largest float type (even though not supported by FloatParts64)
3403          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3404          * still allows rounding to infinity, without allowing overflow
3405          * within the int32_t that backs FloatParts64.exp.
3406          */
3407         n = MIN(MAX(n, -0x10000), 0x10000);
3408         a.exp += n;
3409     }
3410     return a;
3411 }
3412
3413 float16 float16_scalbn(float16 a, int n, float_status *status)
3414 {
3415     FloatParts64 pa = float16_unpack_canonical(a, status);
3416     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3417     return float16_round_pack_canonical(pr, status);
3418 }
3419
3420 float32 float32_scalbn(float32 a, int n, float_status *status)
3421 {
3422     FloatParts64 pa = float32_unpack_canonical(a, status);
3423     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3424     return float32_round_pack_canonical(pr, status);
3425 }
3426
3427 float64 float64_scalbn(float64 a, int n, float_status *status)
3428 {
3429     FloatParts64 pa = float64_unpack_canonical(a, status);
3430     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3431     return float64_round_pack_canonical(pr, status);
3432 }
3433
3434 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3435 {
3436     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3437     FloatParts64 pr = scalbn_decomposed(pa, n, status);
3438     return bfloat16_round_pack_canonical(pr, status);
3439 }
3440
3441 /*
3442  * Square Root
3443  *
3444  * The old softfloat code did an approximation step before zeroing in
3445  * on the final result. However for simpleness we just compute the
3446  * square root by iterating down from the implicit bit to enough extra
3447  * bits to ensure we get a correctly rounded result.
3448  *
3449  * This does mean however the calculation is slower than before,
3450  * especially for 64 bit floats.
3451  */
3452
3453 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3454 {
3455     uint64_t a_frac, r_frac, s_frac;
3456     int bit, last_bit;
3457
3458     if (is_nan(a.cls)) {
3459         return return_nan(a, s);
3460     }
3461     if (a.cls == float_class_zero) {
3462         return a;  /* sqrt(+-0) = +-0 */
3463     }
3464     if (a.sign) {
3465         float_raise(float_flag_invalid, s);
3466         parts_default_nan(&a, s);
3467         return a;
3468     }
3469     if (a.cls == float_class_inf) {
3470         return a;  /* sqrt(+inf) = +inf */
3471     }
3472
3473     assert(a.cls == float_class_normal);
3474
3475     /* We need two overflow bits at the top. Adding room for that is a
3476      * right shift. If the exponent is odd, we can discard the low bit
3477      * by multiplying the fraction by 2; that's a left shift. Combine
3478      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3479      */
3480     a_frac = a.frac >> (2 - (a.exp & 1));
3481     a.exp >>= 1;
3482
3483     /* Bit-by-bit computation of sqrt.  */
3484     r_frac = 0;
3485     s_frac = 0;
3486
3487     /* Iterate from implicit bit down to the 3 extra bits to compute a
3488      * properly rounded result. Remember we've inserted two more bits
3489      * at the top, so these positions are two less.
3490      */
3491     bit = DECOMPOSED_BINARY_POINT - 2;
3492     last_bit = MAX(p->frac_shift - 4, 0);
3493     do {
3494         uint64_t q = 1ULL << bit;
3495         uint64_t t_frac = s_frac + q;
3496         if (t_frac <= a_frac) {
3497             s_frac = t_frac + q;
3498             a_frac -= t_frac;
3499             r_frac += q;
3500         }
3501         a_frac <<= 1;
3502     } while (--bit >= last_bit);
3503
3504     /* Undo the right shift done above. If there is any remaining
3505      * fraction, the result is inexact. Set the sticky bit.
3506      */
3507     a.frac = (r_frac << 2) + (a_frac != 0);
3508
3509     return a;
3510 }
3511
3512 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3513 {
3514     FloatParts64 pa = float16_unpack_canonical(a, status);
3515     FloatParts64 pr = sqrt_float(pa, status, &float16_params);
3516     return float16_round_pack_canonical(pr, status);
3517 }
3518
3519 static float32 QEMU_SOFTFLOAT_ATTR
3520 soft_f32_sqrt(float32 a, float_status *status)
3521 {
3522     FloatParts64 pa = float32_unpack_canonical(a, status);
3523     FloatParts64 pr = sqrt_float(pa, status, &float32_params);
3524     return float32_round_pack_canonical(pr, status);
3525 }
3526
3527 static float64 QEMU_SOFTFLOAT_ATTR
3528 soft_f64_sqrt(float64 a, float_status *status)
3529 {
3530     FloatParts64 pa = float64_unpack_canonical(a, status);
3531     FloatParts64 pr = sqrt_float(pa, status, &float64_params);
3532     return float64_round_pack_canonical(pr, status);
3533 }
3534
3535 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3536 {
3537     union_float32 ua, ur;
3538
3539     ua.s = xa;
3540     if (unlikely(!can_use_fpu(s))) {
3541         goto soft;
3542     }
3543
3544     float32_input_flush1(&ua.s, s);
3545     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3546         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3547                        fpclassify(ua.h) == FP_ZERO) ||
3548                      signbit(ua.h))) {
3549             goto soft;
3550         }
3551     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3552                         float32_is_neg(ua.s))) {
3553         goto soft;
3554     }
3555     ur.h = sqrtf(ua.h);
3556     return ur.s;
3557
3558  soft:
3559     return soft_f32_sqrt(ua.s, s);
3560 }
3561
3562 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3563 {
3564     union_float64 ua, ur;
3565
3566     ua.s = xa;
3567     if (unlikely(!can_use_fpu(s))) {
3568         goto soft;
3569     }
3570
3571     float64_input_flush1(&ua.s, s);
3572     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3573         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3574                        fpclassify(ua.h) == FP_ZERO) ||
3575                      signbit(ua.h))) {
3576             goto soft;
3577         }
3578     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3579                         float64_is_neg(ua.s))) {
3580         goto soft;
3581     }
3582     ur.h = sqrt(ua.h);
3583     return ur.s;
3584
3585  soft:
3586     return soft_f64_sqrt(ua.s, s);
3587 }
3588
3589 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3590 {
3591     FloatParts64 pa = bfloat16_unpack_canonical(a, status);
3592     FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
3593     return bfloat16_round_pack_canonical(pr, status);
3594 }
3595
3596 /*----------------------------------------------------------------------------
3597 | The pattern for a default generated NaN.
3598 *----------------------------------------------------------------------------*/
3599
3600 float16 float16_default_nan(float_status *status)
3601 {
3602     FloatParts64 p;
3603
3604     parts_default_nan(&p, status);
3605     p.frac >>= float16_params.frac_shift;
3606     return float16_pack_raw(p);
3607 }
3608
3609 float32 float32_default_nan(float_status *status)
3610 {
3611     FloatParts64 p;
3612
3613     parts_default_nan(&p, status);
3614     p.frac >>= float32_params.frac_shift;
3615     return float32_pack_raw(p);
3616 }
3617
3618 float64 float64_default_nan(float_status *status)
3619 {
3620     FloatParts64 p;
3621
3622     parts_default_nan(&p, status);
3623     p.frac >>= float64_params.frac_shift;
3624     return float64_pack_raw(p);
3625 }
3626
3627 float128 float128_default_nan(float_status *status)
3628 {
3629     FloatParts64 p;
3630     float128 r;
3631
3632     parts_default_nan(&p, status);
3633     /* Extrapolate from the choices made by parts_default_nan to fill
3634      * in the quad-floating format.  If the low bit is set, assume we
3635      * want to set all non-snan bits.
3636      */
3637     r.low = -(p.frac & 1);
3638     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3639     r.high |= UINT64_C(0x7FFF000000000000);
3640     r.high |= (uint64_t)p.sign << 63;
3641
3642     return r;
3643 }
3644
3645 bfloat16 bfloat16_default_nan(float_status *status)
3646 {
3647     FloatParts64 p;
3648
3649     parts_default_nan(&p, status);
3650     p.frac >>= bfloat16_params.frac_shift;
3651     return bfloat16_pack_raw(p);
3652 }
3653
3654 /*----------------------------------------------------------------------------
3655 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3656 *----------------------------------------------------------------------------*/
3657
3658 float16 float16_silence_nan(float16 a, float_status *status)
3659 {
3660     FloatParts64 p;
3661
3662     float16_unpack_raw(&p, a);
3663     p.frac <<= float16_params.frac_shift;
3664     p = parts_silence_nan(p, status);
3665     p.frac >>= float16_params.frac_shift;
3666     return float16_pack_raw(p);
3667 }
3668
3669 float32 float32_silence_nan(float32 a, float_status *status)
3670 {
3671     FloatParts64 p;
3672
3673     float32_unpack_raw(&p, a);
3674     p.frac <<= float32_params.frac_shift;
3675     p = parts_silence_nan(p, status);
3676     p.frac >>= float32_params.frac_shift;
3677     return float32_pack_raw(p);
3678 }
3679
3680 float64 float64_silence_nan(float64 a, float_status *status)
3681 {
3682     FloatParts64 p;
3683
3684     float64_unpack_raw(&p, a);
3685     p.frac <<= float64_params.frac_shift;
3686     p = parts_silence_nan(p, status);
3687     p.frac >>= float64_params.frac_shift;
3688     return float64_pack_raw(p);
3689 }
3690
3691 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3692 {
3693     FloatParts64 p;
3694
3695     bfloat16_unpack_raw(&p, a);
3696     p.frac <<= bfloat16_params.frac_shift;
3697     p = parts_silence_nan(p, status);
3698     p.frac >>= bfloat16_params.frac_shift;
3699     return bfloat16_pack_raw(p);
3700 }
3701
3702 /*----------------------------------------------------------------------------
3703 | If `a' is denormal and we are in flush-to-zero mode then set the
3704 | input-denormal exception and return zero. Otherwise just return the value.
3705 *----------------------------------------------------------------------------*/
3706
3707 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3708 {
3709     if (p.exp == 0 && p.frac != 0) {
3710         float_raise(float_flag_input_denormal, status);
3711         return true;
3712     }
3713
3714     return false;
3715 }
3716
3717 float16 float16_squash_input_denormal(float16 a, float_status *status)
3718 {
3719     if (status->flush_inputs_to_zero) {
3720         FloatParts64 p;
3721
3722         float16_unpack_raw(&p, a);
3723         if (parts_squash_denormal(p, status)) {
3724             return float16_set_sign(float16_zero, p.sign);
3725         }
3726     }
3727     return a;
3728 }
3729
3730 float32 float32_squash_input_denormal(float32 a, float_status *status)
3731 {
3732     if (status->flush_inputs_to_zero) {
3733         FloatParts64 p;
3734
3735         float32_unpack_raw(&p, a);
3736         if (parts_squash_denormal(p, status)) {
3737             return float32_set_sign(float32_zero, p.sign);
3738         }
3739     }
3740     return a;
3741 }
3742
3743 float64 float64_squash_input_denormal(float64 a, float_status *status)
3744 {
3745     if (status->flush_inputs_to_zero) {
3746         FloatParts64 p;
3747
3748         float64_unpack_raw(&p, a);
3749         if (parts_squash_denormal(p, status)) {
3750             return float64_set_sign(float64_zero, p.sign);
3751         }
3752     }
3753     return a;
3754 }
3755
3756 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3757 {
3758     if (status->flush_inputs_to_zero) {
3759         FloatParts64 p;
3760
3761         bfloat16_unpack_raw(&p, a);
3762         if (parts_squash_denormal(p, status)) {
3763             return bfloat16_set_sign(bfloat16_zero, p.sign);
3764         }
3765     }
3766     return a;
3767 }
3768
3769 /*----------------------------------------------------------------------------
3770 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3771 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3772 | input.  If `zSign' is 1, the input is negated before being converted to an
3773 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3774 | is simply rounded to an integer, with the inexact exception raised if the
3775 | input cannot be represented exactly as an integer.  However, if the fixed-
3776 | point input is too large, the invalid exception is raised and the largest
3777 | positive or negative integer is returned.
3778 *----------------------------------------------------------------------------*/
3779
3780 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3781                                  float_status *status)
3782 {
3783     int8_t roundingMode;
3784     bool roundNearestEven;
3785     int8_t roundIncrement, roundBits;
3786     int32_t z;
3787
3788     roundingMode = status->float_rounding_mode;
3789     roundNearestEven = ( roundingMode == float_round_nearest_even );
3790     switch (roundingMode) {
3791     case float_round_nearest_even:
3792     case float_round_ties_away:
3793         roundIncrement = 0x40;
3794         break;
3795     case float_round_to_zero:
3796         roundIncrement = 0;
3797         break;
3798     case float_round_up:
3799         roundIncrement = zSign ? 0 : 0x7f;
3800         break;
3801     case float_round_down:
3802         roundIncrement = zSign ? 0x7f : 0;
3803         break;
3804     case float_round_to_odd:
3805         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3806         break;
3807     default:
3808         abort();
3809     }
3810     roundBits = absZ & 0x7F;
3811     absZ = ( absZ + roundIncrement )>>7;
3812     if (!(roundBits ^ 0x40) && roundNearestEven) {
3813         absZ &= ~1;
3814     }
3815     z = absZ;
3816     if ( zSign ) z = - z;
3817     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3818         float_raise(float_flag_invalid, status);
3819         return zSign ? INT32_MIN : INT32_MAX;
3820     }
3821     if (roundBits) {
3822         float_raise(float_flag_inexact, status);
3823     }
3824     return z;
3825
3826 }
3827
3828 /*----------------------------------------------------------------------------
3829 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3830 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3831 | and returns the properly rounded 64-bit integer corresponding to the input.
3832 | If `zSign' is 1, the input is negated before being converted to an integer.
3833 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3834 | the inexact exception raised if the input cannot be represented exactly as
3835 | an integer.  However, if the fixed-point input is too large, the invalid
3836 | exception is raised and the largest positive or negative integer is
3837 | returned.
3838 *----------------------------------------------------------------------------*/
3839
3840 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3841                                float_status *status)
3842 {
3843     int8_t roundingMode;
3844     bool roundNearestEven, increment;
3845     int64_t z;
3846
3847     roundingMode = status->float_rounding_mode;
3848     roundNearestEven = ( roundingMode == float_round_nearest_even );
3849     switch (roundingMode) {
3850     case float_round_nearest_even:
3851     case float_round_ties_away:
3852         increment = ((int64_t) absZ1 < 0);
3853         break;
3854     case float_round_to_zero:
3855         increment = 0;
3856         break;
3857     case float_round_up:
3858         increment = !zSign && absZ1;
3859         break;
3860     case float_round_down:
3861         increment = zSign && absZ1;
3862         break;
3863     case float_round_to_odd:
3864         increment = !(absZ0 & 1) && absZ1;
3865         break;
3866     default:
3867         abort();
3868     }
3869     if ( increment ) {
3870         ++absZ0;
3871         if ( absZ0 == 0 ) goto overflow;
3872         if (!(absZ1 << 1) && roundNearestEven) {
3873             absZ0 &= ~1;
3874         }
3875     }
3876     z = absZ0;
3877     if ( zSign ) z = - z;
3878     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3879  overflow:
3880         float_raise(float_flag_invalid, status);
3881         return zSign ? INT64_MIN : INT64_MAX;
3882     }
3883     if (absZ1) {
3884         float_raise(float_flag_inexact, status);
3885     }
3886     return z;
3887
3888 }
3889
3890 /*----------------------------------------------------------------------------
3891 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3892 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3893 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3894 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3895 | with the inexact exception raised if the input cannot be represented exactly
3896 | as an integer.  However, if the fixed-point input is too large, the invalid
3897 | exception is raised and the largest unsigned integer is returned.
3898 *----------------------------------------------------------------------------*/
3899
3900 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3901                                 uint64_t absZ1, float_status *status)
3902 {
3903     int8_t roundingMode;
3904     bool roundNearestEven, increment;
3905
3906     roundingMode = status->float_rounding_mode;
3907     roundNearestEven = (roundingMode == float_round_nearest_even);
3908     switch (roundingMode) {
3909     case float_round_nearest_even:
3910     case float_round_ties_away:
3911         increment = ((int64_t)absZ1 < 0);
3912         break;
3913     case float_round_to_zero:
3914         increment = 0;
3915         break;
3916     case float_round_up:
3917         increment = !zSign && absZ1;
3918         break;
3919     case float_round_down:
3920         increment = zSign && absZ1;
3921         break;
3922     case float_round_to_odd:
3923         increment = !(absZ0 & 1) && absZ1;
3924         break;
3925     default:
3926         abort();
3927     }
3928     if (increment) {
3929         ++absZ0;
3930         if (absZ0 == 0) {
3931             float_raise(float_flag_invalid, status);
3932             return UINT64_MAX;
3933         }
3934         if (!(absZ1 << 1) && roundNearestEven) {
3935             absZ0 &= ~1;
3936         }
3937     }
3938
3939     if (zSign && absZ0) {
3940         float_raise(float_flag_invalid, status);
3941         return 0;
3942     }
3943
3944     if (absZ1) {
3945         float_raise(float_flag_inexact, status);
3946     }
3947     return absZ0;
3948 }
3949
3950 /*----------------------------------------------------------------------------
3951 | Normalizes the subnormal single-precision floating-point value represented
3952 | by the denormalized significand `aSig'.  The normalized exponent and
3953 | significand are stored at the locations pointed to by `zExpPtr' and
3954 | `zSigPtr', respectively.
3955 *----------------------------------------------------------------------------*/
3956
3957 static void
3958  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3959 {
3960     int8_t shiftCount;
3961
3962     shiftCount = clz32(aSig) - 8;
3963     *zSigPtr = aSig<<shiftCount;
3964     *zExpPtr = 1 - shiftCount;
3965
3966 }
3967
3968 /*----------------------------------------------------------------------------
3969 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3970 | and significand `zSig', and returns the proper single-precision floating-
3971 | point value corresponding to the abstract input.  Ordinarily, the abstract
3972 | value is simply rounded and packed into the single-precision format, with
3973 | the inexact exception raised if the abstract input cannot be represented
3974 | exactly.  However, if the abstract value is too large, the overflow and
3975 | inexact exceptions are raised and an infinity or maximal finite value is
3976 | returned.  If the abstract value is too small, the input value is rounded to
3977 | a subnormal number, and the underflow and inexact exceptions are raised if
3978 | the abstract input cannot be represented exactly as a subnormal single-
3979 | precision floating-point number.
3980 |     The input significand `zSig' has its binary point between bits 30
3981 | and 29, which is 7 bits to the left of the usual location.  This shifted
3982 | significand must be normalized or smaller.  If `zSig' is not normalized,
3983 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3984 | and it must not require rounding.  In the usual case that `zSig' is
3985 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3986 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3987 | Binary Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
3989
3990 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3991                                    float_status *status)
3992 {
3993     int8_t roundingMode;
3994     bool roundNearestEven;
3995     int8_t roundIncrement, roundBits;
3996     bool isTiny;
3997
3998     roundingMode = status->float_rounding_mode;
3999     roundNearestEven = ( roundingMode == float_round_nearest_even );
4000     switch (roundingMode) {
4001     case float_round_nearest_even:
4002     case float_round_ties_away:
4003         roundIncrement = 0x40;
4004         break;
4005     case float_round_to_zero:
4006         roundIncrement = 0;
4007         break;
4008     case float_round_up:
4009         roundIncrement = zSign ? 0 : 0x7f;
4010         break;
4011     case float_round_down:
4012         roundIncrement = zSign ? 0x7f : 0;
4013         break;
4014     case float_round_to_odd:
4015         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4016         break;
4017     default:
4018         abort();
4019         break;
4020     }
4021     roundBits = zSig & 0x7F;
4022     if ( 0xFD <= (uint16_t) zExp ) {
4023         if (    ( 0xFD < zExp )
4024              || (    ( zExp == 0xFD )
4025                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4026            ) {
4027             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4028                                    roundIncrement != 0;
4029             float_raise(float_flag_overflow | float_flag_inexact, status);
4030             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4031         }
4032         if ( zExp < 0 ) {
4033             if (status->flush_to_zero) {
4034                 float_raise(float_flag_output_denormal, status);
4035                 return packFloat32(zSign, 0, 0);
4036             }
4037             isTiny = status->tininess_before_rounding
4038                   || (zExp < -1)
4039                   || (zSig + roundIncrement < 0x80000000);
4040             shift32RightJamming( zSig, - zExp, &zSig );
4041             zExp = 0;
4042             roundBits = zSig & 0x7F;
4043             if (isTiny && roundBits) {
4044                 float_raise(float_flag_underflow, status);
4045             }
4046             if (roundingMode == float_round_to_odd) {
4047                 /*
4048                  * For round-to-odd case, the roundIncrement depends on
4049                  * zSig which just changed.
4050                  */
4051                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4052             }
4053         }
4054     }
4055     if (roundBits) {
4056         float_raise(float_flag_inexact, status);
4057     }
4058     zSig = ( zSig + roundIncrement )>>7;
4059     if (!(roundBits ^ 0x40) && roundNearestEven) {
4060         zSig &= ~1;
4061     }
4062     if ( zSig == 0 ) zExp = 0;
4063     return packFloat32( zSign, zExp, zSig );
4064
4065 }
4066
4067 /*----------------------------------------------------------------------------
4068 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4069 | and significand `zSig', and returns the proper single-precision floating-
4070 | point value corresponding to the abstract input.  This routine is just like
4071 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4072 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4073 | floating-point exponent.
4074 *----------------------------------------------------------------------------*/
4075
4076 static float32
4077  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4078                               float_status *status)
4079 {
4080     int8_t shiftCount;
4081
4082     shiftCount = clz32(zSig) - 1;
4083     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4084                                status);
4085
4086 }
4087
4088 /*----------------------------------------------------------------------------
4089 | Normalizes the subnormal double-precision floating-point value represented
4090 | by the denormalized significand `aSig'.  The normalized exponent and
4091 | significand are stored at the locations pointed to by `zExpPtr' and
4092 | `zSigPtr', respectively.
4093 *----------------------------------------------------------------------------*/
4094
4095 static void
4096  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4097 {
4098     int8_t shiftCount;
4099
4100     shiftCount = clz64(aSig) - 11;
4101     *zSigPtr = aSig<<shiftCount;
4102     *zExpPtr = 1 - shiftCount;
4103
4104 }
4105
4106 /*----------------------------------------------------------------------------
4107 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4108 | double-precision floating-point value, returning the result.  After being
4109 | shifted into the proper positions, the three fields are simply added
4110 | together to form the result.  This means that any integer portion of `zSig'
4111 | will be added into the exponent.  Since a properly normalized significand
4112 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4113 | than the desired result exponent whenever `zSig' is a complete, normalized
4114 | significand.
4115 *----------------------------------------------------------------------------*/
4116
4117 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4118 {
4119
4120     return make_float64(
4121         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4122
4123 }
4124
4125 /*----------------------------------------------------------------------------
4126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4127 | and significand `zSig', and returns the proper double-precision floating-
4128 | point value corresponding to the abstract input.  Ordinarily, the abstract
4129 | value is simply rounded and packed into the double-precision format, with
4130 | the inexact exception raised if the abstract input cannot be represented
4131 | exactly.  However, if the abstract value is too large, the overflow and
4132 | inexact exceptions are raised and an infinity or maximal finite value is
4133 | returned.  If the abstract value is too small, the input value is rounded to
4134 | a subnormal number, and the underflow and inexact exceptions are raised if
4135 | the abstract input cannot be represented exactly as a subnormal double-
4136 | precision floating-point number.
4137 |     The input significand `zSig' has its binary point between bits 62
4138 | and 61, which is 10 bits to the left of the usual location.  This shifted
4139 | significand must be normalized or smaller.  If `zSig' is not normalized,
4140 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4141 | and it must not require rounding.  In the usual case that `zSig' is
4142 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4143 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4144 | Binary Floating-Point Arithmetic.
4145 *----------------------------------------------------------------------------*/
4146
4147 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4148                                    float_status *status)
4149 {
4150     int8_t roundingMode;
4151     bool roundNearestEven;
4152     int roundIncrement, roundBits;
4153     bool isTiny;
4154
4155     roundingMode = status->float_rounding_mode;
4156     roundNearestEven = ( roundingMode == float_round_nearest_even );
4157     switch (roundingMode) {
4158     case float_round_nearest_even:
4159     case float_round_ties_away:
4160         roundIncrement = 0x200;
4161         break;
4162     case float_round_to_zero:
4163         roundIncrement = 0;
4164         break;
4165     case float_round_up:
4166         roundIncrement = zSign ? 0 : 0x3ff;
4167         break;
4168     case float_round_down:
4169         roundIncrement = zSign ? 0x3ff : 0;
4170         break;
4171     case float_round_to_odd:
4172         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4173         break;
4174     default:
4175         abort();
4176     }
4177     roundBits = zSig & 0x3FF;
4178     if ( 0x7FD <= (uint16_t) zExp ) {
4179         if (    ( 0x7FD < zExp )
4180              || (    ( zExp == 0x7FD )
4181                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4182            ) {
4183             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4184                                    roundIncrement != 0;
4185             float_raise(float_flag_overflow | float_flag_inexact, status);
4186             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4187         }
4188         if ( zExp < 0 ) {
4189             if (status->flush_to_zero) {
4190                 float_raise(float_flag_output_denormal, status);
4191                 return packFloat64(zSign, 0, 0);
4192             }
4193             isTiny = status->tininess_before_rounding
4194                   || (zExp < -1)
4195                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4196             shift64RightJamming( zSig, - zExp, &zSig );
4197             zExp = 0;
4198             roundBits = zSig & 0x3FF;
4199             if (isTiny && roundBits) {
4200                 float_raise(float_flag_underflow, status);
4201             }
4202             if (roundingMode == float_round_to_odd) {
4203                 /*
4204                  * For round-to-odd case, the roundIncrement depends on
4205                  * zSig which just changed.
4206                  */
4207                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4208             }
4209         }
4210     }
4211     if (roundBits) {
4212         float_raise(float_flag_inexact, status);
4213     }
4214     zSig = ( zSig + roundIncrement )>>10;
4215     if (!(roundBits ^ 0x200) && roundNearestEven) {
4216         zSig &= ~1;
4217     }
4218     if ( zSig == 0 ) zExp = 0;
4219     return packFloat64( zSign, zExp, zSig );
4220
4221 }
4222
4223 /*----------------------------------------------------------------------------
4224 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4225 | and significand `zSig', and returns the proper double-precision floating-
4226 | point value corresponding to the abstract input.  This routine is just like
4227 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4228 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4229 | floating-point exponent.
4230 *----------------------------------------------------------------------------*/
4231
4232 static float64
4233  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4234                               float_status *status)
4235 {
4236     int8_t shiftCount;
4237
4238     shiftCount = clz64(zSig) - 1;
4239     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4240                                status);
4241
4242 }
4243
4244 /*----------------------------------------------------------------------------
4245 | Normalizes the subnormal extended double-precision floating-point value
4246 | represented by the denormalized significand `aSig'.  The normalized exponent
4247 | and significand are stored at the locations pointed to by `zExpPtr' and
4248 | `zSigPtr', respectively.
4249 *----------------------------------------------------------------------------*/
4250
4251 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4252                                 uint64_t *zSigPtr)
4253 {
4254     int8_t shiftCount;
4255
4256     shiftCount = clz64(aSig);
4257     *zSigPtr = aSig<<shiftCount;
4258     *zExpPtr = 1 - shiftCount;
4259 }
4260
4261 /*----------------------------------------------------------------------------
4262 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4263 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4264 | and returns the proper extended double-precision floating-point value
4265 | corresponding to the abstract input.  Ordinarily, the abstract value is
4266 | rounded and packed into the extended double-precision format, with the
4267 | inexact exception raised if the abstract input cannot be represented
4268 | exactly.  However, if the abstract value is too large, the overflow and
4269 | inexact exceptions are raised and an infinity or maximal finite value is
4270 | returned.  If the abstract value is too small, the input value is rounded to
4271 | a subnormal number, and the underflow and inexact exceptions are raised if
4272 | the abstract input cannot be represented exactly as a subnormal extended
4273 | double-precision floating-point number.
4274 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4275 | number of bits as single or double precision, respectively.  Otherwise, the
4276 | result is rounded to the full precision of the extended double-precision
4277 | format.
4278 |     The input significand must be normalized or smaller.  If the input
4279 | significand is not normalized, `zExp' must be 0; in that case, the result
4280 | returned is a subnormal number, and it must not require rounding.  The
4281 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4282 | Floating-Point Arithmetic.
4283 *----------------------------------------------------------------------------*/
4284
4285 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4286                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4287                               float_status *status)
4288 {
4289     int8_t roundingMode;
4290     bool roundNearestEven, increment, isTiny;
4291     int64_t roundIncrement, roundMask, roundBits;
4292
4293     roundingMode = status->float_rounding_mode;
4294     roundNearestEven = ( roundingMode == float_round_nearest_even );
4295     if ( roundingPrecision == 80 ) goto precision80;
4296     if ( roundingPrecision == 64 ) {
4297         roundIncrement = UINT64_C(0x0000000000000400);
4298         roundMask = UINT64_C(0x00000000000007FF);
4299     }
4300     else if ( roundingPrecision == 32 ) {
4301         roundIncrement = UINT64_C(0x0000008000000000);
4302         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4303     }
4304     else {
4305         goto precision80;
4306     }
4307     zSig0 |= ( zSig1 != 0 );
4308     switch (roundingMode) {
4309     case float_round_nearest_even:
4310     case float_round_ties_away:
4311         break;
4312     case float_round_to_zero:
4313         roundIncrement = 0;
4314         break;
4315     case float_round_up:
4316         roundIncrement = zSign ? 0 : roundMask;
4317         break;
4318     case float_round_down:
4319         roundIncrement = zSign ? roundMask : 0;
4320         break;
4321     default:
4322         abort();
4323     }
4324     roundBits = zSig0 & roundMask;
4325     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4326         if (    ( 0x7FFE < zExp )
4327              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4328            ) {
4329             goto overflow;
4330         }
4331         if ( zExp <= 0 ) {
4332             if (status->flush_to_zero) {
4333                 float_raise(float_flag_output_denormal, status);
4334                 return packFloatx80(zSign, 0, 0);
4335             }
4336             isTiny = status->tininess_before_rounding
4337                   || (zExp < 0 )
4338                   || (zSig0 <= zSig0 + roundIncrement);
4339             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4340             zExp = 0;
4341             roundBits = zSig0 & roundMask;
4342             if (isTiny && roundBits) {
4343                 float_raise(float_flag_underflow, status);
4344             }
4345             if (roundBits) {
4346                 float_raise(float_flag_inexact, status);
4347             }
4348             zSig0 += roundIncrement;
4349             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4350             roundIncrement = roundMask + 1;
4351             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4352                 roundMask |= roundIncrement;
4353             }
4354             zSig0 &= ~ roundMask;
4355             return packFloatx80( zSign, zExp, zSig0 );
4356         }
4357     }
4358     if (roundBits) {
4359         float_raise(float_flag_inexact, status);
4360     }
4361     zSig0 += roundIncrement;
4362     if ( zSig0 < roundIncrement ) {
4363         ++zExp;
4364         zSig0 = UINT64_C(0x8000000000000000);
4365     }
4366     roundIncrement = roundMask + 1;
4367     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4368         roundMask |= roundIncrement;
4369     }
4370     zSig0 &= ~ roundMask;
4371     if ( zSig0 == 0 ) zExp = 0;
4372     return packFloatx80( zSign, zExp, zSig0 );
4373  precision80:
4374     switch (roundingMode) {
4375     case float_round_nearest_even:
4376     case float_round_ties_away:
4377         increment = ((int64_t)zSig1 < 0);
4378         break;
4379     case float_round_to_zero:
4380         increment = 0;
4381         break;
4382     case float_round_up:
4383         increment = !zSign && zSig1;
4384         break;
4385     case float_round_down:
4386         increment = zSign && zSig1;
4387         break;
4388     default:
4389         abort();
4390     }
4391     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4392         if (    ( 0x7FFE < zExp )
4393              || (    ( zExp == 0x7FFE )
4394                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4395                   && increment
4396                 )
4397            ) {
4398             roundMask = 0;
4399  overflow:
4400             float_raise(float_flag_overflow | float_flag_inexact, status);
4401             if (    ( roundingMode == float_round_to_zero )
4402                  || ( zSign && ( roundingMode == float_round_up ) )
4403                  || ( ! zSign && ( roundingMode == float_round_down ) )
4404                ) {
4405                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4406             }
4407             return packFloatx80(zSign,
4408                                 floatx80_infinity_high,
4409                                 floatx80_infinity_low);
4410         }
4411         if ( zExp <= 0 ) {
4412             isTiny = status->tininess_before_rounding
4413                   || (zExp < 0)
4414                   || !increment
4415                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4416             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4417             zExp = 0;
4418             if (isTiny && zSig1) {
4419                 float_raise(float_flag_underflow, status);
4420             }
4421             if (zSig1) {
4422                 float_raise(float_flag_inexact, status);
4423             }
4424             switch (roundingMode) {
4425             case float_round_nearest_even:
4426             case float_round_ties_away:
4427                 increment = ((int64_t)zSig1 < 0);
4428                 break;
4429             case float_round_to_zero:
4430                 increment = 0;
4431                 break;
4432             case float_round_up:
4433                 increment = !zSign && zSig1;
4434                 break;
4435             case float_round_down:
4436                 increment = zSign && zSig1;
4437                 break;
4438             default:
4439                 abort();
4440             }
4441             if ( increment ) {
4442                 ++zSig0;
4443                 if (!(zSig1 << 1) && roundNearestEven) {
4444                     zSig0 &= ~1;
4445                 }
4446                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4447             }
4448             return packFloatx80( zSign, zExp, zSig0 );
4449         }
4450     }
4451     if (zSig1) {
4452         float_raise(float_flag_inexact, status);
4453     }
4454     if ( increment ) {
4455         ++zSig0;
4456         if ( zSig0 == 0 ) {
4457             ++zExp;
4458             zSig0 = UINT64_C(0x8000000000000000);
4459         }
4460         else {
4461             if (!(zSig1 << 1) && roundNearestEven) {
4462                 zSig0 &= ~1;
4463             }
4464         }
4465     }
4466     else {
4467         if ( zSig0 == 0 ) zExp = 0;
4468     }
4469     return packFloatx80( zSign, zExp, zSig0 );
4470
4471 }
4472
4473 /*----------------------------------------------------------------------------
4474 | Takes an abstract floating-point value having sign `zSign', exponent
4475 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4476 | and returns the proper extended double-precision floating-point value
4477 | corresponding to the abstract input.  This routine is just like
4478 | `roundAndPackFloatx80' except that the input significand does not have to be
4479 | normalized.
4480 *----------------------------------------------------------------------------*/
4481
4482 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4483                                        bool zSign, int32_t zExp,
4484                                        uint64_t zSig0, uint64_t zSig1,
4485                                        float_status *status)
4486 {
4487     int8_t shiftCount;
4488
4489     if ( zSig0 == 0 ) {
4490         zSig0 = zSig1;
4491         zSig1 = 0;
4492         zExp -= 64;
4493     }
4494     shiftCount = clz64(zSig0);
4495     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4496     zExp -= shiftCount;
4497     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4498                                 zSig0, zSig1, status);
4499
4500 }
4501
4502 /*----------------------------------------------------------------------------
4503 | Returns the least-significant 64 fraction bits of the quadruple-precision
4504 | floating-point value `a'.
4505 *----------------------------------------------------------------------------*/
4506
4507 static inline uint64_t extractFloat128Frac1( float128 a )
4508 {
4509
4510     return a.low;
4511
4512 }
4513
4514 /*----------------------------------------------------------------------------
4515 | Returns the most-significant 48 fraction bits of the quadruple-precision
4516 | floating-point value `a'.
4517 *----------------------------------------------------------------------------*/
4518
4519 static inline uint64_t extractFloat128Frac0( float128 a )
4520 {
4521
4522     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4523
4524 }
4525
4526 /*----------------------------------------------------------------------------
4527 | Returns the exponent bits of the quadruple-precision floating-point value
4528 | `a'.
4529 *----------------------------------------------------------------------------*/
4530
4531 static inline int32_t extractFloat128Exp( float128 a )
4532 {
4533
4534     return ( a.high>>48 ) & 0x7FFF;
4535
4536 }
4537
4538 /*----------------------------------------------------------------------------
4539 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4540 *----------------------------------------------------------------------------*/
4541
4542 static inline bool extractFloat128Sign(float128 a)
4543 {
4544     return a.high >> 63;
4545 }
4546
4547 /*----------------------------------------------------------------------------
4548 | Normalizes the subnormal quadruple-precision floating-point value
4549 | represented by the denormalized significand formed by the concatenation of
4550 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4551 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4552 | significand are stored at the location pointed to by `zSig0Ptr', and the
4553 | least significant 64 bits of the normalized significand are stored at the
4554 | location pointed to by `zSig1Ptr'.
4555 *----------------------------------------------------------------------------*/
4556
4557 static void
4558  normalizeFloat128Subnormal(
4559      uint64_t aSig0,
4560      uint64_t aSig1,
4561      int32_t *zExpPtr,
4562      uint64_t *zSig0Ptr,
4563      uint64_t *zSig1Ptr
4564  )
4565 {
4566     int8_t shiftCount;
4567
4568     if ( aSig0 == 0 ) {
4569         shiftCount = clz64(aSig1) - 15;
4570         if ( shiftCount < 0 ) {
4571             *zSig0Ptr = aSig1>>( - shiftCount );
4572             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4573         }
4574         else {
4575             *zSig0Ptr = aSig1<<shiftCount;
4576             *zSig1Ptr = 0;
4577         }
4578         *zExpPtr = - shiftCount - 63;
4579     }
4580     else {
4581         shiftCount = clz64(aSig0) - 15;
4582         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4583         *zExpPtr = 1 - shiftCount;
4584     }
4585
4586 }
4587
4588 /*----------------------------------------------------------------------------
4589 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4590 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4591 | floating-point value, returning the result.  After being shifted into the
4592 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4593 | added together to form the most significant 32 bits of the result.  This
4594 | means that any integer portion of `zSig0' will be added into the exponent.
4595 | Since a properly normalized significand will have an integer portion equal
4596 | to 1, the `zExp' input should be 1 less than the desired result exponent
4597 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4598 | significand.
4599 *----------------------------------------------------------------------------*/
4600
4601 static inline float128
4602 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4603 {
4604     float128 z;
4605
4606     z.low = zSig1;
4607     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4608     return z;
4609 }
4610
4611 /*----------------------------------------------------------------------------
4612 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4613 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4614 | and `zSig2', and returns the proper quadruple-precision floating-point value
4615 | corresponding to the abstract input.  Ordinarily, the abstract value is
4616 | simply rounded and packed into the quadruple-precision format, with the
4617 | inexact exception raised if the abstract input cannot be represented
4618 | exactly.  However, if the abstract value is too large, the overflow and
4619 | inexact exceptions are raised and an infinity or maximal finite value is
4620 | returned.  If the abstract value is too small, the input value is rounded to
4621 | a subnormal number, and the underflow and inexact exceptions are raised if
4622 | the abstract input cannot be represented exactly as a subnormal quadruple-
4623 | precision floating-point number.
4624 |     The input significand must be normalized or smaller.  If the input
4625 | significand is not normalized, `zExp' must be 0; in that case, the result
4626 | returned is a subnormal number, and it must not require rounding.  In the
4627 | usual case that the input significand is normalized, `zExp' must be 1 less
4628 | than the ``true'' floating-point exponent.  The handling of underflow and
4629 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
4632 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4633                                      uint64_t zSig0, uint64_t zSig1,
4634                                      uint64_t zSig2, float_status *status)
4635 {
4636     int8_t roundingMode;
4637     bool roundNearestEven, increment, isTiny;
4638
4639     roundingMode = status->float_rounding_mode;
4640     roundNearestEven = ( roundingMode == float_round_nearest_even );
4641     switch (roundingMode) {
4642     case float_round_nearest_even:
4643     case float_round_ties_away:
4644         increment = ((int64_t)zSig2 < 0);
4645         break;
4646     case float_round_to_zero:
4647         increment = 0;
4648         break;
4649     case float_round_up:
4650         increment = !zSign && zSig2;
4651         break;
4652     case float_round_down:
4653         increment = zSign && zSig2;
4654         break;
4655     case float_round_to_odd:
4656         increment = !(zSig1 & 0x1) && zSig2;
4657         break;
4658     default:
4659         abort();
4660     }
4661     if ( 0x7FFD <= (uint32_t) zExp ) {
4662         if (    ( 0x7FFD < zExp )
4663              || (    ( zExp == 0x7FFD )
4664                   && eq128(
4665                          UINT64_C(0x0001FFFFFFFFFFFF),
4666                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4667                          zSig0,
4668                          zSig1
4669                      )
4670                   && increment
4671                 )
4672            ) {
4673             float_raise(float_flag_overflow | float_flag_inexact, status);
4674             if (    ( roundingMode == float_round_to_zero )
4675                  || ( zSign && ( roundingMode == float_round_up ) )
4676                  || ( ! zSign && ( roundingMode == float_round_down ) )
4677                  || (roundingMode == float_round_to_odd)
4678                ) {
4679                 return
4680                     packFloat128(
4681                         zSign,
4682                         0x7FFE,
4683                         UINT64_C(0x0000FFFFFFFFFFFF),
4684                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4685                     );
4686             }
4687             return packFloat128( zSign, 0x7FFF, 0, 0 );
4688         }
4689         if ( zExp < 0 ) {
4690             if (status->flush_to_zero) {
4691                 float_raise(float_flag_output_denormal, status);
4692                 return packFloat128(zSign, 0, 0, 0);
4693             }
4694             isTiny = status->tininess_before_rounding
4695                   || (zExp < -1)
4696                   || !increment
4697                   || lt128(zSig0, zSig1,
4698                            UINT64_C(0x0001FFFFFFFFFFFF),
4699                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4700             shift128ExtraRightJamming(
4701                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4702             zExp = 0;
4703             if (isTiny && zSig2) {
4704                 float_raise(float_flag_underflow, status);
4705             }
4706             switch (roundingMode) {
4707             case float_round_nearest_even:
4708             case float_round_ties_away:
4709                 increment = ((int64_t)zSig2 < 0);
4710                 break;
4711             case float_round_to_zero:
4712                 increment = 0;
4713                 break;
4714             case float_round_up:
4715                 increment = !zSign && zSig2;
4716                 break;
4717             case float_round_down:
4718                 increment = zSign && zSig2;
4719                 break;
4720             case float_round_to_odd:
4721                 increment = !(zSig1 & 0x1) && zSig2;
4722                 break;
4723             default:
4724                 abort();
4725             }
4726         }
4727     }
4728     if (zSig2) {
4729         float_raise(float_flag_inexact, status);
4730     }
4731     if ( increment ) {
4732         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4733         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4734             zSig1 &= ~1;
4735         }
4736     }
4737     else {
4738         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4739     }
4740     return packFloat128( zSign, zExp, zSig0, zSig1 );
4741
4742 }
4743
4744 /*----------------------------------------------------------------------------
4745 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4746 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4747 | returns the proper quadruple-precision floating-point value corresponding
4748 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4749 | except that the input significand has fewer bits and does not have to be
4750 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4751 | point exponent.
4752 *----------------------------------------------------------------------------*/
4753
4754 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4755                                               uint64_t zSig0, uint64_t zSig1,
4756                                               float_status *status)
4757 {
4758     int8_t shiftCount;
4759     uint64_t zSig2;
4760
4761     if ( zSig0 == 0 ) {
4762         zSig0 = zSig1;
4763         zSig1 = 0;
4764         zExp -= 64;
4765     }
4766     shiftCount = clz64(zSig0) - 15;
4767     if ( 0 <= shiftCount ) {
4768         zSig2 = 0;
4769         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4770     }
4771     else {
4772         shift128ExtraRightJamming(
4773             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4774     }
4775     zExp -= shiftCount;
4776     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4777
4778 }
4779
4780
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of converting the 32-bit two's complement integer `a'
4783 | to the extended double-precision floating-point format.  The conversion
4784 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4785 | Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787
4788 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4789 {
4790     bool zSign;
4791     uint32_t absA;
4792     int8_t shiftCount;
4793     uint64_t zSig;
4794
4795     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4796     zSign = ( a < 0 );
4797     absA = zSign ? - a : a;
4798     shiftCount = clz32(absA) + 32;
4799     zSig = absA;
4800     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4801
4802 }
4803
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the 32-bit two's complement integer `a' to
4806 | the quadruple-precision floating-point format.  The conversion is performed
4807 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4809
4810 float128 int32_to_float128(int32_t a, float_status *status)
4811 {
4812     bool zSign;
4813     uint32_t absA;
4814     int8_t shiftCount;
4815     uint64_t zSig0;
4816
4817     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4818     zSign = ( a < 0 );
4819     absA = zSign ? - a : a;
4820     shiftCount = clz32(absA) + 17;
4821     zSig0 = absA;
4822     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4823
4824 }
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the 64-bit two's complement integer `a'
4828 | to the extended double-precision floating-point format.  The conversion
4829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4830 | Arithmetic.
4831 *----------------------------------------------------------------------------*/
4832
4833 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4834 {
4835     bool zSign;
4836     uint64_t absA;
4837     int8_t shiftCount;
4838
4839     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4840     zSign = ( a < 0 );
4841     absA = zSign ? - a : a;
4842     shiftCount = clz64(absA);
4843     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4844
4845 }
4846
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of converting the 64-bit two's complement integer `a' to
4849 | the quadruple-precision floating-point format.  The conversion is performed
4850 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4851 *----------------------------------------------------------------------------*/
4852
4853 float128 int64_to_float128(int64_t a, float_status *status)
4854 {
4855     bool zSign;
4856     uint64_t absA;
4857     int8_t shiftCount;
4858     int32_t zExp;
4859     uint64_t zSig0, zSig1;
4860
4861     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4862     zSign = ( a < 0 );
4863     absA = zSign ? - a : a;
4864     shiftCount = clz64(absA) + 49;
4865     zExp = 0x406E - shiftCount;
4866     if ( 64 <= shiftCount ) {
4867         zSig1 = 0;
4868         zSig0 = absA;
4869         shiftCount -= 64;
4870     }
4871     else {
4872         zSig1 = absA;
4873         zSig0 = 0;
4874     }
4875     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4876     return packFloat128( zSign, zExp, zSig0, zSig1 );
4877
4878 }
4879
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the 64-bit unsigned integer `a'
4882 | to the quadruple-precision floating-point format.  The conversion is performed
4883 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4885
4886 float128 uint64_to_float128(uint64_t a, float_status *status)
4887 {
4888     if (a == 0) {
4889         return float128_zero;
4890     }
4891     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4892 }
4893
4894 /*----------------------------------------------------------------------------
4895 | Returns the result of converting the single-precision floating-point value
4896 | `a' to the extended double-precision floating-point format.  The conversion
4897 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4898 | Arithmetic.
4899 *----------------------------------------------------------------------------*/
4900
4901 floatx80 float32_to_floatx80(float32 a, float_status *status)
4902 {
4903     bool aSign;
4904     int aExp;
4905     uint32_t aSig;
4906
4907     a = float32_squash_input_denormal(a, status);
4908     aSig = extractFloat32Frac( a );
4909     aExp = extractFloat32Exp( a );
4910     aSign = extractFloat32Sign( a );
4911     if ( aExp == 0xFF ) {
4912         if (aSig) {
4913             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4914                                                status);
4915             return floatx80_silence_nan(res, status);
4916         }
4917         return packFloatx80(aSign,
4918                             floatx80_infinity_high,
4919                             floatx80_infinity_low);
4920     }
4921     if ( aExp == 0 ) {
4922         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4923         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4924     }
4925     aSig |= 0x00800000;
4926     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4927
4928 }
4929
4930 /*----------------------------------------------------------------------------
4931 | Returns the result of converting the single-precision floating-point value
4932 | `a' to the double-precision floating-point format.  The conversion is
4933 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4934 | Arithmetic.
4935 *----------------------------------------------------------------------------*/
4936
4937 float128 float32_to_float128(float32 a, float_status *status)
4938 {
4939     bool aSign;
4940     int aExp;
4941     uint32_t aSig;
4942
4943     a = float32_squash_input_denormal(a, status);
4944     aSig = extractFloat32Frac( a );
4945     aExp = extractFloat32Exp( a );
4946     aSign = extractFloat32Sign( a );
4947     if ( aExp == 0xFF ) {
4948         if (aSig) {
4949             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4950         }
4951         return packFloat128( aSign, 0x7FFF, 0, 0 );
4952     }
4953     if ( aExp == 0 ) {
4954         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4955         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4956         --aExp;
4957     }
4958     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4959
4960 }
4961
4962 /*----------------------------------------------------------------------------
4963 | Returns the remainder of the single-precision floating-point value `a'
4964 | with respect to the corresponding value `b'.  The operation is performed
4965 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967
4968 float32 float32_rem(float32 a, float32 b, float_status *status)
4969 {
4970     bool aSign, zSign;
4971     int aExp, bExp, expDiff;
4972     uint32_t aSig, bSig;
4973     uint32_t q;
4974     uint64_t aSig64, bSig64, q64;
4975     uint32_t alternateASig;
4976     int32_t sigMean;
4977     a = float32_squash_input_denormal(a, status);
4978     b = float32_squash_input_denormal(b, status);
4979
4980     aSig = extractFloat32Frac( a );
4981     aExp = extractFloat32Exp( a );
4982     aSign = extractFloat32Sign( a );
4983     bSig = extractFloat32Frac( b );
4984     bExp = extractFloat32Exp( b );
4985     if ( aExp == 0xFF ) {
4986         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4987             return propagateFloat32NaN(a, b, status);
4988         }
4989         float_raise(float_flag_invalid, status);
4990         return float32_default_nan(status);
4991     }
4992     if ( bExp == 0xFF ) {
4993         if (bSig) {
4994             return propagateFloat32NaN(a, b, status);
4995         }
4996         return a;
4997     }
4998     if ( bExp == 0 ) {
4999         if ( bSig == 0 ) {
5000             float_raise(float_flag_invalid, status);
5001             return float32_default_nan(status);
5002         }
5003         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5004     }
5005     if ( aExp == 0 ) {
5006         if ( aSig == 0 ) return a;
5007         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5008     }
5009     expDiff = aExp - bExp;
5010     aSig |= 0x00800000;
5011     bSig |= 0x00800000;
5012     if ( expDiff < 32 ) {
5013         aSig <<= 8;
5014         bSig <<= 8;
5015         if ( expDiff < 0 ) {
5016             if ( expDiff < -1 ) return a;
5017             aSig >>= 1;
5018         }
5019         q = ( bSig <= aSig );
5020         if ( q ) aSig -= bSig;
5021         if ( 0 < expDiff ) {
5022             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5023             q >>= 32 - expDiff;
5024             bSig >>= 2;
5025             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5026         }
5027         else {
5028             aSig >>= 2;
5029             bSig >>= 2;
5030         }
5031     }
5032     else {
5033         if ( bSig <= aSig ) aSig -= bSig;
5034         aSig64 = ( (uint64_t) aSig )<<40;
5035         bSig64 = ( (uint64_t) bSig )<<40;
5036         expDiff -= 64;
5037         while ( 0 < expDiff ) {
5038             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5039             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5040             aSig64 = - ( ( bSig * q64 )<<38 );
5041             expDiff -= 62;
5042         }
5043         expDiff += 64;
5044         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5045         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5046         q = q64>>( 64 - expDiff );
5047         bSig <<= 6;
5048         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5049     }
5050     do {
5051         alternateASig = aSig;
5052         ++q;
5053         aSig -= bSig;
5054     } while ( 0 <= (int32_t) aSig );
5055     sigMean = aSig + alternateASig;
5056     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5057         aSig = alternateASig;
5058     }
5059     zSign = ( (int32_t) aSig < 0 );
5060     if ( zSign ) aSig = - aSig;
5061     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5062 }
5063
5064
5065
5066 /*----------------------------------------------------------------------------
5067 | Returns the binary exponential of the single-precision floating-point value
5068 | `a'. The operation is performed according to the IEC/IEEE Standard for
5069 | Binary Floating-Point Arithmetic.
5070 |
5071 | Uses the following identities:
5072 |
5073 | 1. -------------------------------------------------------------------------
5074 |      x    x*ln(2)
5075 |     2  = e
5076 |
5077 | 2. -------------------------------------------------------------------------
5078 |                      2     3     4     5           n
5079 |      x        x     x     x     x     x           x
5080 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5081 |               1!    2!    3!    4!    5!          n!
5082 *----------------------------------------------------------------------------*/
5083
5084 static const float64 float32_exp2_coefficients[15] =
5085 {
5086     const_float64( 0x3ff0000000000000ll ), /*  1 */
5087     const_float64( 0x3fe0000000000000ll ), /*  2 */
5088     const_float64( 0x3fc5555555555555ll ), /*  3 */
5089     const_float64( 0x3fa5555555555555ll ), /*  4 */
5090     const_float64( 0x3f81111111111111ll ), /*  5 */
5091     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5092     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5093     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5094     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5095     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5096     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5097     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5098     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5099     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5100     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5101 };
5102
5103 float32 float32_exp2(float32 a, float_status *status)
5104 {
5105     bool aSign;
5106     int aExp;
5107     uint32_t aSig;
5108     float64 r, x, xn;
5109     int i;
5110     a = float32_squash_input_denormal(a, status);
5111
5112     aSig = extractFloat32Frac( a );
5113     aExp = extractFloat32Exp( a );
5114     aSign = extractFloat32Sign( a );
5115
5116     if ( aExp == 0xFF) {
5117         if (aSig) {
5118             return propagateFloat32NaN(a, float32_zero, status);
5119         }
5120         return (aSign) ? float32_zero : a;
5121     }
5122     if (aExp == 0) {
5123         if (aSig == 0) return float32_one;
5124     }
5125
5126     float_raise(float_flag_inexact, status);
5127
5128     /* ******************************* */
5129     /* using float64 for approximation */
5130     /* ******************************* */
5131     x = float32_to_float64(a, status);
5132     x = float64_mul(x, float64_ln2, status);
5133
5134     xn = x;
5135     r = float64_one;
5136     for (i = 0 ; i < 15 ; i++) {
5137         float64 f;
5138
5139         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5140         r = float64_add(r, f, status);
5141
5142         xn = float64_mul(xn, x, status);
5143     }
5144
5145     return float64_to_float32(r, status);
5146 }
5147
5148 /*----------------------------------------------------------------------------
5149 | Returns the binary log of the single-precision floating-point value `a'.
5150 | The operation is performed according to the IEC/IEEE Standard for Binary
5151 | Floating-Point Arithmetic.
5152 *----------------------------------------------------------------------------*/
5153 float32 float32_log2(float32 a, float_status *status)
5154 {
5155     bool aSign, zSign;
5156     int aExp;
5157     uint32_t aSig, zSig, i;
5158
5159     a = float32_squash_input_denormal(a, status);
5160     aSig = extractFloat32Frac( a );
5161     aExp = extractFloat32Exp( a );
5162     aSign = extractFloat32Sign( a );
5163
5164     if ( aExp == 0 ) {
5165         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5166         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5167     }
5168     if ( aSign ) {
5169         float_raise(float_flag_invalid, status);
5170         return float32_default_nan(status);
5171     }
5172     if ( aExp == 0xFF ) {
5173         if (aSig) {
5174             return propagateFloat32NaN(a, float32_zero, status);
5175         }
5176         return a;
5177     }
5178
5179     aExp -= 0x7F;
5180     aSig |= 0x00800000;
5181     zSign = aExp < 0;
5182     zSig = aExp << 23;
5183
5184     for (i = 1 << 22; i > 0; i >>= 1) {
5185         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5186         if ( aSig & 0x01000000 ) {
5187             aSig >>= 1;
5188             zSig |= i;
5189         }
5190     }
5191
5192     if ( zSign )
5193         zSig = -zSig;
5194
5195     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5196 }
5197
5198 /*----------------------------------------------------------------------------
5199 | Returns the result of converting the double-precision floating-point value
5200 | `a' to the extended double-precision floating-point format.  The conversion
5201 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5202 | Arithmetic.
5203 *----------------------------------------------------------------------------*/
5204
5205 floatx80 float64_to_floatx80(float64 a, float_status *status)
5206 {
5207     bool aSign;
5208     int aExp;
5209     uint64_t aSig;
5210
5211     a = float64_squash_input_denormal(a, status);
5212     aSig = extractFloat64Frac( a );
5213     aExp = extractFloat64Exp( a );
5214     aSign = extractFloat64Sign( a );
5215     if ( aExp == 0x7FF ) {
5216         if (aSig) {
5217             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5218                                                status);
5219             return floatx80_silence_nan(res, status);
5220         }
5221         return packFloatx80(aSign,
5222                             floatx80_infinity_high,
5223                             floatx80_infinity_low);
5224     }
5225     if ( aExp == 0 ) {
5226         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5227         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5228     }
5229     return
5230         packFloatx80(
5231             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5232
5233 }
5234
5235 /*----------------------------------------------------------------------------
5236 | Returns the result of converting the double-precision floating-point value
5237 | `a' to the quadruple-precision floating-point format.  The conversion is
5238 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5239 | Arithmetic.
5240 *----------------------------------------------------------------------------*/
5241
5242 float128 float64_to_float128(float64 a, float_status *status)
5243 {
5244     bool aSign;
5245     int aExp;
5246     uint64_t aSig, zSig0, zSig1;
5247
5248     a = float64_squash_input_denormal(a, status);
5249     aSig = extractFloat64Frac( a );
5250     aExp = extractFloat64Exp( a );
5251     aSign = extractFloat64Sign( a );
5252     if ( aExp == 0x7FF ) {
5253         if (aSig) {
5254             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5255         }
5256         return packFloat128( aSign, 0x7FFF, 0, 0 );
5257     }
5258     if ( aExp == 0 ) {
5259         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5260         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5261         --aExp;
5262     }
5263     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5264     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5265
5266 }
5267
5268
5269 /*----------------------------------------------------------------------------
5270 | Returns the remainder of the double-precision floating-point value `a'
5271 | with respect to the corresponding value `b'.  The operation is performed
5272 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5274
5275 float64 float64_rem(float64 a, float64 b, float_status *status)
5276 {
5277     bool aSign, zSign;
5278     int aExp, bExp, expDiff;
5279     uint64_t aSig, bSig;
5280     uint64_t q, alternateASig;
5281     int64_t sigMean;
5282
5283     a = float64_squash_input_denormal(a, status);
5284     b = float64_squash_input_denormal(b, status);
5285     aSig = extractFloat64Frac( a );
5286     aExp = extractFloat64Exp( a );
5287     aSign = extractFloat64Sign( a );
5288     bSig = extractFloat64Frac( b );
5289     bExp = extractFloat64Exp( b );
5290     if ( aExp == 0x7FF ) {
5291         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5292             return propagateFloat64NaN(a, b, status);
5293         }
5294         float_raise(float_flag_invalid, status);
5295         return float64_default_nan(status);
5296     }
5297     if ( bExp == 0x7FF ) {
5298         if (bSig) {
5299             return propagateFloat64NaN(a, b, status);
5300         }
5301         return a;
5302     }
5303     if ( bExp == 0 ) {
5304         if ( bSig == 0 ) {
5305             float_raise(float_flag_invalid, status);
5306             return float64_default_nan(status);
5307         }
5308         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5309     }
5310     if ( aExp == 0 ) {
5311         if ( aSig == 0 ) return a;
5312         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5313     }
5314     expDiff = aExp - bExp;
5315     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5316     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5317     if ( expDiff < 0 ) {
5318         if ( expDiff < -1 ) return a;
5319         aSig >>= 1;
5320     }
5321     q = ( bSig <= aSig );
5322     if ( q ) aSig -= bSig;
5323     expDiff -= 64;
5324     while ( 0 < expDiff ) {
5325         q = estimateDiv128To64( aSig, 0, bSig );
5326         q = ( 2 < q ) ? q - 2 : 0;
5327         aSig = - ( ( bSig>>2 ) * q );
5328         expDiff -= 62;
5329     }
5330     expDiff += 64;
5331     if ( 0 < expDiff ) {
5332         q = estimateDiv128To64( aSig, 0, bSig );
5333         q = ( 2 < q ) ? q - 2 : 0;
5334         q >>= 64 - expDiff;
5335         bSig >>= 2;
5336         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5337     }
5338     else {
5339         aSig >>= 2;
5340         bSig >>= 2;
5341     }
5342     do {
5343         alternateASig = aSig;
5344         ++q;
5345         aSig -= bSig;
5346     } while ( 0 <= (int64_t) aSig );
5347     sigMean = aSig + alternateASig;
5348     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5349         aSig = alternateASig;
5350     }
5351     zSign = ( (int64_t) aSig < 0 );
5352     if ( zSign ) aSig = - aSig;
5353     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5354
5355 }
5356
5357 /*----------------------------------------------------------------------------
5358 | Returns the binary log of the double-precision floating-point value `a'.
5359 | The operation is performed according to the IEC/IEEE Standard for Binary
5360 | Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362 float64 float64_log2(float64 a, float_status *status)
5363 {
5364     bool aSign, zSign;
5365     int aExp;
5366     uint64_t aSig, aSig0, aSig1, zSig, i;
5367     a = float64_squash_input_denormal(a, status);
5368
5369     aSig = extractFloat64Frac( a );
5370     aExp = extractFloat64Exp( a );
5371     aSign = extractFloat64Sign( a );
5372
5373     if ( aExp == 0 ) {
5374         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5375         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5376     }
5377     if ( aSign ) {
5378         float_raise(float_flag_invalid, status);
5379         return float64_default_nan(status);
5380     }
5381     if ( aExp == 0x7FF ) {
5382         if (aSig) {
5383             return propagateFloat64NaN(a, float64_zero, status);
5384         }
5385         return a;
5386     }
5387
5388     aExp -= 0x3FF;
5389     aSig |= UINT64_C(0x0010000000000000);
5390     zSign = aExp < 0;
5391     zSig = (uint64_t)aExp << 52;
5392     for (i = 1LL << 51; i > 0; i >>= 1) {
5393         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5394         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5395         if ( aSig & UINT64_C(0x0020000000000000) ) {
5396             aSig >>= 1;
5397             zSig |= i;
5398         }
5399     }
5400
5401     if ( zSign )
5402         zSig = -zSig;
5403     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5404 }
5405
5406 /*----------------------------------------------------------------------------
5407 | Returns the result of converting the extended double-precision floating-
5408 | point value `a' to the 32-bit two's complement integer format.  The
5409 | conversion is performed according to the IEC/IEEE Standard for Binary
5410 | Floating-Point Arithmetic---which means in particular that the conversion
5411 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5412 | largest positive integer is returned.  Otherwise, if the conversion
5413 | overflows, the largest integer with the same sign as `a' is returned.
5414 *----------------------------------------------------------------------------*/
5415
5416 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5417 {
5418     bool aSign;
5419     int32_t aExp, shiftCount;
5420     uint64_t aSig;
5421
5422     if (floatx80_invalid_encoding(a)) {
5423         float_raise(float_flag_invalid, status);
5424         return 1 << 31;
5425     }
5426     aSig = extractFloatx80Frac( a );
5427     aExp = extractFloatx80Exp( a );
5428     aSign = extractFloatx80Sign( a );
5429     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5430     shiftCount = 0x4037 - aExp;
5431     if ( shiftCount <= 0 ) shiftCount = 1;
5432     shift64RightJamming( aSig, shiftCount, &aSig );
5433     return roundAndPackInt32(aSign, aSig, status);
5434
5435 }
5436
5437 /*----------------------------------------------------------------------------
5438 | Returns the result of converting the extended double-precision floating-
5439 | point value `a' to the 32-bit two's complement integer format.  The
5440 | conversion is performed according to the IEC/IEEE Standard for Binary
5441 | Floating-Point Arithmetic, except that the conversion is always rounded
5442 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5443 | Otherwise, if the conversion overflows, the largest integer with the same
5444 | sign as `a' is returned.
5445 *----------------------------------------------------------------------------*/
5446
5447 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5448 {
5449     bool aSign;
5450     int32_t aExp, shiftCount;
5451     uint64_t aSig, savedASig;
5452     int32_t z;
5453
5454     if (floatx80_invalid_encoding(a)) {
5455         float_raise(float_flag_invalid, status);
5456         return 1 << 31;
5457     }
5458     aSig = extractFloatx80Frac( a );
5459     aExp = extractFloatx80Exp( a );
5460     aSign = extractFloatx80Sign( a );
5461     if ( 0x401E < aExp ) {
5462         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5463         goto invalid;
5464     }
5465     else if ( aExp < 0x3FFF ) {
5466         if (aExp || aSig) {
5467             float_raise(float_flag_inexact, status);
5468         }
5469         return 0;
5470     }
5471     shiftCount = 0x403E - aExp;
5472     savedASig = aSig;
5473     aSig >>= shiftCount;
5474     z = aSig;
5475     if ( aSign ) z = - z;
5476     if ( ( z < 0 ) ^ aSign ) {
5477  invalid:
5478         float_raise(float_flag_invalid, status);
5479         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5480     }
5481     if ( ( aSig<<shiftCount ) != savedASig ) {
5482         float_raise(float_flag_inexact, status);
5483     }
5484     return z;
5485
5486 }
5487
5488 /*----------------------------------------------------------------------------
5489 | Returns the result of converting the extended double-precision floating-
5490 | point value `a' to the 64-bit two's complement integer format.  The
5491 | conversion is performed according to the IEC/IEEE Standard for Binary
5492 | Floating-Point Arithmetic---which means in particular that the conversion
5493 | is rounded according to the current rounding mode.  If `a' is a NaN,
5494 | the largest positive integer is returned.  Otherwise, if the conversion
5495 | overflows, the largest integer with the same sign as `a' is returned.
5496 *----------------------------------------------------------------------------*/
5497
5498 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5499 {
5500     bool aSign;
5501     int32_t aExp, shiftCount;
5502     uint64_t aSig, aSigExtra;
5503
5504     if (floatx80_invalid_encoding(a)) {
5505         float_raise(float_flag_invalid, status);
5506         return 1ULL << 63;
5507     }
5508     aSig = extractFloatx80Frac( a );
5509     aExp = extractFloatx80Exp( a );
5510     aSign = extractFloatx80Sign( a );
5511     shiftCount = 0x403E - aExp;
5512     if ( shiftCount <= 0 ) {
5513         if ( shiftCount ) {
5514             float_raise(float_flag_invalid, status);
5515             if (!aSign || floatx80_is_any_nan(a)) {
5516                 return INT64_MAX;
5517             }
5518             return INT64_MIN;
5519         }
5520         aSigExtra = 0;
5521     }
5522     else {
5523         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5524     }
5525     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5526
5527 }
5528
5529 /*----------------------------------------------------------------------------
5530 | Returns the result of converting the extended double-precision floating-
5531 | point value `a' to the 64-bit two's complement integer format.  The
5532 | conversion is performed according to the IEC/IEEE Standard for Binary
5533 | Floating-Point Arithmetic, except that the conversion is always rounded
5534 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5535 | Otherwise, if the conversion overflows, the largest integer with the same
5536 | sign as `a' is returned.
5537 *----------------------------------------------------------------------------*/
5538
5539 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5540 {
5541     bool aSign;
5542     int32_t aExp, shiftCount;
5543     uint64_t aSig;
5544     int64_t z;
5545
5546     if (floatx80_invalid_encoding(a)) {
5547         float_raise(float_flag_invalid, status);
5548         return 1ULL << 63;
5549     }
5550     aSig = extractFloatx80Frac( a );
5551     aExp = extractFloatx80Exp( a );
5552     aSign = extractFloatx80Sign( a );
5553     shiftCount = aExp - 0x403E;
5554     if ( 0 <= shiftCount ) {
5555         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5556         if ( ( a.high != 0xC03E ) || aSig ) {
5557             float_raise(float_flag_invalid, status);
5558             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5559                 return INT64_MAX;
5560             }
5561         }
5562         return INT64_MIN;
5563     }
5564     else if ( aExp < 0x3FFF ) {
5565         if (aExp | aSig) {
5566             float_raise(float_flag_inexact, status);
5567         }
5568         return 0;
5569     }
5570     z = aSig>>( - shiftCount );
5571     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5572         float_raise(float_flag_inexact, status);
5573     }
5574     if ( aSign ) z = - z;
5575     return z;
5576
5577 }
5578
5579 /*----------------------------------------------------------------------------
5580 | Returns the result of converting the extended double-precision floating-
5581 | point value `a' to the single-precision floating-point format.  The
5582 | conversion is performed according to the IEC/IEEE Standard for Binary
5583 | Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585
5586 float32 floatx80_to_float32(floatx80 a, float_status *status)
5587 {
5588     bool aSign;
5589     int32_t aExp;
5590     uint64_t aSig;
5591
5592     if (floatx80_invalid_encoding(a)) {
5593         float_raise(float_flag_invalid, status);
5594         return float32_default_nan(status);
5595     }
5596     aSig = extractFloatx80Frac( a );
5597     aExp = extractFloatx80Exp( a );
5598     aSign = extractFloatx80Sign( a );
5599     if ( aExp == 0x7FFF ) {
5600         if ( (uint64_t) ( aSig<<1 ) ) {
5601             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5602                                              status);
5603             return float32_silence_nan(res, status);
5604         }
5605         return packFloat32( aSign, 0xFF, 0 );
5606     }
5607     shift64RightJamming( aSig, 33, &aSig );
5608     if ( aExp || aSig ) aExp -= 0x3F81;
5609     return roundAndPackFloat32(aSign, aExp, aSig, status);
5610
5611 }
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the extended double-precision floating-
5615 | point value `a' to the double-precision floating-point format.  The
5616 | conversion is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619
5620 float64 floatx80_to_float64(floatx80 a, float_status *status)
5621 {
5622     bool aSign;
5623     int32_t aExp;
5624     uint64_t aSig, zSig;
5625
5626     if (floatx80_invalid_encoding(a)) {
5627         float_raise(float_flag_invalid, status);
5628         return float64_default_nan(status);
5629     }
5630     aSig = extractFloatx80Frac( a );
5631     aExp = extractFloatx80Exp( a );
5632     aSign = extractFloatx80Sign( a );
5633     if ( aExp == 0x7FFF ) {
5634         if ( (uint64_t) ( aSig<<1 ) ) {
5635             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5636                                              status);
5637             return float64_silence_nan(res, status);
5638         }
5639         return packFloat64( aSign, 0x7FF, 0 );
5640     }
5641     shift64RightJamming( aSig, 1, &zSig );
5642     if ( aExp || aSig ) aExp -= 0x3C01;
5643     return roundAndPackFloat64(aSign, aExp, zSig, status);
5644
5645 }
5646
5647 /*----------------------------------------------------------------------------
5648 | Returns the result of converting the extended double-precision floating-
5649 | point value `a' to the quadruple-precision floating-point format.  The
5650 | conversion is performed according to the IEC/IEEE Standard for Binary
5651 | Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5653
5654 float128 floatx80_to_float128(floatx80 a, float_status *status)
5655 {
5656     bool aSign;
5657     int aExp;
5658     uint64_t aSig, zSig0, zSig1;
5659
5660     if (floatx80_invalid_encoding(a)) {
5661         float_raise(float_flag_invalid, status);
5662         return float128_default_nan(status);
5663     }
5664     aSig = extractFloatx80Frac( a );
5665     aExp = extractFloatx80Exp( a );
5666     aSign = extractFloatx80Sign( a );
5667     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5668         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5669                                            status);
5670         return float128_silence_nan(res, status);
5671     }
5672     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5673     return packFloat128( aSign, aExp, zSig0, zSig1 );
5674
5675 }
5676
5677 /*----------------------------------------------------------------------------
5678 | Rounds the extended double-precision floating-point value `a'
5679 | to the precision provided by floatx80_rounding_precision and returns the
5680 | result as an extended double-precision floating-point value.
5681 | The operation is performed according to the IEC/IEEE Standard for Binary
5682 | Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5684
5685 floatx80 floatx80_round(floatx80 a, float_status *status)
5686 {
5687     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5688                                 extractFloatx80Sign(a),
5689                                 extractFloatx80Exp(a),
5690                                 extractFloatx80Frac(a), 0, status);
5691 }
5692
5693 /*----------------------------------------------------------------------------
5694 | Rounds the extended double-precision floating-point value `a' to an integer,
5695 | and returns the result as an extended quadruple-precision floating-point
5696 | value.  The operation is performed according to the IEC/IEEE Standard for
5697 | Binary Floating-Point Arithmetic.
5698 *----------------------------------------------------------------------------*/
5699
5700 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5701 {
5702     bool aSign;
5703     int32_t aExp;
5704     uint64_t lastBitMask, roundBitsMask;
5705     floatx80 z;
5706
5707     if (floatx80_invalid_encoding(a)) {
5708         float_raise(float_flag_invalid, status);
5709         return floatx80_default_nan(status);
5710     }
5711     aExp = extractFloatx80Exp( a );
5712     if ( 0x403E <= aExp ) {
5713         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5714             return propagateFloatx80NaN(a, a, status);
5715         }
5716         return a;
5717     }
5718     if ( aExp < 0x3FFF ) {
5719         if (    ( aExp == 0 )
5720              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5721             return a;
5722         }
5723         float_raise(float_flag_inexact, status);
5724         aSign = extractFloatx80Sign( a );
5725         switch (status->float_rounding_mode) {
5726          case float_round_nearest_even:
5727             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5728                ) {
5729                 return
5730                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5731             }
5732             break;
5733         case float_round_ties_away:
5734             if (aExp == 0x3FFE) {
5735                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5736             }
5737             break;
5738          case float_round_down:
5739             return
5740                   aSign ?
5741                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5742                 : packFloatx80( 0, 0, 0 );
5743          case float_round_up:
5744             return
5745                   aSign ? packFloatx80( 1, 0, 0 )
5746                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5747
5748         case float_round_to_zero:
5749             break;
5750         default:
5751             g_assert_not_reached();
5752         }
5753         return packFloatx80( aSign, 0, 0 );
5754     }
5755     lastBitMask = 1;
5756     lastBitMask <<= 0x403E - aExp;
5757     roundBitsMask = lastBitMask - 1;
5758     z = a;
5759     switch (status->float_rounding_mode) {
5760     case float_round_nearest_even:
5761         z.low += lastBitMask>>1;
5762         if ((z.low & roundBitsMask) == 0) {
5763             z.low &= ~lastBitMask;
5764         }
5765         break;
5766     case float_round_ties_away:
5767         z.low += lastBitMask >> 1;
5768         break;
5769     case float_round_to_zero:
5770         break;
5771     case float_round_up:
5772         if (!extractFloatx80Sign(z)) {
5773             z.low += roundBitsMask;
5774         }
5775         break;
5776     case float_round_down:
5777         if (extractFloatx80Sign(z)) {
5778             z.low += roundBitsMask;
5779         }
5780         break;
5781     default:
5782         abort();
5783     }
5784     z.low &= ~ roundBitsMask;
5785     if ( z.low == 0 ) {
5786         ++z.high;
5787         z.low = UINT64_C(0x8000000000000000);
5788     }
5789     if (z.low != a.low) {
5790         float_raise(float_flag_inexact, status);
5791     }
5792     return z;
5793
5794 }
5795
5796 /*----------------------------------------------------------------------------
5797 | Returns the result of adding the absolute values of the extended double-
5798 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5799 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5800 | The addition is performed according to the IEC/IEEE Standard for Binary
5801 | Floating-Point Arithmetic.
5802 *----------------------------------------------------------------------------*/
5803
5804 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5805                                 float_status *status)
5806 {
5807     int32_t aExp, bExp, zExp;
5808     uint64_t aSig, bSig, zSig0, zSig1;
5809     int32_t expDiff;
5810
5811     aSig = extractFloatx80Frac( a );
5812     aExp = extractFloatx80Exp( a );
5813     bSig = extractFloatx80Frac( b );
5814     bExp = extractFloatx80Exp( b );
5815     expDiff = aExp - bExp;
5816     if ( 0 < expDiff ) {
5817         if ( aExp == 0x7FFF ) {
5818             if ((uint64_t)(aSig << 1)) {
5819                 return propagateFloatx80NaN(a, b, status);
5820             }
5821             return a;
5822         }
5823         if ( bExp == 0 ) --expDiff;
5824         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5825         zExp = aExp;
5826     }
5827     else if ( expDiff < 0 ) {
5828         if ( bExp == 0x7FFF ) {
5829             if ((uint64_t)(bSig << 1)) {
5830                 return propagateFloatx80NaN(a, b, status);
5831             }
5832             return packFloatx80(zSign,
5833                                 floatx80_infinity_high,
5834                                 floatx80_infinity_low);
5835         }
5836         if ( aExp == 0 ) ++expDiff;
5837         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5838         zExp = bExp;
5839     }
5840     else {
5841         if ( aExp == 0x7FFF ) {
5842             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5843                 return propagateFloatx80NaN(a, b, status);
5844             }
5845             return a;
5846         }
5847         zSig1 = 0;
5848         zSig0 = aSig + bSig;
5849         if ( aExp == 0 ) {
5850             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5851                 /* At least one of the values is a pseudo-denormal,
5852                  * and there is a carry out of the result.  */
5853                 zExp = 1;
5854                 goto shiftRight1;
5855             }
5856             if (zSig0 == 0) {
5857                 return packFloatx80(zSign, 0, 0);
5858             }
5859             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5860             goto roundAndPack;
5861         }
5862         zExp = aExp;
5863         goto shiftRight1;
5864     }
5865     zSig0 = aSig + bSig;
5866     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5867  shiftRight1:
5868     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5869     zSig0 |= UINT64_C(0x8000000000000000);
5870     ++zExp;
5871  roundAndPack:
5872     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5873                                 zSign, zExp, zSig0, zSig1, status);
5874 }
5875
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of subtracting the absolute values of the extended
5878 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5879 | difference is negated before being returned.  `zSign' is ignored if the
5880 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5881 | Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5883
5884 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5885                                 float_status *status)
5886 {
5887     int32_t aExp, bExp, zExp;
5888     uint64_t aSig, bSig, zSig0, zSig1;
5889     int32_t expDiff;
5890
5891     aSig = extractFloatx80Frac( a );
5892     aExp = extractFloatx80Exp( a );
5893     bSig = extractFloatx80Frac( b );
5894     bExp = extractFloatx80Exp( b );
5895     expDiff = aExp - bExp;
5896     if ( 0 < expDiff ) goto aExpBigger;
5897     if ( expDiff < 0 ) goto bExpBigger;
5898     if ( aExp == 0x7FFF ) {
5899         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5900             return propagateFloatx80NaN(a, b, status);
5901         }
5902         float_raise(float_flag_invalid, status);
5903         return floatx80_default_nan(status);
5904     }
5905     if ( aExp == 0 ) {
5906         aExp = 1;
5907         bExp = 1;
5908     }
5909     zSig1 = 0;
5910     if ( bSig < aSig ) goto aBigger;
5911     if ( aSig < bSig ) goto bBigger;
5912     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5913  bExpBigger:
5914     if ( bExp == 0x7FFF ) {
5915         if ((uint64_t)(bSig << 1)) {
5916             return propagateFloatx80NaN(a, b, status);
5917         }
5918         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5919                             floatx80_infinity_low);
5920     }
5921     if ( aExp == 0 ) ++expDiff;
5922     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5923  bBigger:
5924     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5925     zExp = bExp;
5926     zSign ^= 1;
5927     goto normalizeRoundAndPack;
5928  aExpBigger:
5929     if ( aExp == 0x7FFF ) {
5930         if ((uint64_t)(aSig << 1)) {
5931             return propagateFloatx80NaN(a, b, status);
5932         }
5933         return a;
5934     }
5935     if ( bExp == 0 ) --expDiff;
5936     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5937  aBigger:
5938     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5939     zExp = aExp;
5940  normalizeRoundAndPack:
5941     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5942                                          zSign, zExp, zSig0, zSig1, status);
5943 }
5944
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of adding the extended double-precision floating-point
5947 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950
5951 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5952 {
5953     bool aSign, bSign;
5954
5955     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5956         float_raise(float_flag_invalid, status);
5957         return floatx80_default_nan(status);
5958     }
5959     aSign = extractFloatx80Sign( a );
5960     bSign = extractFloatx80Sign( b );
5961     if ( aSign == bSign ) {
5962         return addFloatx80Sigs(a, b, aSign, status);
5963     }
5964     else {
5965         return subFloatx80Sigs(a, b, aSign, status);
5966     }
5967
5968 }
5969
5970 /*----------------------------------------------------------------------------
5971 | Returns the result of subtracting the extended double-precision floating-
5972 | point values `a' and `b'.  The operation is performed according to the
5973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5974 *----------------------------------------------------------------------------*/
5975
5976 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5977 {
5978     bool aSign, bSign;
5979
5980     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5981         float_raise(float_flag_invalid, status);
5982         return floatx80_default_nan(status);
5983     }
5984     aSign = extractFloatx80Sign( a );
5985     bSign = extractFloatx80Sign( b );
5986     if ( aSign == bSign ) {
5987         return subFloatx80Sigs(a, b, aSign, status);
5988     }
5989     else {
5990         return addFloatx80Sigs(a, b, aSign, status);
5991     }
5992
5993 }
5994
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of multiplying the extended double-precision floating-
5997 | point values `a' and `b'.  The operation is performed according to the
5998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5999 *----------------------------------------------------------------------------*/
6000
6001 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6002 {
6003     bool aSign, bSign, zSign;
6004     int32_t aExp, bExp, zExp;
6005     uint64_t aSig, bSig, zSig0, zSig1;
6006
6007     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6008         float_raise(float_flag_invalid, status);
6009         return floatx80_default_nan(status);
6010     }
6011     aSig = extractFloatx80Frac( a );
6012     aExp = extractFloatx80Exp( a );
6013     aSign = extractFloatx80Sign( a );
6014     bSig = extractFloatx80Frac( b );
6015     bExp = extractFloatx80Exp( b );
6016     bSign = extractFloatx80Sign( b );
6017     zSign = aSign ^ bSign;
6018     if ( aExp == 0x7FFF ) {
6019         if (    (uint64_t) ( aSig<<1 )
6020              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6021             return propagateFloatx80NaN(a, b, status);
6022         }
6023         if ( ( bExp | bSig ) == 0 ) goto invalid;
6024         return packFloatx80(zSign, floatx80_infinity_high,
6025                                    floatx80_infinity_low);
6026     }
6027     if ( bExp == 0x7FFF ) {
6028         if ((uint64_t)(bSig << 1)) {
6029             return propagateFloatx80NaN(a, b, status);
6030         }
6031         if ( ( aExp | aSig ) == 0 ) {
6032  invalid:
6033             float_raise(float_flag_invalid, status);
6034             return floatx80_default_nan(status);
6035         }
6036         return packFloatx80(zSign, floatx80_infinity_high,
6037                                    floatx80_infinity_low);
6038     }
6039     if ( aExp == 0 ) {
6040         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6041         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6042     }
6043     if ( bExp == 0 ) {
6044         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6045         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6046     }
6047     zExp = aExp + bExp - 0x3FFE;
6048     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6049     if ( 0 < (int64_t) zSig0 ) {
6050         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6051         --zExp;
6052     }
6053     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6054                                 zSign, zExp, zSig0, zSig1, status);
6055 }
6056
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of dividing the extended double-precision floating-point
6059 | value `a' by the corresponding value `b'.  The operation is performed
6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6061 *----------------------------------------------------------------------------*/
6062
6063 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6064 {
6065     bool aSign, bSign, zSign;
6066     int32_t aExp, bExp, zExp;
6067     uint64_t aSig, bSig, zSig0, zSig1;
6068     uint64_t rem0, rem1, rem2, term0, term1, term2;
6069
6070     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6071         float_raise(float_flag_invalid, status);
6072         return floatx80_default_nan(status);
6073     }
6074     aSig = extractFloatx80Frac( a );
6075     aExp = extractFloatx80Exp( a );
6076     aSign = extractFloatx80Sign( a );
6077     bSig = extractFloatx80Frac( b );
6078     bExp = extractFloatx80Exp( b );
6079     bSign = extractFloatx80Sign( b );
6080     zSign = aSign ^ bSign;
6081     if ( aExp == 0x7FFF ) {
6082         if ((uint64_t)(aSig << 1)) {
6083             return propagateFloatx80NaN(a, b, status);
6084         }
6085         if ( bExp == 0x7FFF ) {
6086             if ((uint64_t)(bSig << 1)) {
6087                 return propagateFloatx80NaN(a, b, status);
6088             }
6089             goto invalid;
6090         }
6091         return packFloatx80(zSign, floatx80_infinity_high,
6092                                    floatx80_infinity_low);
6093     }
6094     if ( bExp == 0x7FFF ) {
6095         if ((uint64_t)(bSig << 1)) {
6096             return propagateFloatx80NaN(a, b, status);
6097         }
6098         return packFloatx80( zSign, 0, 0 );
6099     }
6100     if ( bExp == 0 ) {
6101         if ( bSig == 0 ) {
6102             if ( ( aExp | aSig ) == 0 ) {
6103  invalid:
6104                 float_raise(float_flag_invalid, status);
6105                 return floatx80_default_nan(status);
6106             }
6107             float_raise(float_flag_divbyzero, status);
6108             return packFloatx80(zSign, floatx80_infinity_high,
6109                                        floatx80_infinity_low);
6110         }
6111         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6112     }
6113     if ( aExp == 0 ) {
6114         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6115         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6116     }
6117     zExp = aExp - bExp + 0x3FFE;
6118     rem1 = 0;
6119     if ( bSig <= aSig ) {
6120         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6121         ++zExp;
6122     }
6123     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6124     mul64To128( bSig, zSig0, &term0, &term1 );
6125     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6126     while ( (int64_t) rem0 < 0 ) {
6127         --zSig0;
6128         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6129     }
6130     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6131     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6132         mul64To128( bSig, zSig1, &term1, &term2 );
6133         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6134         while ( (int64_t) rem1 < 0 ) {
6135             --zSig1;
6136             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6137         }
6138         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6139     }
6140     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6141                                 zSign, zExp, zSig0, zSig1, status);
6142 }
6143
6144 /*----------------------------------------------------------------------------
6145 | Returns the remainder of the extended double-precision floating-point value
6146 | `a' with respect to the corresponding value `b'.  The operation is performed
6147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6148 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6149 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6150 | the absolute value of the integer quotient.
6151 *----------------------------------------------------------------------------*/
6152
6153 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6154                          float_status *status)
6155 {
6156     bool aSign, zSign;
6157     int32_t aExp, bExp, expDiff, aExpOrig;
6158     uint64_t aSig0, aSig1, bSig;
6159     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6160
6161     *quotient = 0;
6162     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6163         float_raise(float_flag_invalid, status);
6164         return floatx80_default_nan(status);
6165     }
6166     aSig0 = extractFloatx80Frac( a );
6167     aExpOrig = aExp = extractFloatx80Exp( a );
6168     aSign = extractFloatx80Sign( a );
6169     bSig = extractFloatx80Frac( b );
6170     bExp = extractFloatx80Exp( b );
6171     if ( aExp == 0x7FFF ) {
6172         if (    (uint64_t) ( aSig0<<1 )
6173              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6174             return propagateFloatx80NaN(a, b, status);
6175         }
6176         goto invalid;
6177     }
6178     if ( bExp == 0x7FFF ) {
6179         if ((uint64_t)(bSig << 1)) {
6180             return propagateFloatx80NaN(a, b, status);
6181         }
6182         if (aExp == 0 && aSig0 >> 63) {
6183             /*
6184              * Pseudo-denormal argument must be returned in normalized
6185              * form.
6186              */
6187             return packFloatx80(aSign, 1, aSig0);
6188         }
6189         return a;
6190     }
6191     if ( bExp == 0 ) {
6192         if ( bSig == 0 ) {
6193  invalid:
6194             float_raise(float_flag_invalid, status);
6195             return floatx80_default_nan(status);
6196         }
6197         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6198     }
6199     if ( aExp == 0 ) {
6200         if ( aSig0 == 0 ) return a;
6201         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6202     }
6203     zSign = aSign;
6204     expDiff = aExp - bExp;
6205     aSig1 = 0;
6206     if ( expDiff < 0 ) {
6207         if ( mod || expDiff < -1 ) {
6208             if (aExp == 1 && aExpOrig == 0) {
6209                 /*
6210                  * Pseudo-denormal argument must be returned in
6211                  * normalized form.
6212                  */
6213                 return packFloatx80(aSign, aExp, aSig0);
6214             }
6215             return a;
6216         }
6217         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6218         expDiff = 0;
6219     }
6220     *quotient = q = ( bSig <= aSig0 );
6221     if ( q ) aSig0 -= bSig;
6222     expDiff -= 64;
6223     while ( 0 < expDiff ) {
6224         q = estimateDiv128To64( aSig0, aSig1, bSig );
6225         q = ( 2 < q ) ? q - 2 : 0;
6226         mul64To128( bSig, q, &term0, &term1 );
6227         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6228         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6229         expDiff -= 62;
6230         *quotient <<= 62;
6231         *quotient += q;
6232     }
6233     expDiff += 64;
6234     if ( 0 < expDiff ) {
6235         q = estimateDiv128To64( aSig0, aSig1, bSig );
6236         q = ( 2 < q ) ? q - 2 : 0;
6237         q >>= 64 - expDiff;
6238         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6239         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6240         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6241         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6242             ++q;
6243             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6244         }
6245         if (expDiff < 64) {
6246             *quotient <<= expDiff;
6247         } else {
6248             *quotient = 0;
6249         }
6250         *quotient += q;
6251     }
6252     else {
6253         term1 = 0;
6254         term0 = bSig;
6255     }
6256     if (!mod) {
6257         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6258         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6259                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6260                         && ( q & 1 ) )
6261             ) {
6262             aSig0 = alternateASig0;
6263             aSig1 = alternateASig1;
6264             zSign = ! zSign;
6265             ++*quotient;
6266         }
6267     }
6268     return
6269         normalizeRoundAndPackFloatx80(
6270             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6271
6272 }
6273
6274 /*----------------------------------------------------------------------------
6275 | Returns the remainder of the extended double-precision floating-point value
6276 | `a' with respect to the corresponding value `b'.  The operation is performed
6277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6279
6280 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6281 {
6282     uint64_t quotient;
6283     return floatx80_modrem(a, b, false, &quotient, status);
6284 }
6285
6286 /*----------------------------------------------------------------------------
6287 | Returns the remainder of the extended double-precision floating-point value
6288 | `a' with respect to the corresponding value `b', with the quotient truncated
6289 | toward zero.
6290 *----------------------------------------------------------------------------*/
6291
6292 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6293 {
6294     uint64_t quotient;
6295     return floatx80_modrem(a, b, true, &quotient, status);
6296 }
6297
6298 /*----------------------------------------------------------------------------
6299 | Returns the square root of the extended double-precision floating-point
6300 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6301 | for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6303
6304 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6305 {
6306     bool aSign;
6307     int32_t aExp, zExp;
6308     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6309     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6310
6311     if (floatx80_invalid_encoding(a)) {
6312         float_raise(float_flag_invalid, status);
6313         return floatx80_default_nan(status);
6314     }
6315     aSig0 = extractFloatx80Frac( a );
6316     aExp = extractFloatx80Exp( a );
6317     aSign = extractFloatx80Sign( a );
6318     if ( aExp == 0x7FFF ) {
6319         if ((uint64_t)(aSig0 << 1)) {
6320             return propagateFloatx80NaN(a, a, status);
6321         }
6322         if ( ! aSign ) return a;
6323         goto invalid;
6324     }
6325     if ( aSign ) {
6326         if ( ( aExp | aSig0 ) == 0 ) return a;
6327  invalid:
6328         float_raise(float_flag_invalid, status);
6329         return floatx80_default_nan(status);
6330     }
6331     if ( aExp == 0 ) {
6332         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6333         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6334     }
6335     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6336     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6337     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6338     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6339     doubleZSig0 = zSig0<<1;
6340     mul64To128( zSig0, zSig0, &term0, &term1 );
6341     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6342     while ( (int64_t) rem0 < 0 ) {
6343         --zSig0;
6344         doubleZSig0 -= 2;
6345         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6346     }
6347     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6348     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6349         if ( zSig1 == 0 ) zSig1 = 1;
6350         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6351         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6352         mul64To128( zSig1, zSig1, &term2, &term3 );
6353         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6354         while ( (int64_t) rem1 < 0 ) {
6355             --zSig1;
6356             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6357             term3 |= 1;
6358             term2 |= doubleZSig0;
6359             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6360         }
6361         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6362     }
6363     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6364     zSig0 |= doubleZSig0;
6365     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6366                                 0, zExp, zSig0, zSig1, status);
6367 }
6368
6369 /*----------------------------------------------------------------------------
6370 | Returns the result of converting the quadruple-precision floating-point
6371 | value `a' to the 32-bit two's complement integer format.  The conversion
6372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6373 | Arithmetic---which means in particular that the conversion is rounded
6374 | according to the current rounding mode.  If `a' is a NaN, the largest
6375 | positive integer is returned.  Otherwise, if the conversion overflows, the
6376 | largest integer with the same sign as `a' is returned.
6377 *----------------------------------------------------------------------------*/
6378
6379 int32_t float128_to_int32(float128 a, float_status *status)
6380 {
6381     bool aSign;
6382     int32_t aExp, shiftCount;
6383     uint64_t aSig0, aSig1;
6384
6385     aSig1 = extractFloat128Frac1( a );
6386     aSig0 = extractFloat128Frac0( a );
6387     aExp = extractFloat128Exp( a );
6388     aSign = extractFloat128Sign( a );
6389     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6390     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6391     aSig0 |= ( aSig1 != 0 );
6392     shiftCount = 0x4028 - aExp;
6393     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6394     return roundAndPackInt32(aSign, aSig0, status);
6395
6396 }
6397
6398 /*----------------------------------------------------------------------------
6399 | Returns the result of converting the quadruple-precision floating-point
6400 | value `a' to the 32-bit two's complement integer format.  The conversion
6401 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6402 | Arithmetic, except that the conversion is always rounded toward zero.  If
6403 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6404 | conversion overflows, the largest integer with the same sign as `a' is
6405 | returned.
6406 *----------------------------------------------------------------------------*/
6407
6408 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6409 {
6410     bool aSign;
6411     int32_t aExp, shiftCount;
6412     uint64_t aSig0, aSig1, savedASig;
6413     int32_t z;
6414
6415     aSig1 = extractFloat128Frac1( a );
6416     aSig0 = extractFloat128Frac0( a );
6417     aExp = extractFloat128Exp( a );
6418     aSign = extractFloat128Sign( a );
6419     aSig0 |= ( aSig1 != 0 );
6420     if ( 0x401E < aExp ) {
6421         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6422         goto invalid;
6423     }
6424     else if ( aExp < 0x3FFF ) {
6425         if (aExp || aSig0) {
6426             float_raise(float_flag_inexact, status);
6427         }
6428         return 0;
6429     }
6430     aSig0 |= UINT64_C(0x0001000000000000);
6431     shiftCount = 0x402F - aExp;
6432     savedASig = aSig0;
6433     aSig0 >>= shiftCount;
6434     z = aSig0;
6435     if ( aSign ) z = - z;
6436     if ( ( z < 0 ) ^ aSign ) {
6437  invalid:
6438         float_raise(float_flag_invalid, status);
6439         return aSign ? INT32_MIN : INT32_MAX;
6440     }
6441     if ( ( aSig0<<shiftCount ) != savedASig ) {
6442         float_raise(float_flag_inexact, status);
6443     }
6444     return z;
6445
6446 }
6447
6448 /*----------------------------------------------------------------------------
6449 | Returns the result of converting the quadruple-precision floating-point
6450 | value `a' to the 64-bit two's complement integer format.  The conversion
6451 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6452 | Arithmetic---which means in particular that the conversion is rounded
6453 | according to the current rounding mode.  If `a' is a NaN, the largest
6454 | positive integer is returned.  Otherwise, if the conversion overflows, the
6455 | largest integer with the same sign as `a' is returned.
6456 *----------------------------------------------------------------------------*/
6457
6458 int64_t float128_to_int64(float128 a, float_status *status)
6459 {
6460     bool aSign;
6461     int32_t aExp, shiftCount;
6462     uint64_t aSig0, aSig1;
6463
6464     aSig1 = extractFloat128Frac1( a );
6465     aSig0 = extractFloat128Frac0( a );
6466     aExp = extractFloat128Exp( a );
6467     aSign = extractFloat128Sign( a );
6468     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6469     shiftCount = 0x402F - aExp;
6470     if ( shiftCount <= 0 ) {
6471         if ( 0x403E < aExp ) {
6472             float_raise(float_flag_invalid, status);
6473             if (    ! aSign
6474                  || (    ( aExp == 0x7FFF )
6475                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6476                     )
6477                ) {
6478                 return INT64_MAX;
6479             }
6480             return INT64_MIN;
6481         }
6482         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6483     }
6484     else {
6485         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6486     }
6487     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6488
6489 }
6490
6491 /*----------------------------------------------------------------------------
6492 | Returns the result of converting the quadruple-precision floating-point
6493 | value `a' to the 64-bit two's complement integer format.  The conversion
6494 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6495 | Arithmetic, except that the conversion is always rounded toward zero.
6496 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6497 | the conversion overflows, the largest integer with the same sign as `a' is
6498 | returned.
6499 *----------------------------------------------------------------------------*/
6500
6501 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6502 {
6503     bool aSign;
6504     int32_t aExp, shiftCount;
6505     uint64_t aSig0, aSig1;
6506     int64_t z;
6507
6508     aSig1 = extractFloat128Frac1( a );
6509     aSig0 = extractFloat128Frac0( a );
6510     aExp = extractFloat128Exp( a );
6511     aSign = extractFloat128Sign( a );
6512     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6513     shiftCount = aExp - 0x402F;
6514     if ( 0 < shiftCount ) {
6515         if ( 0x403E <= aExp ) {
6516             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6517             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6518                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6519                 if (aSig1) {
6520                     float_raise(float_flag_inexact, status);
6521                 }
6522             }
6523             else {
6524                 float_raise(float_flag_invalid, status);
6525                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6526                     return INT64_MAX;
6527                 }
6528             }
6529             return INT64_MIN;
6530         }
6531         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6532         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6533             float_raise(float_flag_inexact, status);
6534         }
6535     }
6536     else {
6537         if ( aExp < 0x3FFF ) {
6538             if ( aExp | aSig0 | aSig1 ) {
6539                 float_raise(float_flag_inexact, status);
6540             }
6541             return 0;
6542         }
6543         z = aSig0>>( - shiftCount );
6544         if (    aSig1
6545              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6546             float_raise(float_flag_inexact, status);
6547         }
6548     }
6549     if ( aSign ) z = - z;
6550     return z;
6551
6552 }
6553
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point value
6556 | `a' to the 64-bit unsigned integer format.  The conversion is
6557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode.  If `a' is a NaN, the largest
6560 | positive integer is returned.  If the conversion overflows, the
6561 | largest unsigned integer is returned.  If 'a' is negative, the value is
6562 | rounded and zero is returned; negative values that do not round to zero
6563 | will raise the inexact exception.
6564 *----------------------------------------------------------------------------*/
6565
6566 uint64_t float128_to_uint64(float128 a, float_status *status)
6567 {
6568     bool aSign;
6569     int aExp;
6570     int shiftCount;
6571     uint64_t aSig0, aSig1;
6572
6573     aSig0 = extractFloat128Frac0(a);
6574     aSig1 = extractFloat128Frac1(a);
6575     aExp = extractFloat128Exp(a);
6576     aSign = extractFloat128Sign(a);
6577     if (aSign && (aExp > 0x3FFE)) {
6578         float_raise(float_flag_invalid, status);
6579         if (float128_is_any_nan(a)) {
6580             return UINT64_MAX;
6581         } else {
6582             return 0;
6583         }
6584     }
6585     if (aExp) {
6586         aSig0 |= UINT64_C(0x0001000000000000);
6587     }
6588     shiftCount = 0x402F - aExp;
6589     if (shiftCount <= 0) {
6590         if (0x403E < aExp) {
6591             float_raise(float_flag_invalid, status);
6592             return UINT64_MAX;
6593         }
6594         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6595     } else {
6596         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6597     }
6598     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6599 }
6600
6601 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6602 {
6603     uint64_t v;
6604     signed char current_rounding_mode = status->float_rounding_mode;
6605
6606     set_float_rounding_mode(float_round_to_zero, status);
6607     v = float128_to_uint64(a, status);
6608     set_float_rounding_mode(current_rounding_mode, status);
6609
6610     return v;
6611 }
6612
6613 /*----------------------------------------------------------------------------
6614 | Returns the result of converting the quadruple-precision floating-point
6615 | value `a' to the 32-bit unsigned integer format.  The conversion
6616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6617 | Arithmetic except that the conversion is always rounded toward zero.
6618 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6619 | if the conversion overflows, the largest unsigned integer is returned.
6620 | If 'a' is negative, the value is rounded and zero is returned; negative
6621 | values that do not round to zero will raise the inexact exception.
6622 *----------------------------------------------------------------------------*/
6623
6624 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6625 {
6626     uint64_t v;
6627     uint32_t res;
6628     int old_exc_flags = get_float_exception_flags(status);
6629
6630     v = float128_to_uint64_round_to_zero(a, status);
6631     if (v > 0xffffffff) {
6632         res = 0xffffffff;
6633     } else {
6634         return v;
6635     }
6636     set_float_exception_flags(old_exc_flags, status);
6637     float_raise(float_flag_invalid, status);
6638     return res;
6639 }
6640
6641 /*----------------------------------------------------------------------------
6642 | Returns the result of converting the quadruple-precision floating-point value
6643 | `a' to the 32-bit unsigned integer format.  The conversion is
6644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6645 | Arithmetic---which means in particular that the conversion is rounded
6646 | according to the current rounding mode.  If `a' is a NaN, the largest
6647 | positive integer is returned.  If the conversion overflows, the
6648 | largest unsigned integer is returned.  If 'a' is negative, the value is
6649 | rounded and zero is returned; negative values that do not round to zero
6650 | will raise the inexact exception.
6651 *----------------------------------------------------------------------------*/
6652
6653 uint32_t float128_to_uint32(float128 a, float_status *status)
6654 {
6655     uint64_t v;
6656     uint32_t res;
6657     int old_exc_flags = get_float_exception_flags(status);
6658
6659     v = float128_to_uint64(a, status);
6660     if (v > 0xffffffff) {
6661         res = 0xffffffff;
6662     } else {
6663         return v;
6664     }
6665     set_float_exception_flags(old_exc_flags, status);
6666     float_raise(float_flag_invalid, status);
6667     return res;
6668 }
6669
6670 /*----------------------------------------------------------------------------
6671 | Returns the result of converting the quadruple-precision floating-point
6672 | value `a' to the single-precision floating-point format.  The conversion
6673 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6674 | Arithmetic.
6675 *----------------------------------------------------------------------------*/
6676
6677 float32 float128_to_float32(float128 a, float_status *status)
6678 {
6679     bool aSign;
6680     int32_t aExp;
6681     uint64_t aSig0, aSig1;
6682     uint32_t zSig;
6683
6684     aSig1 = extractFloat128Frac1( a );
6685     aSig0 = extractFloat128Frac0( a );
6686     aExp = extractFloat128Exp( a );
6687     aSign = extractFloat128Sign( a );
6688     if ( aExp == 0x7FFF ) {
6689         if ( aSig0 | aSig1 ) {
6690             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6691         }
6692         return packFloat32( aSign, 0xFF, 0 );
6693     }
6694     aSig0 |= ( aSig1 != 0 );
6695     shift64RightJamming( aSig0, 18, &aSig0 );
6696     zSig = aSig0;
6697     if ( aExp || zSig ) {
6698         zSig |= 0x40000000;
6699         aExp -= 0x3F81;
6700     }
6701     return roundAndPackFloat32(aSign, aExp, zSig, status);
6702
6703 }
6704
6705 /*----------------------------------------------------------------------------
6706 | Returns the result of converting the quadruple-precision floating-point
6707 | value `a' to the double-precision floating-point format.  The conversion
6708 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6709 | Arithmetic.
6710 *----------------------------------------------------------------------------*/
6711
6712 float64 float128_to_float64(float128 a, float_status *status)
6713 {
6714     bool aSign;
6715     int32_t aExp;
6716     uint64_t aSig0, aSig1;
6717
6718     aSig1 = extractFloat128Frac1( a );
6719     aSig0 = extractFloat128Frac0( a );
6720     aExp = extractFloat128Exp( a );
6721     aSign = extractFloat128Sign( a );
6722     if ( aExp == 0x7FFF ) {
6723         if ( aSig0 | aSig1 ) {
6724             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6725         }
6726         return packFloat64( aSign, 0x7FF, 0 );
6727     }
6728     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6729     aSig0 |= ( aSig1 != 0 );
6730     if ( aExp || aSig0 ) {
6731         aSig0 |= UINT64_C(0x4000000000000000);
6732         aExp -= 0x3C01;
6733     }
6734     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6735
6736 }
6737
6738 /*----------------------------------------------------------------------------
6739 | Returns the result of converting the quadruple-precision floating-point
6740 | value `a' to the extended double-precision floating-point format.  The
6741 | conversion is performed according to the IEC/IEEE Standard for Binary
6742 | Floating-Point Arithmetic.
6743 *----------------------------------------------------------------------------*/
6744
6745 floatx80 float128_to_floatx80(float128 a, float_status *status)
6746 {
6747     bool aSign;
6748     int32_t aExp;
6749     uint64_t aSig0, aSig1;
6750
6751     aSig1 = extractFloat128Frac1( a );
6752     aSig0 = extractFloat128Frac0( a );
6753     aExp = extractFloat128Exp( a );
6754     aSign = extractFloat128Sign( a );
6755     if ( aExp == 0x7FFF ) {
6756         if ( aSig0 | aSig1 ) {
6757             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6758                                                status);
6759             return floatx80_silence_nan(res, status);
6760         }
6761         return packFloatx80(aSign, floatx80_infinity_high,
6762                                    floatx80_infinity_low);
6763     }
6764     if ( aExp == 0 ) {
6765         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6766         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6767     }
6768     else {
6769         aSig0 |= UINT64_C(0x0001000000000000);
6770     }
6771     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6772     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6773
6774 }
6775
6776 /*----------------------------------------------------------------------------
6777 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6778 | returns the result as a quadruple-precision floating-point value.  The
6779 | operation is performed according to the IEC/IEEE Standard for Binary
6780 | Floating-Point Arithmetic.
6781 *----------------------------------------------------------------------------*/
6782
6783 float128 float128_round_to_int(float128 a, float_status *status)
6784 {
6785     bool aSign;
6786     int32_t aExp;
6787     uint64_t lastBitMask, roundBitsMask;
6788     float128 z;
6789
6790     aExp = extractFloat128Exp( a );
6791     if ( 0x402F <= aExp ) {
6792         if ( 0x406F <= aExp ) {
6793             if (    ( aExp == 0x7FFF )
6794                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6795                ) {
6796                 return propagateFloat128NaN(a, a, status);
6797             }
6798             return a;
6799         }
6800         lastBitMask = 1;
6801         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6802         roundBitsMask = lastBitMask - 1;
6803         z = a;
6804         switch (status->float_rounding_mode) {
6805         case float_round_nearest_even:
6806             if ( lastBitMask ) {
6807                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6808                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6809             }
6810             else {
6811                 if ( (int64_t) z.low < 0 ) {
6812                     ++z.high;
6813                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6814                 }
6815             }
6816             break;
6817         case float_round_ties_away:
6818             if (lastBitMask) {
6819                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6820             } else {
6821                 if ((int64_t) z.low < 0) {
6822                     ++z.high;
6823                 }
6824             }
6825             break;
6826         case float_round_to_zero:
6827             break;
6828         case float_round_up:
6829             if (!extractFloat128Sign(z)) {
6830                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6831             }
6832             break;
6833         case float_round_down:
6834             if (extractFloat128Sign(z)) {
6835                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6836             }
6837             break;
6838         case float_round_to_odd:
6839             /*
6840              * Note that if lastBitMask == 0, the last bit is the lsb
6841              * of high, and roundBitsMask == -1.
6842              */
6843             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6844                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6845             }
6846             break;
6847         default:
6848             abort();
6849         }
6850         z.low &= ~ roundBitsMask;
6851     }
6852     else {
6853         if ( aExp < 0x3FFF ) {
6854             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6855             float_raise(float_flag_inexact, status);
6856             aSign = extractFloat128Sign( a );
6857             switch (status->float_rounding_mode) {
6858             case float_round_nearest_even:
6859                 if (    ( aExp == 0x3FFE )
6860                      && (   extractFloat128Frac0( a )
6861                           | extractFloat128Frac1( a ) )
6862                    ) {
6863                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6864                 }
6865                 break;
6866             case float_round_ties_away:
6867                 if (aExp == 0x3FFE) {
6868                     return packFloat128(aSign, 0x3FFF, 0, 0);
6869                 }
6870                 break;
6871             case float_round_down:
6872                 return
6873                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6874                     : packFloat128( 0, 0, 0, 0 );
6875             case float_round_up:
6876                 return
6877                       aSign ? packFloat128( 1, 0, 0, 0 )
6878                     : packFloat128( 0, 0x3FFF, 0, 0 );
6879
6880             case float_round_to_odd:
6881                 return packFloat128(aSign, 0x3FFF, 0, 0);
6882
6883             case float_round_to_zero:
6884                 break;
6885             }
6886             return packFloat128( aSign, 0, 0, 0 );
6887         }
6888         lastBitMask = 1;
6889         lastBitMask <<= 0x402F - aExp;
6890         roundBitsMask = lastBitMask - 1;
6891         z.low = 0;
6892         z.high = a.high;
6893         switch (status->float_rounding_mode) {
6894         case float_round_nearest_even:
6895             z.high += lastBitMask>>1;
6896             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6897                 z.high &= ~ lastBitMask;
6898             }
6899             break;
6900         case float_round_ties_away:
6901             z.high += lastBitMask>>1;
6902             break;
6903         case float_round_to_zero:
6904             break;
6905         case float_round_up:
6906             if (!extractFloat128Sign(z)) {
6907                 z.high |= ( a.low != 0 );
6908                 z.high += roundBitsMask;
6909             }
6910             break;
6911         case float_round_down:
6912             if (extractFloat128Sign(z)) {
6913                 z.high |= (a.low != 0);
6914                 z.high += roundBitsMask;
6915             }
6916             break;
6917         case float_round_to_odd:
6918             if ((z.high & lastBitMask) == 0) {
6919                 z.high |= (a.low != 0);
6920                 z.high += roundBitsMask;
6921             }
6922             break;
6923         default:
6924             abort();
6925         }
6926         z.high &= ~ roundBitsMask;
6927     }
6928     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6929         float_raise(float_flag_inexact, status);
6930     }
6931     return z;
6932
6933 }
6934
6935 /*----------------------------------------------------------------------------
6936 | Returns the result of adding the absolute values of the quadruple-precision
6937 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6938 | before being returned.  `zSign' is ignored if the result is a NaN.
6939 | The addition is performed according to the IEC/IEEE Standard for Binary
6940 | Floating-Point Arithmetic.
6941 *----------------------------------------------------------------------------*/
6942
6943 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6944                                 float_status *status)
6945 {
6946     int32_t aExp, bExp, zExp;
6947     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6948     int32_t expDiff;
6949
6950     aSig1 = extractFloat128Frac1( a );
6951     aSig0 = extractFloat128Frac0( a );
6952     aExp = extractFloat128Exp( a );
6953     bSig1 = extractFloat128Frac1( b );
6954     bSig0 = extractFloat128Frac0( b );
6955     bExp = extractFloat128Exp( b );
6956     expDiff = aExp - bExp;
6957     if ( 0 < expDiff ) {
6958         if ( aExp == 0x7FFF ) {
6959             if (aSig0 | aSig1) {
6960                 return propagateFloat128NaN(a, b, status);
6961             }
6962             return a;
6963         }
6964         if ( bExp == 0 ) {
6965             --expDiff;
6966         }
6967         else {
6968             bSig0 |= UINT64_C(0x0001000000000000);
6969         }
6970         shift128ExtraRightJamming(
6971             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6972         zExp = aExp;
6973     }
6974     else if ( expDiff < 0 ) {
6975         if ( bExp == 0x7FFF ) {
6976             if (bSig0 | bSig1) {
6977                 return propagateFloat128NaN(a, b, status);
6978             }
6979             return packFloat128( zSign, 0x7FFF, 0, 0 );
6980         }
6981         if ( aExp == 0 ) {
6982             ++expDiff;
6983         }
6984         else {
6985             aSig0 |= UINT64_C(0x0001000000000000);
6986         }
6987         shift128ExtraRightJamming(
6988             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6989         zExp = bExp;
6990     }
6991     else {
6992         if ( aExp == 0x7FFF ) {
6993             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6994                 return propagateFloat128NaN(a, b, status);
6995             }
6996             return a;
6997         }
6998         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6999         if ( aExp == 0 ) {
7000             if (status->flush_to_zero) {
7001                 if (zSig0 | zSig1) {
7002                     float_raise(float_flag_output_denormal, status);
7003                 }
7004                 return packFloat128(zSign, 0, 0, 0);
7005             }
7006             return packFloat128( zSign, 0, zSig0, zSig1 );
7007         }
7008         zSig2 = 0;
7009         zSig0 |= UINT64_C(0x0002000000000000);
7010         zExp = aExp;
7011         goto shiftRight1;
7012     }
7013     aSig0 |= UINT64_C(0x0001000000000000);
7014     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7015     --zExp;
7016     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7017     ++zExp;
7018  shiftRight1:
7019     shift128ExtraRightJamming(
7020         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7021  roundAndPack:
7022     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7023
7024 }
7025
7026 /*----------------------------------------------------------------------------
7027 | Returns the result of subtracting the absolute values of the quadruple-
7028 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7029 | difference is negated before being returned.  `zSign' is ignored if the
7030 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7031 | Standard for Binary Floating-Point Arithmetic.
7032 *----------------------------------------------------------------------------*/
7033
7034 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7035                                 float_status *status)
7036 {
7037     int32_t aExp, bExp, zExp;
7038     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7039     int32_t expDiff;
7040
7041     aSig1 = extractFloat128Frac1( a );
7042     aSig0 = extractFloat128Frac0( a );
7043     aExp = extractFloat128Exp( a );
7044     bSig1 = extractFloat128Frac1( b );
7045     bSig0 = extractFloat128Frac0( b );
7046     bExp = extractFloat128Exp( b );
7047     expDiff = aExp - bExp;
7048     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7049     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7050     if ( 0 < expDiff ) goto aExpBigger;
7051     if ( expDiff < 0 ) goto bExpBigger;
7052     if ( aExp == 0x7FFF ) {
7053         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7054             return propagateFloat128NaN(a, b, status);
7055         }
7056         float_raise(float_flag_invalid, status);
7057         return float128_default_nan(status);
7058     }
7059     if ( aExp == 0 ) {
7060         aExp = 1;
7061         bExp = 1;
7062     }
7063     if ( bSig0 < aSig0 ) goto aBigger;
7064     if ( aSig0 < bSig0 ) goto bBigger;
7065     if ( bSig1 < aSig1 ) goto aBigger;
7066     if ( aSig1 < bSig1 ) goto bBigger;
7067     return packFloat128(status->float_rounding_mode == float_round_down,
7068                         0, 0, 0);
7069  bExpBigger:
7070     if ( bExp == 0x7FFF ) {
7071         if (bSig0 | bSig1) {
7072             return propagateFloat128NaN(a, b, status);
7073         }
7074         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7075     }
7076     if ( aExp == 0 ) {
7077         ++expDiff;
7078     }
7079     else {
7080         aSig0 |= UINT64_C(0x4000000000000000);
7081     }
7082     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7083     bSig0 |= UINT64_C(0x4000000000000000);
7084  bBigger:
7085     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7086     zExp = bExp;
7087     zSign ^= 1;
7088     goto normalizeRoundAndPack;
7089  aExpBigger:
7090     if ( aExp == 0x7FFF ) {
7091         if (aSig0 | aSig1) {
7092             return propagateFloat128NaN(a, b, status);
7093         }
7094         return a;
7095     }
7096     if ( bExp == 0 ) {
7097         --expDiff;
7098     }
7099     else {
7100         bSig0 |= UINT64_C(0x4000000000000000);
7101     }
7102     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7103     aSig0 |= UINT64_C(0x4000000000000000);
7104  aBigger:
7105     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7106     zExp = aExp;
7107  normalizeRoundAndPack:
7108     --zExp;
7109     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7110                                          status);
7111
7112 }
7113
7114 /*----------------------------------------------------------------------------
7115 | Returns the result of adding the quadruple-precision floating-point values
7116 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7117 | for Binary Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7119
7120 float128 float128_add(float128 a, float128 b, float_status *status)
7121 {
7122     bool aSign, bSign;
7123
7124     aSign = extractFloat128Sign( a );
7125     bSign = extractFloat128Sign( b );
7126     if ( aSign == bSign ) {
7127         return addFloat128Sigs(a, b, aSign, status);
7128     }
7129     else {
7130         return subFloat128Sigs(a, b, aSign, status);
7131     }
7132
7133 }
7134
7135 /*----------------------------------------------------------------------------
7136 | Returns the result of subtracting the quadruple-precision floating-point
7137 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7138 | Standard for Binary Floating-Point Arithmetic.
7139 *----------------------------------------------------------------------------*/
7140
7141 float128 float128_sub(float128 a, float128 b, float_status *status)
7142 {
7143     bool aSign, bSign;
7144
7145     aSign = extractFloat128Sign( a );
7146     bSign = extractFloat128Sign( b );
7147     if ( aSign == bSign ) {
7148         return subFloat128Sigs(a, b, aSign, status);
7149     }
7150     else {
7151         return addFloat128Sigs(a, b, aSign, status);
7152     }
7153
7154 }
7155
7156 /*----------------------------------------------------------------------------
7157 | Returns the result of multiplying the quadruple-precision floating-point
7158 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7159 | Standard for Binary Floating-Point Arithmetic.
7160 *----------------------------------------------------------------------------*/
7161
7162 float128 float128_mul(float128 a, float128 b, float_status *status)
7163 {
7164     bool aSign, bSign, zSign;
7165     int32_t aExp, bExp, zExp;
7166     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7167
7168     aSig1 = extractFloat128Frac1( a );
7169     aSig0 = extractFloat128Frac0( a );
7170     aExp = extractFloat128Exp( a );
7171     aSign = extractFloat128Sign( a );
7172     bSig1 = extractFloat128Frac1( b );
7173     bSig0 = extractFloat128Frac0( b );
7174     bExp = extractFloat128Exp( b );
7175     bSign = extractFloat128Sign( b );
7176     zSign = aSign ^ bSign;
7177     if ( aExp == 0x7FFF ) {
7178         if (    ( aSig0 | aSig1 )
7179              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7180             return propagateFloat128NaN(a, b, status);
7181         }
7182         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7183         return packFloat128( zSign, 0x7FFF, 0, 0 );
7184     }
7185     if ( bExp == 0x7FFF ) {
7186         if (bSig0 | bSig1) {
7187             return propagateFloat128NaN(a, b, status);
7188         }
7189         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7190  invalid:
7191             float_raise(float_flag_invalid, status);
7192             return float128_default_nan(status);
7193         }
7194         return packFloat128( zSign, 0x7FFF, 0, 0 );
7195     }
7196     if ( aExp == 0 ) {
7197         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7198         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7199     }
7200     if ( bExp == 0 ) {
7201         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7202         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7203     }
7204     zExp = aExp + bExp - 0x4000;
7205     aSig0 |= UINT64_C(0x0001000000000000);
7206     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7207     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7208     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7209     zSig2 |= ( zSig3 != 0 );
7210     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7211         shift128ExtraRightJamming(
7212             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7213         ++zExp;
7214     }
7215     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7216
7217 }
7218
7219 /*----------------------------------------------------------------------------
7220 | Returns the result of dividing the quadruple-precision floating-point value
7221 | `a' by the corresponding value `b'.  The operation is performed according to
7222 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7223 *----------------------------------------------------------------------------*/
7224
7225 float128 float128_div(float128 a, float128 b, float_status *status)
7226 {
7227     bool aSign, bSign, zSign;
7228     int32_t aExp, bExp, zExp;
7229     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7230     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7231
7232     aSig1 = extractFloat128Frac1( a );
7233     aSig0 = extractFloat128Frac0( a );
7234     aExp = extractFloat128Exp( a );
7235     aSign = extractFloat128Sign( a );
7236     bSig1 = extractFloat128Frac1( b );
7237     bSig0 = extractFloat128Frac0( b );
7238     bExp = extractFloat128Exp( b );
7239     bSign = extractFloat128Sign( b );
7240     zSign = aSign ^ bSign;
7241     if ( aExp == 0x7FFF ) {
7242         if (aSig0 | aSig1) {
7243             return propagateFloat128NaN(a, b, status);
7244         }
7245         if ( bExp == 0x7FFF ) {
7246             if (bSig0 | bSig1) {
7247                 return propagateFloat128NaN(a, b, status);
7248             }
7249             goto invalid;
7250         }
7251         return packFloat128( zSign, 0x7FFF, 0, 0 );
7252     }
7253     if ( bExp == 0x7FFF ) {
7254         if (bSig0 | bSig1) {
7255             return propagateFloat128NaN(a, b, status);
7256         }
7257         return packFloat128( zSign, 0, 0, 0 );
7258     }
7259     if ( bExp == 0 ) {
7260         if ( ( bSig0 | bSig1 ) == 0 ) {
7261             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7262  invalid:
7263                 float_raise(float_flag_invalid, status);
7264                 return float128_default_nan(status);
7265             }
7266             float_raise(float_flag_divbyzero, status);
7267             return packFloat128( zSign, 0x7FFF, 0, 0 );
7268         }
7269         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7270     }
7271     if ( aExp == 0 ) {
7272         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7273         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7274     }
7275     zExp = aExp - bExp + 0x3FFD;
7276     shortShift128Left(
7277         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7278     shortShift128Left(
7279         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7280     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7281         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7282         ++zExp;
7283     }
7284     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7285     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7286     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7287     while ( (int64_t) rem0 < 0 ) {
7288         --zSig0;
7289         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7290     }
7291     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7292     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7293         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7294         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7295         while ( (int64_t) rem1 < 0 ) {
7296             --zSig1;
7297             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7298         }
7299         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7300     }
7301     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7302     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7303
7304 }
7305
7306 /*----------------------------------------------------------------------------
7307 | Returns the remainder of the quadruple-precision floating-point value `a'
7308 | with respect to the corresponding value `b'.  The operation is performed
7309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7310 *----------------------------------------------------------------------------*/
7311
7312 float128 float128_rem(float128 a, float128 b, float_status *status)
7313 {
7314     bool aSign, zSign;
7315     int32_t aExp, bExp, expDiff;
7316     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7317     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7318     int64_t sigMean0;
7319
7320     aSig1 = extractFloat128Frac1( a );
7321     aSig0 = extractFloat128Frac0( a );
7322     aExp = extractFloat128Exp( a );
7323     aSign = extractFloat128Sign( a );
7324     bSig1 = extractFloat128Frac1( b );
7325     bSig0 = extractFloat128Frac0( b );
7326     bExp = extractFloat128Exp( b );
7327     if ( aExp == 0x7FFF ) {
7328         if (    ( aSig0 | aSig1 )
7329              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7330             return propagateFloat128NaN(a, b, status);
7331         }
7332         goto invalid;
7333     }
7334     if ( bExp == 0x7FFF ) {
7335         if (bSig0 | bSig1) {
7336             return propagateFloat128NaN(a, b, status);
7337         }
7338         return a;
7339     }
7340     if ( bExp == 0 ) {
7341         if ( ( bSig0 | bSig1 ) == 0 ) {
7342  invalid:
7343             float_raise(float_flag_invalid, status);
7344             return float128_default_nan(status);
7345         }
7346         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7347     }
7348     if ( aExp == 0 ) {
7349         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7350         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7351     }
7352     expDiff = aExp - bExp;
7353     if ( expDiff < -1 ) return a;
7354     shortShift128Left(
7355         aSig0 | UINT64_C(0x0001000000000000),
7356         aSig1,
7357         15 - ( expDiff < 0 ),
7358         &aSig0,
7359         &aSig1
7360     );
7361     shortShift128Left(
7362         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7363     q = le128( bSig0, bSig1, aSig0, aSig1 );
7364     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7365     expDiff -= 64;
7366     while ( 0 < expDiff ) {
7367         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7368         q = ( 4 < q ) ? q - 4 : 0;
7369         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7370         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7371         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7372         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7373         expDiff -= 61;
7374     }
7375     if ( -64 < expDiff ) {
7376         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7377         q = ( 4 < q ) ? q - 4 : 0;
7378         q >>= - expDiff;
7379         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7380         expDiff += 52;
7381         if ( expDiff < 0 ) {
7382             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7383         }
7384         else {
7385             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7386         }
7387         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7388         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7389     }
7390     else {
7391         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7392         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7393     }
7394     do {
7395         alternateASig0 = aSig0;
7396         alternateASig1 = aSig1;
7397         ++q;
7398         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7399     } while ( 0 <= (int64_t) aSig0 );
7400     add128(
7401         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7402     if (    ( sigMean0 < 0 )
7403          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7404         aSig0 = alternateASig0;
7405         aSig1 = alternateASig1;
7406     }
7407     zSign = ( (int64_t) aSig0 < 0 );
7408     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7409     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7410                                          status);
7411 }
7412
7413 /*----------------------------------------------------------------------------
7414 | Returns the square root of the quadruple-precision floating-point value `a'.
7415 | The operation is performed according to the IEC/IEEE Standard for Binary
7416 | Floating-Point Arithmetic.
7417 *----------------------------------------------------------------------------*/
7418
7419 float128 float128_sqrt(float128 a, float_status *status)
7420 {
7421     bool aSign;
7422     int32_t aExp, zExp;
7423     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7424     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7425
7426     aSig1 = extractFloat128Frac1( a );
7427     aSig0 = extractFloat128Frac0( a );
7428     aExp = extractFloat128Exp( a );
7429     aSign = extractFloat128Sign( a );
7430     if ( aExp == 0x7FFF ) {
7431         if (aSig0 | aSig1) {
7432             return propagateFloat128NaN(a, a, status);
7433         }
7434         if ( ! aSign ) return a;
7435         goto invalid;
7436     }
7437     if ( aSign ) {
7438         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7439  invalid:
7440         float_raise(float_flag_invalid, status);
7441         return float128_default_nan(status);
7442     }
7443     if ( aExp == 0 ) {
7444         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7445         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7446     }
7447     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7448     aSig0 |= UINT64_C(0x0001000000000000);
7449     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7450     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7451     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7452     doubleZSig0 = zSig0<<1;
7453     mul64To128( zSig0, zSig0, &term0, &term1 );
7454     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7455     while ( (int64_t) rem0 < 0 ) {
7456         --zSig0;
7457         doubleZSig0 -= 2;
7458         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7459     }
7460     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7461     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7462         if ( zSig1 == 0 ) zSig1 = 1;
7463         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7464         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7465         mul64To128( zSig1, zSig1, &term2, &term3 );
7466         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7467         while ( (int64_t) rem1 < 0 ) {
7468             --zSig1;
7469             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7470             term3 |= 1;
7471             term2 |= doubleZSig0;
7472             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7473         }
7474         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7475     }
7476     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7477     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7478
7479 }
7480
7481 static inline FloatRelation
7482 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7483                           float_status *status)
7484 {
7485     bool aSign, bSign;
7486
7487     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7488         float_raise(float_flag_invalid, status);
7489         return float_relation_unordered;
7490     }
7491     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7492           ( extractFloatx80Frac( a )<<1 ) ) ||
7493         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7494           ( extractFloatx80Frac( b )<<1 ) )) {
7495         if (!is_quiet ||
7496             floatx80_is_signaling_nan(a, status) ||
7497             floatx80_is_signaling_nan(b, status)) {
7498             float_raise(float_flag_invalid, status);
7499         }
7500         return float_relation_unordered;
7501     }
7502     aSign = extractFloatx80Sign( a );
7503     bSign = extractFloatx80Sign( b );
7504     if ( aSign != bSign ) {
7505
7506         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7507              ( ( a.low | b.low ) == 0 ) ) {
7508             /* zero case */
7509             return float_relation_equal;
7510         } else {
7511             return 1 - (2 * aSign);
7512         }
7513     } else {
7514         /* Normalize pseudo-denormals before comparison.  */
7515         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7516             ++a.high;
7517         }
7518         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7519             ++b.high;
7520         }
7521         if (a.low == b.low && a.high == b.high) {
7522             return float_relation_equal;
7523         } else {
7524             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7525         }
7526     }
7527 }
7528
7529 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7530 {
7531     return floatx80_compare_internal(a, b, 0, status);
7532 }
7533
7534 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7535                                      float_status *status)
7536 {
7537     return floatx80_compare_internal(a, b, 1, status);
7538 }
7539
7540 static inline FloatRelation
7541 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7542                           float_status *status)
7543 {
7544     bool aSign, bSign;
7545
7546     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7547           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7548         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7549           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7550         if (!is_quiet ||
7551             float128_is_signaling_nan(a, status) ||
7552             float128_is_signaling_nan(b, status)) {
7553             float_raise(float_flag_invalid, status);
7554         }
7555         return float_relation_unordered;
7556     }
7557     aSign = extractFloat128Sign( a );
7558     bSign = extractFloat128Sign( b );
7559     if ( aSign != bSign ) {
7560         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7561             /* zero case */
7562             return float_relation_equal;
7563         } else {
7564             return 1 - (2 * aSign);
7565         }
7566     } else {
7567         if (a.low == b.low && a.high == b.high) {
7568             return float_relation_equal;
7569         } else {
7570             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7571         }
7572     }
7573 }
7574
7575 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7576 {
7577     return float128_compare_internal(a, b, 0, status);
7578 }
7579
7580 FloatRelation float128_compare_quiet(float128 a, float128 b,
7581                                      float_status *status)
7582 {
7583     return float128_compare_internal(a, b, 1, status);
7584 }
7585
7586 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7587 {
7588     bool aSign;
7589     int32_t aExp;
7590     uint64_t aSig;
7591
7592     if (floatx80_invalid_encoding(a)) {
7593         float_raise(float_flag_invalid, status);
7594         return floatx80_default_nan(status);
7595     }
7596     aSig = extractFloatx80Frac( a );
7597     aExp = extractFloatx80Exp( a );
7598     aSign = extractFloatx80Sign( a );
7599
7600     if ( aExp == 0x7FFF ) {
7601         if ( aSig<<1 ) {
7602             return propagateFloatx80NaN(a, a, status);
7603         }
7604         return a;
7605     }
7606
7607     if (aExp == 0) {
7608         if (aSig == 0) {
7609             return a;
7610         }
7611         aExp++;
7612     }
7613
7614     if (n > 0x10000) {
7615         n = 0x10000;
7616     } else if (n < -0x10000) {
7617         n = -0x10000;
7618     }
7619
7620     aExp += n;
7621     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7622                                          aSign, aExp, aSig, 0, status);
7623 }
7624
7625 float128 float128_scalbn(float128 a, int n, float_status *status)
7626 {
7627     bool aSign;
7628     int32_t aExp;
7629     uint64_t aSig0, aSig1;
7630
7631     aSig1 = extractFloat128Frac1( a );
7632     aSig0 = extractFloat128Frac0( a );
7633     aExp = extractFloat128Exp( a );
7634     aSign = extractFloat128Sign( a );
7635     if ( aExp == 0x7FFF ) {
7636         if ( aSig0 | aSig1 ) {
7637             return propagateFloat128NaN(a, a, status);
7638         }
7639         return a;
7640     }
7641     if (aExp != 0) {
7642         aSig0 |= UINT64_C(0x0001000000000000);
7643     } else if (aSig0 == 0 && aSig1 == 0) {
7644         return a;
7645     } else {
7646         aExp++;
7647     }
7648
7649     if (n > 0x10000) {
7650         n = 0x10000;
7651     } else if (n < -0x10000) {
7652         n = -0x10000;
7653     }
7654
7655     aExp += n - 1;
7656     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7657                                          , status);
7658
7659 }
7660
7661 static void __attribute__((constructor)) softfloat_init(void)
7662 {
7663     union_float64 ua, ub, uc, ur;
7664
7665     if (QEMU_NO_HARDFLOAT) {
7666         return;
7667     }
7668     /*
7669      * Test that the host's FMA is not obviously broken. For example,
7670      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7671      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7672      */
7673     ua.s = 0x0020000000000001ULL;
7674     ub.s = 0x3ca0000000000000ULL;
7675     uc.s = 0x0020000000000000ULL;
7676     ur.h = fma(ua.h, ub.h, uc.h);
7677     if (ur.s != 0x0020000000000001ULL) {
7678         force_soft_fma = true;
7679     }
7680 }