4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 float_raise(float_flag_input_denormal, s); \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 float_raise(float_flag_overflow
, s
);
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 float_raise(float_flag_overflow
, s
);
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 #define float_cmask(bit) (1u << (bit))
475 float_cmask_zero
= float_cmask(float_class_zero
),
476 float_cmask_normal
= float_cmask(float_class_normal
),
477 float_cmask_inf
= float_cmask(float_class_inf
),
478 float_cmask_qnan
= float_cmask(float_class_qnan
),
479 float_cmask_snan
= float_cmask(float_class_snan
),
481 float_cmask_infzero
= float_cmask_zero
| float_cmask_inf
,
482 float_cmask_anynan
= float_cmask_qnan
| float_cmask_snan
,
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
489 return unlikely(c
>= float_class_qnan
);
492 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
494 return c
== float_class_snan
;
497 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
499 return c
== float_class_qnan
;
503 * Structure holding all of the decomposed parts of a float. The
504 * exponent is unbiased and the fraction is normalized. All
505 * calculations are done with a 64 bit fraction and then rounded as
506 * appropriate for the final format.
508 * Thanks to the packed FloatClass a decent compiler should be able to
509 * fit the whole structure into registers and avoid using the stack
510 * for parameter passing.
520 #define DECOMPOSED_BINARY_POINT 63
521 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt bfloat16_params
= {
574 static const FloatFmt float32_params
= {
578 static const FloatFmt float64_params
= {
582 /* Unpack a float to parts, but do not canonicalize. */
583 static inline FloatParts64
unpack_raw(FloatFmt fmt
, uint64_t raw
)
585 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
587 return (FloatParts64
) {
588 .cls
= float_class_unclassified
,
589 .sign
= extract64(raw
, sign_pos
, 1),
590 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
591 .frac
= extract64(raw
, 0, fmt
.frac_size
),
595 static inline FloatParts64
float16_unpack_raw(float16 f
)
597 return unpack_raw(float16_params
, f
);
600 static inline FloatParts64
bfloat16_unpack_raw(bfloat16 f
)
602 return unpack_raw(bfloat16_params
, f
);
605 static inline FloatParts64
float32_unpack_raw(float32 f
)
607 return unpack_raw(float32_params
, f
);
610 static inline FloatParts64
float64_unpack_raw(float64 f
)
612 return unpack_raw(float64_params
, f
);
615 /* Pack a float from parts, but do not canonicalize. */
616 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts64 p
)
618 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
619 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
620 return deposit64(ret
, sign_pos
, 1, p
.sign
);
623 static inline float16
float16_pack_raw(FloatParts64 p
)
625 return make_float16(pack_raw(float16_params
, p
));
628 static inline bfloat16
bfloat16_pack_raw(FloatParts64 p
)
630 return pack_raw(bfloat16_params
, p
);
633 static inline float32
float32_pack_raw(FloatParts64 p
)
635 return make_float32(pack_raw(float32_params
, p
));
638 static inline float64
float64_pack_raw(FloatParts64 p
)
640 return make_float64(pack_raw(float64_params
, p
));
643 /*----------------------------------------------------------------------------
644 | Functions and definitions to determine: (1) whether tininess for underflow
645 | is detected before or after rounding by default, (2) what (if anything)
646 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
647 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
648 | are propagated from function inputs to output. These details are target-
650 *----------------------------------------------------------------------------*/
651 #include "softfloat-specialize.c.inc"
653 #define parts_default_nan parts64_default_nan
655 /* Canonicalize EXP and FRAC, setting CLS. */
656 static FloatParts64
sf_canonicalize(FloatParts64 part
, const FloatFmt
*parm
,
657 float_status
*status
)
659 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
660 if (part
.frac
== 0) {
661 part
.cls
= float_class_inf
;
663 part
.frac
<<= parm
->frac_shift
;
664 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
665 ? float_class_snan
: float_class_qnan
);
667 } else if (part
.exp
== 0) {
668 if (likely(part
.frac
== 0)) {
669 part
.cls
= float_class_zero
;
670 } else if (status
->flush_inputs_to_zero
) {
671 float_raise(float_flag_input_denormal
, status
);
672 part
.cls
= float_class_zero
;
675 int shift
= clz64(part
.frac
);
676 part
.cls
= float_class_normal
;
677 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
681 part
.cls
= float_class_normal
;
682 part
.exp
-= parm
->exp_bias
;
683 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
688 /* Round and uncanonicalize a floating-point number by parts. There
689 * are FRAC_SHIFT bits that may require rounding at the bottom of the
690 * fraction; these bits will be removed. The exponent will be biased
691 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
694 static FloatParts64
round_canonical(FloatParts64 p
, float_status
*s
,
695 const FloatFmt
*parm
)
697 const uint64_t frac_lsb
= parm
->frac_lsb
;
698 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
699 const uint64_t round_mask
= parm
->round_mask
;
700 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
701 const int exp_max
= parm
->exp_max
;
702 const int frac_shift
= parm
->frac_shift
;
711 case float_class_normal
:
712 switch (s
->float_rounding_mode
) {
713 case float_round_nearest_even
:
714 overflow_norm
= false;
715 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
717 case float_round_ties_away
:
718 overflow_norm
= false;
721 case float_round_to_zero
:
722 overflow_norm
= true;
726 inc
= p
.sign
? 0 : round_mask
;
727 overflow_norm
= p
.sign
;
729 case float_round_down
:
730 inc
= p
.sign
? round_mask
: 0;
731 overflow_norm
= !p
.sign
;
733 case float_round_to_odd
:
734 overflow_norm
= true;
735 inc
= frac
& frac_lsb
? 0 : round_mask
;
738 g_assert_not_reached();
741 exp
+= parm
->exp_bias
;
742 if (likely(exp
> 0)) {
743 if (frac
& round_mask
) {
744 flags
|= float_flag_inexact
;
745 if (uadd64_overflow(frac
, inc
, &frac
)) {
746 frac
= (frac
>> 1) | DECOMPOSED_IMPLICIT_BIT
;
752 if (parm
->arm_althp
) {
753 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
754 if (unlikely(exp
> exp_max
)) {
755 /* Overflow. Return the maximum normal. */
756 flags
= float_flag_invalid
;
760 } else if (unlikely(exp
>= exp_max
)) {
761 flags
|= float_flag_overflow
| float_flag_inexact
;
766 p
.cls
= float_class_inf
;
770 } else if (s
->flush_to_zero
) {
771 flags
|= float_flag_output_denormal
;
772 p
.cls
= float_class_zero
;
775 bool is_tiny
= s
->tininess_before_rounding
|| (exp
< 0);
779 is_tiny
= !uadd64_overflow(frac
, inc
, &discard
);
782 shift64RightJamming(frac
, 1 - exp
, &frac
);
783 if (frac
& round_mask
) {
784 /* Need to recompute round-to-even. */
785 switch (s
->float_rounding_mode
) {
786 case float_round_nearest_even
:
787 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
790 case float_round_to_odd
:
791 inc
= frac
& frac_lsb
? 0 : round_mask
;
796 flags
|= float_flag_inexact
;
800 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
803 if (is_tiny
&& (flags
& float_flag_inexact
)) {
804 flags
|= float_flag_underflow
;
806 if (exp
== 0 && frac
== 0) {
807 p
.cls
= float_class_zero
;
812 case float_class_zero
:
818 case float_class_inf
:
820 assert(!parm
->arm_althp
);
825 case float_class_qnan
:
826 case float_class_snan
:
827 assert(!parm
->arm_althp
);
829 frac
>>= parm
->frac_shift
;
833 g_assert_not_reached();
836 float_raise(flags
, s
);
842 static FloatParts64
return_nan(FloatParts64 a
, float_status
*s
)
844 g_assert(is_nan(a
.cls
));
845 if (is_snan(a
.cls
)) {
846 float_raise(float_flag_invalid
, s
);
847 if (!s
->default_nan_mode
) {
848 return parts_silence_nan(a
, s
);
850 } else if (!s
->default_nan_mode
) {
853 parts_default_nan(&a
, s
);
857 static FloatParts64
pick_nan(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
859 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
860 float_raise(float_flag_invalid
, s
);
863 if (s
->default_nan_mode
) {
864 parts_default_nan(&a
, s
);
866 if (pickNaN(a
.cls
, b
.cls
,
868 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
871 if (is_snan(a
.cls
)) {
872 return parts_silence_nan(a
, s
);
878 static FloatParts64
pick_nan_muladd(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
879 bool inf_zero
, float_status
*s
)
883 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
884 float_raise(float_flag_invalid
, s
);
887 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
889 if (s
->default_nan_mode
) {
890 /* Note that this check is after pickNaNMulAdd so that function
891 * has an opportunity to set the Invalid flag.
906 parts_default_nan(&a
, s
);
909 g_assert_not_reached();
912 if (is_snan(a
.cls
)) {
913 return parts_silence_nan(a
, s
);
919 * Pack/unpack routines with a specific FloatFmt.
922 static FloatParts64
float16a_unpack_canonical(float16 f
, float_status
*s
,
923 const FloatFmt
*params
)
925 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
928 static FloatParts64
float16_unpack_canonical(float16 f
, float_status
*s
)
930 return float16a_unpack_canonical(f
, s
, &float16_params
);
933 static FloatParts64
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
935 return sf_canonicalize(bfloat16_unpack_raw(f
), &bfloat16_params
, s
);
938 static float16
float16a_round_pack_canonical(FloatParts64 p
, float_status
*s
,
939 const FloatFmt
*params
)
941 return float16_pack_raw(round_canonical(p
, s
, params
));
944 static float16
float16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
946 return float16a_round_pack_canonical(p
, s
, &float16_params
);
949 static bfloat16
bfloat16_round_pack_canonical(FloatParts64 p
, float_status
*s
)
951 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
954 static FloatParts64
float32_unpack_canonical(float32 f
, float_status
*s
)
956 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
959 static float32
float32_round_pack_canonical(FloatParts64 p
, float_status
*s
)
961 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
964 static FloatParts64
float64_unpack_canonical(float64 f
, float_status
*s
)
966 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
969 static float64
float64_round_pack_canonical(FloatParts64 p
, float_status
*s
)
971 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
975 * Returns the result of adding or subtracting the values of the
976 * floating-point values `a' and `b'. The operation is performed
977 * according to the IEC/IEEE Standard for Binary Floating-Point
981 static FloatParts64
addsub_floats(FloatParts64 a
, FloatParts64 b
, bool subtract
,
984 bool a_sign
= a
.sign
;
985 bool b_sign
= b
.sign
^ subtract
;
987 if (a_sign
!= b_sign
) {
990 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
991 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
992 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
993 a
.frac
= a
.frac
- b
.frac
;
995 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
996 a
.frac
= b
.frac
- a
.frac
;
1002 a
.cls
= float_class_zero
;
1003 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1005 int shift
= clz64(a
.frac
);
1006 a
.frac
= a
.frac
<< shift
;
1007 a
.exp
= a
.exp
- shift
;
1012 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1013 return pick_nan(a
, b
, s
);
1015 if (a
.cls
== float_class_inf
) {
1016 if (b
.cls
== float_class_inf
) {
1017 float_raise(float_flag_invalid
, s
);
1018 parts_default_nan(&a
, s
);
1022 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1023 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1026 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1027 b
.sign
= a_sign
^ 1;
1030 if (b
.cls
== float_class_zero
) {
1035 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1036 if (a
.exp
> b
.exp
) {
1037 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1038 } else if (a
.exp
< b
.exp
) {
1039 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1043 if (uadd64_overflow(a
.frac
, b
.frac
, &a
.frac
)) {
1044 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1045 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
1050 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1051 return pick_nan(a
, b
, s
);
1053 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1056 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1061 g_assert_not_reached();
1065 * Returns the result of adding or subtracting the floating-point
1066 * values `a' and `b'. The operation is performed according to the
1067 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1070 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1072 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1073 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1074 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1076 return float16_round_pack_canonical(pr
, status
);
1079 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1081 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1082 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1083 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1085 return float16_round_pack_canonical(pr
, status
);
1088 static float32 QEMU_SOFTFLOAT_ATTR
1089 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1091 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1092 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1093 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1095 return float32_round_pack_canonical(pr
, status
);
1098 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1100 return soft_f32_addsub(a
, b
, false, status
);
1103 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1105 return soft_f32_addsub(a
, b
, true, status
);
1108 static float64 QEMU_SOFTFLOAT_ATTR
1109 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1111 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1112 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1113 FloatParts64 pr
= addsub_floats(pa
, pb
, subtract
, status
);
1115 return float64_round_pack_canonical(pr
, status
);
1118 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1120 return soft_f64_addsub(a
, b
, false, status
);
1123 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1125 return soft_f64_addsub(a
, b
, true, status
);
1128 static float hard_f32_add(float a
, float b
)
1133 static float hard_f32_sub(float a
, float b
)
1138 static double hard_f64_add(double a
, double b
)
1143 static double hard_f64_sub(double a
, double b
)
1148 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1150 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1151 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1153 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1156 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1158 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1159 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1161 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1165 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1166 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1168 return float32_gen2(a
, b
, s
, hard
, soft
,
1169 f32_is_zon2
, f32_addsubmul_post
);
1172 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1173 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1175 return float64_gen2(a
, b
, s
, hard
, soft
,
1176 f64_is_zon2
, f64_addsubmul_post
);
1179 float32 QEMU_FLATTEN
1180 float32_add(float32 a
, float32 b
, float_status
*s
)
1182 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1185 float32 QEMU_FLATTEN
1186 float32_sub(float32 a
, float32 b
, float_status
*s
)
1188 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1191 float64 QEMU_FLATTEN
1192 float64_add(float64 a
, float64 b
, float_status
*s
)
1194 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1197 float64 QEMU_FLATTEN
1198 float64_sub(float64 a
, float64 b
, float_status
*s
)
1200 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1204 * Returns the result of adding or subtracting the bfloat16
1205 * values `a' and `b'.
1207 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1209 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1210 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1211 FloatParts64 pr
= addsub_floats(pa
, pb
, false, status
);
1213 return bfloat16_round_pack_canonical(pr
, status
);
1216 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1218 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1219 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1220 FloatParts64 pr
= addsub_floats(pa
, pb
, true, status
);
1222 return bfloat16_round_pack_canonical(pr
, status
);
1226 * Returns the result of multiplying the floating-point values `a' and
1227 * `b'. The operation is performed according to the IEC/IEEE Standard
1228 * for Binary Floating-Point Arithmetic.
1231 static FloatParts64
mul_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1233 bool sign
= a
.sign
^ b
.sign
;
1235 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1237 int exp
= a
.exp
+ b
.exp
;
1239 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1240 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1253 /* handle all the NaN cases */
1254 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1255 return pick_nan(a
, b
, s
);
1257 /* Inf * Zero == NaN */
1258 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1259 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1260 float_raise(float_flag_invalid
, s
);
1261 parts_default_nan(&a
, s
);
1264 /* Multiply by 0 or Inf */
1265 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1269 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1273 g_assert_not_reached();
1276 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1278 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1279 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1280 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1282 return float16_round_pack_canonical(pr
, status
);
1285 static float32 QEMU_SOFTFLOAT_ATTR
1286 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1288 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1289 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1290 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1292 return float32_round_pack_canonical(pr
, status
);
1295 static float64 QEMU_SOFTFLOAT_ATTR
1296 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1298 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1299 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1300 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1302 return float64_round_pack_canonical(pr
, status
);
1305 static float hard_f32_mul(float a
, float b
)
1310 static double hard_f64_mul(double a
, double b
)
1315 float32 QEMU_FLATTEN
1316 float32_mul(float32 a
, float32 b
, float_status
*s
)
1318 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1319 f32_is_zon2
, f32_addsubmul_post
);
1322 float64 QEMU_FLATTEN
1323 float64_mul(float64 a
, float64 b
, float_status
*s
)
1325 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1326 f64_is_zon2
, f64_addsubmul_post
);
1330 * Returns the result of multiplying the bfloat16
1331 * values `a' and `b'.
1334 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1336 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1337 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1338 FloatParts64 pr
= mul_floats(pa
, pb
, status
);
1340 return bfloat16_round_pack_canonical(pr
, status
);
1344 * Returns the result of multiplying the floating-point values `a' and
1345 * `b' then adding 'c', with no intermediate rounding step after the
1346 * multiplication. The operation is performed according to the
1347 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1348 * The flags argument allows the caller to select negation of the
1349 * addend, the intermediate product, or the final result. (The
1350 * difference between this and having the caller do a separate
1351 * negation is that negating externally will flip the sign bit on
1355 static FloatParts64
muladd_floats(FloatParts64 a
, FloatParts64 b
, FloatParts64 c
,
1356 int flags
, float_status
*s
)
1358 bool inf_zero
, p_sign
;
1359 bool sign_flip
= flags
& float_muladd_negate_result
;
1363 int ab_mask
, abc_mask
;
1365 ab_mask
= float_cmask(a
.cls
) | float_cmask(b
.cls
);
1366 abc_mask
= float_cmask(c
.cls
) | ab_mask
;
1367 inf_zero
= ab_mask
== float_cmask_infzero
;
1369 /* It is implementation-defined whether the cases of (0,inf,qnan)
1370 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1371 * they return if they do), so we have to hand this information
1372 * off to the target-specific pick-a-NaN routine.
1374 if (unlikely(abc_mask
& float_cmask_anynan
)) {
1375 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1379 float_raise(float_flag_invalid
, s
);
1380 parts_default_nan(&a
, s
);
1384 if (flags
& float_muladd_negate_c
) {
1388 p_sign
= a
.sign
^ b
.sign
;
1390 if (flags
& float_muladd_negate_product
) {
1394 if (ab_mask
& float_cmask_inf
) {
1395 p_class
= float_class_inf
;
1396 } else if (ab_mask
& float_cmask_zero
) {
1397 p_class
= float_class_zero
;
1399 p_class
= float_class_normal
;
1402 if (c
.cls
== float_class_inf
) {
1403 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1404 float_raise(float_flag_invalid
, s
);
1405 parts_default_nan(&c
, s
);
1407 c
.sign
^= sign_flip
;
1412 if (p_class
== float_class_inf
) {
1413 a
.cls
= float_class_inf
;
1414 a
.sign
= p_sign
^ sign_flip
;
1418 if (p_class
== float_class_zero
) {
1419 if (c
.cls
== float_class_zero
) {
1420 if (p_sign
!= c
.sign
) {
1421 p_sign
= s
->float_rounding_mode
== float_round_down
;
1424 } else if (flags
& float_muladd_halve_result
) {
1427 c
.sign
^= sign_flip
;
1431 /* a & b should be normals now... */
1432 assert(a
.cls
== float_class_normal
&&
1433 b
.cls
== float_class_normal
);
1435 p_exp
= a
.exp
+ b
.exp
;
1437 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1439 /* Renormalize to the msb. */
1440 if (hi
& DECOMPOSED_IMPLICIT_BIT
) {
1443 shortShift128Left(hi
, lo
, 1, &hi
, &lo
);
1447 if (c
.cls
!= float_class_zero
) {
1448 int exp_diff
= p_exp
- c
.exp
;
1449 if (p_sign
== c
.sign
) {
1451 if (exp_diff
<= 0) {
1452 shift64RightJamming(hi
, -exp_diff
, &hi
);
1454 if (uadd64_overflow(hi
, c
.frac
, &hi
)) {
1455 shift64RightJamming(hi
, 1, &hi
);
1456 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1460 uint64_t c_hi
, c_lo
, over
;
1461 shift128RightJamming(c
.frac
, 0, exp_diff
, &c_hi
, &c_lo
);
1462 add192(0, hi
, lo
, 0, c_hi
, c_lo
, &over
, &hi
, &lo
);
1464 shift64RightJamming(hi
, 1, &hi
);
1465 hi
|= DECOMPOSED_IMPLICIT_BIT
;
1471 uint64_t c_hi
= c
.frac
, c_lo
= 0;
1473 if (exp_diff
<= 0) {
1474 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1477 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1478 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1480 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1485 shift128RightJamming(c_hi
, c_lo
,
1488 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1491 if (hi
== 0 && lo
== 0) {
1492 a
.cls
= float_class_zero
;
1493 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1494 a
.sign
^= sign_flip
;
1501 shift
= clz64(lo
) + 64;
1503 /* Normalizing to a binary point of 124 is the
1504 correct adjust for the exponent. However since we're
1505 shifting, we might as well put the binary point back
1506 at 63 where we really want it. Therefore shift as
1507 if we're leaving 1 bit at the top of the word, but
1508 adjust the exponent as if we're leaving 3 bits. */
1509 shift128Left(hi
, lo
, shift
, &hi
, &lo
);
1516 if (flags
& float_muladd_halve_result
) {
1520 /* finally prepare our result */
1521 a
.cls
= float_class_normal
;
1522 a
.sign
= p_sign
^ sign_flip
;
1529 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1530 int flags
, float_status
*status
)
1532 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1533 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1534 FloatParts64 pc
= float16_unpack_canonical(c
, status
);
1535 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1537 return float16_round_pack_canonical(pr
, status
);
1540 static float32 QEMU_SOFTFLOAT_ATTR
1541 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1542 float_status
*status
)
1544 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1545 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1546 FloatParts64 pc
= float32_unpack_canonical(c
, status
);
1547 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1549 return float32_round_pack_canonical(pr
, status
);
1552 static float64 QEMU_SOFTFLOAT_ATTR
1553 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1554 float_status
*status
)
1556 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1557 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1558 FloatParts64 pc
= float64_unpack_canonical(c
, status
);
1559 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1561 return float64_round_pack_canonical(pr
, status
);
1564 static bool force_soft_fma
;
1566 float32 QEMU_FLATTEN
1567 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1569 union_float32 ua
, ub
, uc
, ur
;
1575 if (unlikely(!can_use_fpu(s
))) {
1578 if (unlikely(flags
& float_muladd_halve_result
)) {
1582 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1583 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1587 if (unlikely(force_soft_fma
)) {
1592 * When (a || b) == 0, there's no need to check for under/over flow,
1593 * since we know the addend is (normal || 0) and the product is 0.
1595 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1599 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1600 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1601 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1603 if (flags
& float_muladd_negate_c
) {
1608 union_float32 ua_orig
= ua
;
1609 union_float32 uc_orig
= uc
;
1611 if (flags
& float_muladd_negate_product
) {
1614 if (flags
& float_muladd_negate_c
) {
1618 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1620 if (unlikely(f32_is_inf(ur
))) {
1621 float_raise(float_flag_overflow
, s
);
1622 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1628 if (flags
& float_muladd_negate_result
) {
1629 return float32_chs(ur
.s
);
1634 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1637 float64 QEMU_FLATTEN
1638 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1640 union_float64 ua
, ub
, uc
, ur
;
1646 if (unlikely(!can_use_fpu(s
))) {
1649 if (unlikely(flags
& float_muladd_halve_result
)) {
1653 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1654 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1658 if (unlikely(force_soft_fma
)) {
1663 * When (a || b) == 0, there's no need to check for under/over flow,
1664 * since we know the addend is (normal || 0) and the product is 0.
1666 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1670 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1671 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1672 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1674 if (flags
& float_muladd_negate_c
) {
1679 union_float64 ua_orig
= ua
;
1680 union_float64 uc_orig
= uc
;
1682 if (flags
& float_muladd_negate_product
) {
1685 if (flags
& float_muladd_negate_c
) {
1689 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1691 if (unlikely(f64_is_inf(ur
))) {
1692 float_raise(float_flag_overflow
, s
);
1693 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1699 if (flags
& float_muladd_negate_result
) {
1700 return float64_chs(ur
.s
);
1705 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1709 * Returns the result of multiplying the bfloat16 values `a'
1710 * and `b' then adding 'c', with no intermediate rounding step after the
1714 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1715 int flags
, float_status
*status
)
1717 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1718 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1719 FloatParts64 pc
= bfloat16_unpack_canonical(c
, status
);
1720 FloatParts64 pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1722 return bfloat16_round_pack_canonical(pr
, status
);
1726 * Returns the result of dividing the floating-point value `a' by the
1727 * corresponding value `b'. The operation is performed according to
1728 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1731 static FloatParts64
div_floats(FloatParts64 a
, FloatParts64 b
, float_status
*s
)
1733 bool sign
= a
.sign
^ b
.sign
;
1735 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1736 uint64_t n0
, n1
, q
, r
;
1737 int exp
= a
.exp
- b
.exp
;
1740 * We want a 2*N / N-bit division to produce exactly an N-bit
1741 * result, so that we do not lose any precision and so that we
1742 * do not have to renormalize afterward. If A.frac < B.frac,
1743 * then division would produce an (N-1)-bit result; shift A left
1744 * by one to produce the an N-bit result, and decrement the
1745 * exponent to match.
1747 * The udiv_qrnnd algorithm that we're using requires normalization,
1748 * i.e. the msb of the denominator must be set, which is already true.
1750 if (a
.frac
< b
.frac
) {
1752 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1754 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
, &n1
, &n0
);
1756 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
);
1758 /* Set lsb if there is a remainder, to set inexact. */
1759 a
.frac
= q
| (r
!= 0);
1764 /* handle all the NaN cases */
1765 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1766 return pick_nan(a
, b
, s
);
1768 /* 0/0 or Inf/Inf */
1771 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1772 float_raise(float_flag_invalid
, s
);
1773 parts_default_nan(&a
, s
);
1776 /* Inf / x or 0 / x */
1777 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1782 if (b
.cls
== float_class_zero
) {
1783 float_raise(float_flag_divbyzero
, s
);
1784 a
.cls
= float_class_inf
;
1789 if (b
.cls
== float_class_inf
) {
1790 a
.cls
= float_class_zero
;
1794 g_assert_not_reached();
1797 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1799 FloatParts64 pa
= float16_unpack_canonical(a
, status
);
1800 FloatParts64 pb
= float16_unpack_canonical(b
, status
);
1801 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1803 return float16_round_pack_canonical(pr
, status
);
1806 static float32 QEMU_SOFTFLOAT_ATTR
1807 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1809 FloatParts64 pa
= float32_unpack_canonical(a
, status
);
1810 FloatParts64 pb
= float32_unpack_canonical(b
, status
);
1811 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1813 return float32_round_pack_canonical(pr
, status
);
1816 static float64 QEMU_SOFTFLOAT_ATTR
1817 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1819 FloatParts64 pa
= float64_unpack_canonical(a
, status
);
1820 FloatParts64 pb
= float64_unpack_canonical(b
, status
);
1821 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1823 return float64_round_pack_canonical(pr
, status
);
1826 static float hard_f32_div(float a
, float b
)
1831 static double hard_f64_div(double a
, double b
)
1836 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1838 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1839 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1840 fpclassify(b
.h
) == FP_NORMAL
;
1842 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1845 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1847 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1848 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1849 fpclassify(b
.h
) == FP_NORMAL
;
1851 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1854 static bool f32_div_post(union_float32 a
, union_float32 b
)
1856 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1857 return fpclassify(a
.h
) != FP_ZERO
;
1859 return !float32_is_zero(a
.s
);
1862 static bool f64_div_post(union_float64 a
, union_float64 b
)
1864 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1865 return fpclassify(a
.h
) != FP_ZERO
;
1867 return !float64_is_zero(a
.s
);
1870 float32 QEMU_FLATTEN
1871 float32_div(float32 a
, float32 b
, float_status
*s
)
1873 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1874 f32_div_pre
, f32_div_post
);
1877 float64 QEMU_FLATTEN
1878 float64_div(float64 a
, float64 b
, float_status
*s
)
1880 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1881 f64_div_pre
, f64_div_post
);
1885 * Returns the result of dividing the bfloat16
1886 * value `a' by the corresponding value `b'.
1889 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1891 FloatParts64 pa
= bfloat16_unpack_canonical(a
, status
);
1892 FloatParts64 pb
= bfloat16_unpack_canonical(b
, status
);
1893 FloatParts64 pr
= div_floats(pa
, pb
, status
);
1895 return bfloat16_round_pack_canonical(pr
, status
);
1899 * Float to Float conversions
1901 * Returns the result of converting one float format to another. The
1902 * conversion is performed according to the IEC/IEEE Standard for
1903 * Binary Floating-Point Arithmetic.
1905 * The float_to_float helper only needs to take care of raising
1906 * invalid exceptions and handling the conversion on NaNs.
1909 static FloatParts64
float_to_float(FloatParts64 a
, const FloatFmt
*dstf
,
1912 if (dstf
->arm_althp
) {
1914 case float_class_qnan
:
1915 case float_class_snan
:
1916 /* There is no NaN in the destination format. Raise Invalid
1917 * and return a zero with the sign of the input NaN.
1919 float_raise(float_flag_invalid
, s
);
1920 a
.cls
= float_class_zero
;
1925 case float_class_inf
:
1926 /* There is no Inf in the destination format. Raise Invalid
1927 * and return the maximum normal with the correct sign.
1929 float_raise(float_flag_invalid
, s
);
1930 a
.cls
= float_class_normal
;
1931 a
.exp
= dstf
->exp_max
;
1932 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1938 } else if (is_nan(a
.cls
)) {
1939 return return_nan(a
, s
);
1944 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1946 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1947 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1948 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
1949 return float32_round_pack_canonical(pr
, s
);
1952 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1954 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1955 FloatParts64 p
= float16a_unpack_canonical(a
, s
, fmt16
);
1956 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1957 return float64_round_pack_canonical(pr
, s
);
1960 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1962 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1963 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1964 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1965 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1968 static float64 QEMU_SOFTFLOAT_ATTR
1969 soft_float32_to_float64(float32 a
, float_status
*s
)
1971 FloatParts64 p
= float32_unpack_canonical(a
, s
);
1972 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
1973 return float64_round_pack_canonical(pr
, s
);
1976 float64
float32_to_float64(float32 a
, float_status
*s
)
1978 if (likely(float32_is_normal(a
))) {
1979 /* Widening conversion can never produce inexact results. */
1985 } else if (float32_is_zero(a
)) {
1986 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1988 return soft_float32_to_float64(a
, s
);
1992 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1994 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1995 FloatParts64 p
= float64_unpack_canonical(a
, s
);
1996 FloatParts64 pr
= float_to_float(p
, fmt16
, s
);
1997 return float16a_round_pack_canonical(pr
, s
, fmt16
);
2000 float32
float64_to_float32(float64 a
, float_status
*s
)
2002 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2003 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2004 return float32_round_pack_canonical(pr
, s
);
2007 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2009 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2010 FloatParts64 pr
= float_to_float(p
, &float32_params
, s
);
2011 return float32_round_pack_canonical(pr
, s
);
2014 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2016 FloatParts64 p
= bfloat16_unpack_canonical(a
, s
);
2017 FloatParts64 pr
= float_to_float(p
, &float64_params
, s
);
2018 return float64_round_pack_canonical(pr
, s
);
2021 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2023 FloatParts64 p
= float32_unpack_canonical(a
, s
);
2024 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2025 return bfloat16_round_pack_canonical(pr
, s
);
2028 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2030 FloatParts64 p
= float64_unpack_canonical(a
, s
);
2031 FloatParts64 pr
= float_to_float(p
, &bfloat16_params
, s
);
2032 return bfloat16_round_pack_canonical(pr
, s
);
2036 * Rounds the floating-point value `a' to an integer, and returns the
2037 * result as a floating-point value. The operation is performed
2038 * according to the IEC/IEEE Standard for Binary Floating-Point
2042 static FloatParts64
round_to_int(FloatParts64 a
, FloatRoundMode rmode
,
2043 int scale
, float_status
*s
)
2046 case float_class_qnan
:
2047 case float_class_snan
:
2048 return return_nan(a
, s
);
2050 case float_class_zero
:
2051 case float_class_inf
:
2052 /* already "integral" */
2055 case float_class_normal
:
2056 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2059 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2060 /* already integral */
2065 /* all fractional */
2066 float_raise(float_flag_inexact
, s
);
2068 case float_round_nearest_even
:
2069 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2071 case float_round_ties_away
:
2072 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2074 case float_round_to_zero
:
2077 case float_round_up
:
2080 case float_round_down
:
2083 case float_round_to_odd
:
2087 g_assert_not_reached();
2091 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2094 a
.cls
= float_class_zero
;
2097 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2098 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2099 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2100 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2104 case float_round_nearest_even
:
2105 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2107 case float_round_ties_away
:
2110 case float_round_to_zero
:
2113 case float_round_up
:
2114 inc
= a
.sign
? 0 : rnd_mask
;
2116 case float_round_down
:
2117 inc
= a
.sign
? rnd_mask
: 0;
2119 case float_round_to_odd
:
2120 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2123 g_assert_not_reached();
2126 if (a
.frac
& rnd_mask
) {
2127 float_raise(float_flag_inexact
, s
);
2128 if (uadd64_overflow(a
.frac
, inc
, &a
.frac
)) {
2130 a
.frac
|= DECOMPOSED_IMPLICIT_BIT
;
2133 a
.frac
&= ~rnd_mask
;
2138 g_assert_not_reached();
2143 float16
float16_round_to_int(float16 a
, float_status
*s
)
2145 FloatParts64 pa
= float16_unpack_canonical(a
, s
);
2146 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2147 return float16_round_pack_canonical(pr
, s
);
2150 float32
float32_round_to_int(float32 a
, float_status
*s
)
2152 FloatParts64 pa
= float32_unpack_canonical(a
, s
);
2153 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2154 return float32_round_pack_canonical(pr
, s
);
2157 float64
float64_round_to_int(float64 a
, float_status
*s
)
2159 FloatParts64 pa
= float64_unpack_canonical(a
, s
);
2160 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2161 return float64_round_pack_canonical(pr
, s
);
2165 * Rounds the bfloat16 value `a' to an integer, and returns the
2166 * result as a bfloat16 value.
2169 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2171 FloatParts64 pa
= bfloat16_unpack_canonical(a
, s
);
2172 FloatParts64 pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2173 return bfloat16_round_pack_canonical(pr
, s
);
2177 * Returns the result of converting the floating-point value `a' to
2178 * the two's complement integer format. The conversion is performed
2179 * according to the IEC/IEEE Standard for Binary Floating-Point
2180 * Arithmetic---which means in particular that the conversion is
2181 * rounded according to the current rounding mode. If `a' is a NaN,
2182 * the largest positive integer is returned. Otherwise, if the
2183 * conversion overflows, the largest integer with the same sign as `a'
2187 static int64_t round_to_int_and_pack(FloatParts64 in
, FloatRoundMode rmode
,
2188 int scale
, int64_t min
, int64_t max
,
2192 int orig_flags
= get_float_exception_flags(s
);
2193 FloatParts64 p
= round_to_int(in
, rmode
, scale
, s
);
2196 case float_class_snan
:
2197 case float_class_qnan
:
2198 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2200 case float_class_inf
:
2201 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2202 return p
.sign
? min
: max
;
2203 case float_class_zero
:
2205 case float_class_normal
:
2206 if (p
.exp
<= DECOMPOSED_BINARY_POINT
) {
2207 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2212 if (r
<= -(uint64_t) min
) {
2215 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2222 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2227 g_assert_not_reached();
2231 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2234 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2235 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2238 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2241 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2242 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2245 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2248 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2249 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2252 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2255 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2256 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2259 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2262 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2263 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2266 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2269 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2270 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2273 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2276 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2277 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2280 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2283 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2284 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2287 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2290 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2291 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2294 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2297 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2298 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2301 int8_t float16_to_int8(float16 a
, float_status
*s
)
2303 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2306 int16_t float16_to_int16(float16 a
, float_status
*s
)
2308 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2311 int32_t float16_to_int32(float16 a
, float_status
*s
)
2313 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2316 int64_t float16_to_int64(float16 a
, float_status
*s
)
2318 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2321 int16_t float32_to_int16(float32 a
, float_status
*s
)
2323 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2326 int32_t float32_to_int32(float32 a
, float_status
*s
)
2328 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2331 int64_t float32_to_int64(float32 a
, float_status
*s
)
2333 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2336 int16_t float64_to_int16(float64 a
, float_status
*s
)
2338 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2341 int32_t float64_to_int32(float64 a
, float_status
*s
)
2343 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2346 int64_t float64_to_int64(float64 a
, float_status
*s
)
2348 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
/*
 * Signed conversions with an explicit round-toward-zero (truncation)
 * rounding mode, independent of the mode stored in the float_status.
 */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}
/* bfloat16 signed conversions: current rounding mode, then truncating. */

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Snapshot the flags before rounding so that invalid-operation
     * outcomes can replace whatever round_to_int raised with a clean
     * "original flags | invalid" result.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* A negative value that survived rounding as a nonzero normal
         * cannot be represented in an unsigned type.
         */
        if (p.sign) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}
/*
 * Unsigned conversions with explicit rounding mode and scale; each
 * unpacks to the canonical FloatParts64 form and defers to
 * round_to_uint_and_pack with the destination type's maximum.
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}
/* Unsigned conversions using the status's current rounding mode. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
/* Unsigned conversions with explicit round-toward-zero semantics. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
                                  rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        /* Take the magnitude, remembering the sign for repacking.
         * The unsigned negation is well-defined even for INT64_MIN.
         */
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        /* Clamp the scale so the exponent arithmetic below cannot
         * overflow the int32_t backing FloatParts64.exp.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
/*
 * Signed integer to float16/float32/float64 conversions.  The 64-bit
 * _scalbn variant does the real work; narrower widths widen to
 * int64_t (always exact) and forward to it.
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}
/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale so the exponent arithmetic below cannot
         * overflow the int32_t backing FloatParts64.exp.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        int shift = clz64(a);

        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}
/*
 * Unsigned integer to float conversions.  As with the signed set, the
 * 64-bit _scalbn variant does the work and narrower widths widen
 * (exactly) to uint64_t before forwarding.
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and minNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Flatten the class into an effective exponent so zeroes and
         * infinities order correctly against normal numbers.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude comparison ignores the signs unless the magnitudes
         * are equal, in which case we fall through to the signed path.
         */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs: the negative operand is the smaller. */
            return a.sign ^ ismin ? b : a;
        }
    }
}
/* Expand the public float16/32/64 min/max family from minmax_floats.
 * Note: the argument spelling "isiee" is historical and matches the
 * instantiation comments below.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);    \
                                                                        \
    return float ## sz ## _round_pack_canonical(pr, s);                 \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
/* Same expansion for the bfloat16 min/max family. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);                  \
    FloatParts64 pb = bfloat16_unpack_canonical(b, s);                  \
    FloatParts64 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);    \
                                                                        \
    return bfloat16_round_pack_canonical(pr, s);                        \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
/* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* Quiet compares only signal invalid for signalling NaNs. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponents, then fractions; a negative sign
     * inverts the magnitude ordering.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}
/* Expand the softfloat comparison helpers for each binary width. */
#define COMPARE(name, attr, sz)                                         \
static FloatRelation attr                                               \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa = float ## sz ## _unpack_canonical(a, s);           \
    FloatParts64 pb = float ## sz ## _unpack_canonical(b, s);           \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}
/* Hardfloat fast path for float32 comparison: use the host FPU's
 * type-generic comparison macros when possible, falling back to the
 * softfloat implementation for the unordered case (to set flags).
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
/* Hardfloat fast path for float64 comparison; mirrors f32_compare. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}
/* bfloat16 comparison has no hardfloat fast path; always softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, s);
    FloatParts64 pb = bfloat16_unpack_canonical(b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
/* Multiply A by 2 raised to the power N.  */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    /* Zeroes and infinities pass through unchanged. */
    return a;
}
/* Public scalbn entry points: unpack, scale, repack with rounding. */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa = float16_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa = float32_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa = float64_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, status);
    FloatParts64 pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(pr, status);
}
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative number: invalid operation, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
/* Softfloat sqrt entry points built on sqrt_float. */

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa = float16_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa = float32_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa = float64_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(pr, status);
}
/* Hardfloat sqrt fast paths: use the host sqrtf()/sqrt() for inputs
 * that are zero or normal and non-negative, otherwise fall back to
 * the softfloat implementation for full flag/NaN semantics.
 */

float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}
/* bfloat16 sqrt: no hardfloat fast path, always softfloat. */
bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa = bfloat16_unpack_canonical(a, status);
    FloatParts64 pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(pr, status);
}
3578 /*----------------------------------------------------------------------------
3579 | The pattern for a default generated NaN.
3580 *----------------------------------------------------------------------------*/
3582 float16
float16_default_nan(float_status
*status
)
3586 parts_default_nan(&p
, status
);
3587 p
.frac
>>= float16_params
.frac_shift
;
3588 return float16_pack_raw(p
);
3591 float32
float32_default_nan(float_status
*status
)
3595 parts_default_nan(&p
, status
);
3596 p
.frac
>>= float32_params
.frac_shift
;
3597 return float32_pack_raw(p
);
3600 float64
float64_default_nan(float_status
*status
)
3604 parts_default_nan(&p
, status
);
3605 p
.frac
>>= float64_params
.frac_shift
;
3606 return float64_pack_raw(p
);
3609 float128
float128_default_nan(float_status
*status
)
3614 parts_default_nan(&p
, status
);
3615 /* Extrapolate from the choices made by parts_default_nan to fill
3616 * in the quad-floating format. If the low bit is set, assume we
3617 * want to set all non-snan bits.
3619 r
.low
= -(p
.frac
& 1);
3620 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3621 r
.high
|= UINT64_C(0x7FFF000000000000);
3622 r
.high
|= (uint64_t)p
.sign
<< 63;
3627 bfloat16
bfloat16_default_nan(float_status
*status
)
3631 parts_default_nan(&p
, status
);
3632 p
.frac
>>= bfloat16_params
.frac_shift
;
3633 return bfloat16_pack_raw(p
);
3636 /*----------------------------------------------------------------------------
3637 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3638 *----------------------------------------------------------------------------*/
3640 float16
float16_silence_nan(float16 a
, float_status
*status
)
3642 FloatParts64 p
= float16_unpack_raw(a
);
3643 p
.frac
<<= float16_params
.frac_shift
;
3644 p
= parts_silence_nan(p
, status
);
3645 p
.frac
>>= float16_params
.frac_shift
;
3646 return float16_pack_raw(p
);
3649 float32
float32_silence_nan(float32 a
, float_status
*status
)
3651 FloatParts64 p
= float32_unpack_raw(a
);
3652 p
.frac
<<= float32_params
.frac_shift
;
3653 p
= parts_silence_nan(p
, status
);
3654 p
.frac
>>= float32_params
.frac_shift
;
3655 return float32_pack_raw(p
);
3658 float64
float64_silence_nan(float64 a
, float_status
*status
)
3660 FloatParts64 p
= float64_unpack_raw(a
);
3661 p
.frac
<<= float64_params
.frac_shift
;
3662 p
= parts_silence_nan(p
, status
);
3663 p
.frac
>>= float64_params
.frac_shift
;
3664 return float64_pack_raw(p
);
3667 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3669 FloatParts64 p
= bfloat16_unpack_raw(a
);
3670 p
.frac
<<= bfloat16_params
.frac_shift
;
3671 p
= parts_silence_nan(p
, status
);
3672 p
.frac
>>= bfloat16_params
.frac_shift
;
3673 return bfloat16_pack_raw(p
);
3676 /*----------------------------------------------------------------------------
3677 | If `a' is denormal and we are in flush-to-zero mode then set the
3678 | input-denormal exception and return zero. Otherwise just return the value.
3679 *----------------------------------------------------------------------------*/
3681 static bool parts_squash_denormal(FloatParts64 p
, float_status
*status
)
3683 if (p
.exp
== 0 && p
.frac
!= 0) {
3684 float_raise(float_flag_input_denormal
, status
);
3691 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3693 if (status
->flush_inputs_to_zero
) {
3694 FloatParts64 p
= float16_unpack_raw(a
);
3695 if (parts_squash_denormal(p
, status
)) {
3696 return float16_set_sign(float16_zero
, p
.sign
);
3702 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3704 if (status
->flush_inputs_to_zero
) {
3705 FloatParts64 p
= float32_unpack_raw(a
);
3706 if (parts_squash_denormal(p
, status
)) {
3707 return float32_set_sign(float32_zero
, p
.sign
);
3713 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3715 if (status
->flush_inputs_to_zero
) {
3716 FloatParts64 p
= float64_unpack_raw(a
);
3717 if (parts_squash_denormal(p
, status
)) {
3718 return float64_set_sign(float64_zero
, p
.sign
);
3724 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3726 if (status
->flush_inputs_to_zero
) {
3727 FloatParts64 p
= bfloat16_unpack_raw(a
);
3728 if (parts_squash_denormal(p
, status
)) {
3729 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3735 /*----------------------------------------------------------------------------
3736 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3737 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3738 | input. If `zSign' is 1, the input is negated before being converted to an
3739 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3740 | is simply rounded to an integer, with the inexact exception raised if the
3741 | input cannot be represented exactly as an integer. However, if the fixed-
3742 | point input is too large, the invalid exception is raised and the largest
3743 | positive or negative integer is returned.
3744 *----------------------------------------------------------------------------*/
3746 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3747 float_status
*status
)
3749 int8_t roundingMode
;
3750 bool roundNearestEven
;
3751 int8_t roundIncrement
, roundBits
;
3754 roundingMode
= status
->float_rounding_mode
;
3755 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3756 switch (roundingMode
) {
3757 case float_round_nearest_even
:
3758 case float_round_ties_away
:
3759 roundIncrement
= 0x40;
3761 case float_round_to_zero
:
3764 case float_round_up
:
3765 roundIncrement
= zSign
? 0 : 0x7f;
3767 case float_round_down
:
3768 roundIncrement
= zSign
? 0x7f : 0;
3770 case float_round_to_odd
:
3771 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3776 roundBits
= absZ
& 0x7F;
3777 absZ
= ( absZ
+ roundIncrement
)>>7;
3778 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3782 if ( zSign
) z
= - z
;
3783 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3784 float_raise(float_flag_invalid
, status
);
3785 return zSign
? INT32_MIN
: INT32_MAX
;
3788 float_raise(float_flag_inexact
, status
);
3794 /*----------------------------------------------------------------------------
3795 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3796 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3797 | and returns the properly rounded 64-bit integer corresponding to the input.
3798 | If `zSign' is 1, the input is negated before being converted to an integer.
3799 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3800 | the inexact exception raised if the input cannot be represented exactly as
3801 | an integer. However, if the fixed-point input is too large, the invalid
3802 | exception is raised and the largest positive or negative integer is
3804 *----------------------------------------------------------------------------*/
3806 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3807 float_status
*status
)
3809 int8_t roundingMode
;
3810 bool roundNearestEven
, increment
;
3813 roundingMode
= status
->float_rounding_mode
;
3814 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3815 switch (roundingMode
) {
3816 case float_round_nearest_even
:
3817 case float_round_ties_away
:
3818 increment
= ((int64_t) absZ1
< 0);
3820 case float_round_to_zero
:
3823 case float_round_up
:
3824 increment
= !zSign
&& absZ1
;
3826 case float_round_down
:
3827 increment
= zSign
&& absZ1
;
3829 case float_round_to_odd
:
3830 increment
= !(absZ0
& 1) && absZ1
;
3837 if ( absZ0
== 0 ) goto overflow
;
3838 if (!(absZ1
<< 1) && roundNearestEven
) {
3843 if ( zSign
) z
= - z
;
3844 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3846 float_raise(float_flag_invalid
, status
);
3847 return zSign
? INT64_MIN
: INT64_MAX
;
3850 float_raise(float_flag_inexact
, status
);
3856 /*----------------------------------------------------------------------------
3857 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3858 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3859 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3860 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3861 | with the inexact exception raised if the input cannot be represented exactly
3862 | as an integer. However, if the fixed-point input is too large, the invalid
3863 | exception is raised and the largest unsigned integer is returned.
3864 *----------------------------------------------------------------------------*/
3866 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3867 uint64_t absZ1
, float_status
*status
)
3869 int8_t roundingMode
;
3870 bool roundNearestEven
, increment
;
3872 roundingMode
= status
->float_rounding_mode
;
3873 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3874 switch (roundingMode
) {
3875 case float_round_nearest_even
:
3876 case float_round_ties_away
:
3877 increment
= ((int64_t)absZ1
< 0);
3879 case float_round_to_zero
:
3882 case float_round_up
:
3883 increment
= !zSign
&& absZ1
;
3885 case float_round_down
:
3886 increment
= zSign
&& absZ1
;
3888 case float_round_to_odd
:
3889 increment
= !(absZ0
& 1) && absZ1
;
3897 float_raise(float_flag_invalid
, status
);
3900 if (!(absZ1
<< 1) && roundNearestEven
) {
3905 if (zSign
&& absZ0
) {
3906 float_raise(float_flag_invalid
, status
);
3911 float_raise(float_flag_inexact
, status
);
3916 /*----------------------------------------------------------------------------
3917 | Normalizes the subnormal single-precision floating-point value represented
3918 | by the denormalized significand `aSig'. The normalized exponent and
3919 | significand are stored at the locations pointed to by `zExpPtr' and
3920 | `zSigPtr', respectively.
3921 *----------------------------------------------------------------------------*/
3924 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3928 shiftCount
= clz32(aSig
) - 8;
3929 *zSigPtr
= aSig
<<shiftCount
;
3930 *zExpPtr
= 1 - shiftCount
;
3934 /*----------------------------------------------------------------------------
3935 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3936 | and significand `zSig', and returns the proper single-precision floating-
3937 | point value corresponding to the abstract input. Ordinarily, the abstract
3938 | value is simply rounded and packed into the single-precision format, with
3939 | the inexact exception raised if the abstract input cannot be represented
3940 | exactly. However, if the abstract value is too large, the overflow and
3941 | inexact exceptions are raised and an infinity or maximal finite value is
3942 | returned. If the abstract value is too small, the input value is rounded to
3943 | a subnormal number, and the underflow and inexact exceptions are raised if
3944 | the abstract input cannot be represented exactly as a subnormal single-
3945 | precision floating-point number.
3946 | The input significand `zSig' has its binary point between bits 30
3947 | and 29, which is 7 bits to the left of the usual location. This shifted
3948 | significand must be normalized or smaller. If `zSig' is not normalized,
3949 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3950 | and it must not require rounding. In the usual case that `zSig' is
3951 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3952 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3956 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3957 float_status
*status
)
3959 int8_t roundingMode
;
3960 bool roundNearestEven
;
3961 int8_t roundIncrement
, roundBits
;
3964 roundingMode
= status
->float_rounding_mode
;
3965 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3966 switch (roundingMode
) {
3967 case float_round_nearest_even
:
3968 case float_round_ties_away
:
3969 roundIncrement
= 0x40;
3971 case float_round_to_zero
:
3974 case float_round_up
:
3975 roundIncrement
= zSign
? 0 : 0x7f;
3977 case float_round_down
:
3978 roundIncrement
= zSign
? 0x7f : 0;
3980 case float_round_to_odd
:
3981 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3987 roundBits
= zSig
& 0x7F;
3988 if ( 0xFD <= (uint16_t) zExp
) {
3989 if ( ( 0xFD < zExp
)
3990 || ( ( zExp
== 0xFD )
3991 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3993 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3994 roundIncrement
!= 0;
3995 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3996 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3999 if (status
->flush_to_zero
) {
4000 float_raise(float_flag_output_denormal
, status
);
4001 return packFloat32(zSign
, 0, 0);
4003 isTiny
= status
->tininess_before_rounding
4005 || (zSig
+ roundIncrement
< 0x80000000);
4006 shift32RightJamming( zSig
, - zExp
, &zSig
);
4008 roundBits
= zSig
& 0x7F;
4009 if (isTiny
&& roundBits
) {
4010 float_raise(float_flag_underflow
, status
);
4012 if (roundingMode
== float_round_to_odd
) {
4014 * For round-to-odd case, the roundIncrement depends on
4015 * zSig which just changed.
4017 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4022 float_raise(float_flag_inexact
, status
);
4024 zSig
= ( zSig
+ roundIncrement
)>>7;
4025 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4028 if ( zSig
== 0 ) zExp
= 0;
4029 return packFloat32( zSign
, zExp
, zSig
);
4033 /*----------------------------------------------------------------------------
4034 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4035 | and significand `zSig', and returns the proper single-precision floating-
4036 | point value corresponding to the abstract input. This routine is just like
4037 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4038 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4039 | floating-point exponent.
4040 *----------------------------------------------------------------------------*/
4043 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4044 float_status
*status
)
4048 shiftCount
= clz32(zSig
) - 1;
4049 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4054 /*----------------------------------------------------------------------------
4055 | Normalizes the subnormal double-precision floating-point value represented
4056 | by the denormalized significand `aSig'. The normalized exponent and
4057 | significand are stored at the locations pointed to by `zExpPtr' and
4058 | `zSigPtr', respectively.
4059 *----------------------------------------------------------------------------*/
4062 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4066 shiftCount
= clz64(aSig
) - 11;
4067 *zSigPtr
= aSig
<<shiftCount
;
4068 *zExpPtr
= 1 - shiftCount
;
4072 /*----------------------------------------------------------------------------
4073 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4074 | double-precision floating-point value, returning the result. After being
4075 | shifted into the proper positions, the three fields are simply added
4076 | together to form the result. This means that any integer portion of `zSig'
4077 | will be added into the exponent. Since a properly normalized significand
4078 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4079 | than the desired result exponent whenever `zSig' is a complete, normalized
4081 *----------------------------------------------------------------------------*/
4083 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4086 return make_float64(
4087 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4091 /*----------------------------------------------------------------------------
4092 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4093 | and significand `zSig', and returns the proper double-precision floating-
4094 | point value corresponding to the abstract input. Ordinarily, the abstract
4095 | value is simply rounded and packed into the double-precision format, with
4096 | the inexact exception raised if the abstract input cannot be represented
4097 | exactly. However, if the abstract value is too large, the overflow and
4098 | inexact exceptions are raised and an infinity or maximal finite value is
4099 | returned. If the abstract value is too small, the input value is rounded to
4100 | a subnormal number, and the underflow and inexact exceptions are raised if
4101 | the abstract input cannot be represented exactly as a subnormal double-
4102 | precision floating-point number.
4103 | The input significand `zSig' has its binary point between bits 62
4104 | and 61, which is 10 bits to the left of the usual location. This shifted
4105 | significand must be normalized or smaller. If `zSig' is not normalized,
4106 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4107 | and it must not require rounding. In the usual case that `zSig' is
4108 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4109 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4110 | Binary Floating-Point Arithmetic.
4111 *----------------------------------------------------------------------------*/
4113 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4114 float_status
*status
)
4116 int8_t roundingMode
;
4117 bool roundNearestEven
;
4118 int roundIncrement
, roundBits
;
4121 roundingMode
= status
->float_rounding_mode
;
4122 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4123 switch (roundingMode
) {
4124 case float_round_nearest_even
:
4125 case float_round_ties_away
:
4126 roundIncrement
= 0x200;
4128 case float_round_to_zero
:
4131 case float_round_up
:
4132 roundIncrement
= zSign
? 0 : 0x3ff;
4134 case float_round_down
:
4135 roundIncrement
= zSign
? 0x3ff : 0;
4137 case float_round_to_odd
:
4138 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4143 roundBits
= zSig
& 0x3FF;
4144 if ( 0x7FD <= (uint16_t) zExp
) {
4145 if ( ( 0x7FD < zExp
)
4146 || ( ( zExp
== 0x7FD )
4147 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4149 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4150 roundIncrement
!= 0;
4151 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4152 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4155 if (status
->flush_to_zero
) {
4156 float_raise(float_flag_output_denormal
, status
);
4157 return packFloat64(zSign
, 0, 0);
4159 isTiny
= status
->tininess_before_rounding
4161 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4162 shift64RightJamming( zSig
, - zExp
, &zSig
);
4164 roundBits
= zSig
& 0x3FF;
4165 if (isTiny
&& roundBits
) {
4166 float_raise(float_flag_underflow
, status
);
4168 if (roundingMode
== float_round_to_odd
) {
4170 * For round-to-odd case, the roundIncrement depends on
4171 * zSig which just changed.
4173 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4178 float_raise(float_flag_inexact
, status
);
4180 zSig
= ( zSig
+ roundIncrement
)>>10;
4181 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4184 if ( zSig
== 0 ) zExp
= 0;
4185 return packFloat64( zSign
, zExp
, zSig
);
4189 /*----------------------------------------------------------------------------
4190 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4191 | and significand `zSig', and returns the proper double-precision floating-
4192 | point value corresponding to the abstract input. This routine is just like
4193 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4194 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4195 | floating-point exponent.
4196 *----------------------------------------------------------------------------*/
4199 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4200 float_status
*status
)
4204 shiftCount
= clz64(zSig
) - 1;
4205 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4210 /*----------------------------------------------------------------------------
4211 | Normalizes the subnormal extended double-precision floating-point value
4212 | represented by the denormalized significand `aSig'. The normalized exponent
4213 | and significand are stored at the locations pointed to by `zExpPtr' and
4214 | `zSigPtr', respectively.
4215 *----------------------------------------------------------------------------*/
4217 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4222 shiftCount
= clz64(aSig
);
4223 *zSigPtr
= aSig
<<shiftCount
;
4224 *zExpPtr
= 1 - shiftCount
;
4227 /*----------------------------------------------------------------------------
4228 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4229 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4230 | and returns the proper extended double-precision floating-point value
4231 | corresponding to the abstract input. Ordinarily, the abstract value is
4232 | rounded and packed into the extended double-precision format, with the
4233 | inexact exception raised if the abstract input cannot be represented
4234 | exactly. However, if the abstract value is too large, the overflow and
4235 | inexact exceptions are raised and an infinity or maximal finite value is
4236 | returned. If the abstract value is too small, the input value is rounded to
4237 | a subnormal number, and the underflow and inexact exceptions are raised if
4238 | the abstract input cannot be represented exactly as a subnormal extended
4239 | double-precision floating-point number.
4240 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4241 | number of bits as single or double precision, respectively. Otherwise, the
4242 | result is rounded to the full precision of the extended double-precision
4244 | The input significand must be normalized or smaller. If the input
4245 | significand is not normalized, `zExp' must be 0; in that case, the result
4246 | returned is a subnormal number, and it must not require rounding. The
4247 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4251 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4252 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4253 float_status
*status
)
4255 int8_t roundingMode
;
4256 bool roundNearestEven
, increment
, isTiny
;
4257 int64_t roundIncrement
, roundMask
, roundBits
;
4259 roundingMode
= status
->float_rounding_mode
;
4260 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4261 if ( roundingPrecision
== 80 ) goto precision80
;
4262 if ( roundingPrecision
== 64 ) {
4263 roundIncrement
= UINT64_C(0x0000000000000400);
4264 roundMask
= UINT64_C(0x00000000000007FF);
4266 else if ( roundingPrecision
== 32 ) {
4267 roundIncrement
= UINT64_C(0x0000008000000000);
4268 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4273 zSig0
|= ( zSig1
!= 0 );
4274 switch (roundingMode
) {
4275 case float_round_nearest_even
:
4276 case float_round_ties_away
:
4278 case float_round_to_zero
:
4281 case float_round_up
:
4282 roundIncrement
= zSign
? 0 : roundMask
;
4284 case float_round_down
:
4285 roundIncrement
= zSign
? roundMask
: 0;
4290 roundBits
= zSig0
& roundMask
;
4291 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4292 if ( ( 0x7FFE < zExp
)
4293 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4298 if (status
->flush_to_zero
) {
4299 float_raise(float_flag_output_denormal
, status
);
4300 return packFloatx80(zSign
, 0, 0);
4302 isTiny
= status
->tininess_before_rounding
4304 || (zSig0
<= zSig0
+ roundIncrement
);
4305 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4307 roundBits
= zSig0
& roundMask
;
4308 if (isTiny
&& roundBits
) {
4309 float_raise(float_flag_underflow
, status
);
4312 float_raise(float_flag_inexact
, status
);
4314 zSig0
+= roundIncrement
;
4315 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4316 roundIncrement
= roundMask
+ 1;
4317 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4318 roundMask
|= roundIncrement
;
4320 zSig0
&= ~ roundMask
;
4321 return packFloatx80( zSign
, zExp
, zSig0
);
4325 float_raise(float_flag_inexact
, status
);
4327 zSig0
+= roundIncrement
;
4328 if ( zSig0
< roundIncrement
) {
4330 zSig0
= UINT64_C(0x8000000000000000);
4332 roundIncrement
= roundMask
+ 1;
4333 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4334 roundMask
|= roundIncrement
;
4336 zSig0
&= ~ roundMask
;
4337 if ( zSig0
== 0 ) zExp
= 0;
4338 return packFloatx80( zSign
, zExp
, zSig0
);
4340 switch (roundingMode
) {
4341 case float_round_nearest_even
:
4342 case float_round_ties_away
:
4343 increment
= ((int64_t)zSig1
< 0);
4345 case float_round_to_zero
:
4348 case float_round_up
:
4349 increment
= !zSign
&& zSig1
;
4351 case float_round_down
:
4352 increment
= zSign
&& zSig1
;
4357 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4358 if ( ( 0x7FFE < zExp
)
4359 || ( ( zExp
== 0x7FFE )
4360 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4366 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4367 if ( ( roundingMode
== float_round_to_zero
)
4368 || ( zSign
&& ( roundingMode
== float_round_up
) )
4369 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4371 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4373 return packFloatx80(zSign
,
4374 floatx80_infinity_high
,
4375 floatx80_infinity_low
);
4378 isTiny
= status
->tininess_before_rounding
4381 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4382 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4384 if (isTiny
&& zSig1
) {
4385 float_raise(float_flag_underflow
, status
);
4388 float_raise(float_flag_inexact
, status
);
4390 switch (roundingMode
) {
4391 case float_round_nearest_even
:
4392 case float_round_ties_away
:
4393 increment
= ((int64_t)zSig1
< 0);
4395 case float_round_to_zero
:
4398 case float_round_up
:
4399 increment
= !zSign
&& zSig1
;
4401 case float_round_down
:
4402 increment
= zSign
&& zSig1
;
4409 if (!(zSig1
<< 1) && roundNearestEven
) {
4412 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4414 return packFloatx80( zSign
, zExp
, zSig0
);
4418 float_raise(float_flag_inexact
, status
);
4424 zSig0
= UINT64_C(0x8000000000000000);
4427 if (!(zSig1
<< 1) && roundNearestEven
) {
4433 if ( zSig0
== 0 ) zExp
= 0;
4435 return packFloatx80( zSign
, zExp
, zSig0
);
4439 /*----------------------------------------------------------------------------
4440 | Takes an abstract floating-point value having sign `zSign', exponent
4441 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4442 | and returns the proper extended double-precision floating-point value
4443 | corresponding to the abstract input. This routine is just like
4444 | `roundAndPackFloatx80' except that the input significand does not have to be
4446 *----------------------------------------------------------------------------*/
4448 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4449 bool zSign
, int32_t zExp
,
4450 uint64_t zSig0
, uint64_t zSig1
,
4451 float_status
*status
)
4460 shiftCount
= clz64(zSig0
);
4461 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4463 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4464 zSig0
, zSig1
, status
);
4468 /*----------------------------------------------------------------------------
4469 | Returns the least-significant 64 fraction bits of the quadruple-precision
4470 | floating-point value `a'.
4471 *----------------------------------------------------------------------------*/
4473 static inline uint64_t extractFloat128Frac1( float128 a
)
4480 /*----------------------------------------------------------------------------
4481 | Returns the most-significant 48 fraction bits of the quadruple-precision
4482 | floating-point value `a'.
4483 *----------------------------------------------------------------------------*/
4485 static inline uint64_t extractFloat128Frac0( float128 a
)
4488 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4492 /*----------------------------------------------------------------------------
4493 | Returns the exponent bits of the quadruple-precision floating-point value
4495 *----------------------------------------------------------------------------*/
4497 static inline int32_t extractFloat128Exp( float128 a
)
4500 return ( a
.high
>>48 ) & 0x7FFF;
4504 /*----------------------------------------------------------------------------
4505 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4506 *----------------------------------------------------------------------------*/
4508 static inline bool extractFloat128Sign(float128 a
)
4510 return a
.high
>> 63;
4513 /*----------------------------------------------------------------------------
4514 | Normalizes the subnormal quadruple-precision floating-point value
4515 | represented by the denormalized significand formed by the concatenation of
4516 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4517 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4518 | significand are stored at the location pointed to by `zSig0Ptr', and the
4519 | least significant 64 bits of the normalized significand are stored at the
4520 | location pointed to by `zSig1Ptr'.
4521 *----------------------------------------------------------------------------*/
4524 normalizeFloat128Subnormal(
4535 shiftCount
= clz64(aSig1
) - 15;
4536 if ( shiftCount
< 0 ) {
4537 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4538 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4541 *zSig0Ptr
= aSig1
<<shiftCount
;
4544 *zExpPtr
= - shiftCount
- 63;
4547 shiftCount
= clz64(aSig0
) - 15;
4548 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4549 *zExpPtr
= 1 - shiftCount
;
4554 /*----------------------------------------------------------------------------
4555 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4556 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4557 | floating-point value, returning the result. After being shifted into the
4558 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4559 | added together to form the most significant 32 bits of the result. This
4560 | means that any integer portion of `zSig0' will be added into the exponent.
4561 | Since a properly normalized significand will have an integer portion equal
4562 | to 1, the `zExp' input should be 1 less than the desired result exponent
4563 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4565 *----------------------------------------------------------------------------*/
4567 static inline float128
4568 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4573 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4577 /*----------------------------------------------------------------------------
4578 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4579 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4580 | and `zSig2', and returns the proper quadruple-precision floating-point value
4581 | corresponding to the abstract input. Ordinarily, the abstract value is
4582 | simply rounded and packed into the quadruple-precision format, with the
4583 | inexact exception raised if the abstract input cannot be represented
4584 | exactly. However, if the abstract value is too large, the overflow and
4585 | inexact exceptions are raised and an infinity or maximal finite value is
4586 | returned. If the abstract value is too small, the input value is rounded to
4587 | a subnormal number, and the underflow and inexact exceptions are raised if
4588 | the abstract input cannot be represented exactly as a subnormal quadruple-
4589 | precision floating-point number.
4590 | The input significand must be normalized or smaller. If the input
4591 | significand is not normalized, `zExp' must be 0; in that case, the result
4592 | returned is a subnormal number, and it must not require rounding. In the
4593 | usual case that the input significand is normalized, `zExp' must be 1 less
4594 | than the ``true'' floating-point exponent. The handling of underflow and
4595 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4598 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4599 uint64_t zSig0
, uint64_t zSig1
,
4600 uint64_t zSig2
, float_status
*status
)
4602 int8_t roundingMode
;
4603 bool roundNearestEven
, increment
, isTiny
;
4605 roundingMode
= status
->float_rounding_mode
;
4606 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4607 switch (roundingMode
) {
4608 case float_round_nearest_even
:
4609 case float_round_ties_away
:
4610 increment
= ((int64_t)zSig2
< 0);
4612 case float_round_to_zero
:
4615 case float_round_up
:
4616 increment
= !zSign
&& zSig2
;
4618 case float_round_down
:
4619 increment
= zSign
&& zSig2
;
4621 case float_round_to_odd
:
4622 increment
= !(zSig1
& 0x1) && zSig2
;
4627 if ( 0x7FFD <= (uint32_t) zExp
) {
4628 if ( ( 0x7FFD < zExp
)
4629 || ( ( zExp
== 0x7FFD )
4631 UINT64_C(0x0001FFFFFFFFFFFF),
4632 UINT64_C(0xFFFFFFFFFFFFFFFF),
4639 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4640 if ( ( roundingMode
== float_round_to_zero
)
4641 || ( zSign
&& ( roundingMode
== float_round_up
) )
4642 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4643 || (roundingMode
== float_round_to_odd
)
4649 UINT64_C(0x0000FFFFFFFFFFFF),
4650 UINT64_C(0xFFFFFFFFFFFFFFFF)
4653 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4656 if (status
->flush_to_zero
) {
4657 float_raise(float_flag_output_denormal
, status
);
4658 return packFloat128(zSign
, 0, 0, 0);
4660 isTiny
= status
->tininess_before_rounding
4663 || lt128(zSig0
, zSig1
,
4664 UINT64_C(0x0001FFFFFFFFFFFF),
4665 UINT64_C(0xFFFFFFFFFFFFFFFF));
4666 shift128ExtraRightJamming(
4667 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4669 if (isTiny
&& zSig2
) {
4670 float_raise(float_flag_underflow
, status
);
4672 switch (roundingMode
) {
4673 case float_round_nearest_even
:
4674 case float_round_ties_away
:
4675 increment
= ((int64_t)zSig2
< 0);
4677 case float_round_to_zero
:
4680 case float_round_up
:
4681 increment
= !zSign
&& zSig2
;
4683 case float_round_down
:
4684 increment
= zSign
&& zSig2
;
4686 case float_round_to_odd
:
4687 increment
= !(zSig1
& 0x1) && zSig2
;
4695 float_raise(float_flag_inexact
, status
);
4698 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4699 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4704 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4706 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4710 /*----------------------------------------------------------------------------
4711 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4712 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4713 | returns the proper quadruple-precision floating-point value corresponding
4714 | to the abstract input. This routine is just like `roundAndPackFloat128'
4715 | except that the input significand has fewer bits and does not have to be
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent. The handling of underflow and overflow follows the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
4720 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4721 uint64_t zSig0
, uint64_t zSig1
,
4722 float_status
*status
)
4732 shiftCount
= clz64(zSig0
) - 15;
4733 if ( 0 <= shiftCount
) {
4735 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4738 shift128ExtraRightJamming(
4739 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4742 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of converting the 32-bit two's complement integer `a'
4749 | to the extended double-precision floating-point format. The conversion
4750 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4752 *----------------------------------------------------------------------------*/
4754 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4761 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4763 absA
= zSign
? - a
: a
;
4764 shiftCount
= clz32(absA
) + 32;
4766 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the 32-bit two's complement integer `a' to
4772 | the quadruple-precision floating-point format. The conversion is performed
4773 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4774 *----------------------------------------------------------------------------*/
4776 float128
int32_to_float128(int32_t a
, float_status
*status
)
4783 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4785 absA
= zSign
? - a
: a
;
4786 shiftCount
= clz32(absA
) + 17;
4788 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of converting the 64-bit two's complement integer `a'
4794 | to the extended double-precision floating-point format. The conversion
4795 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4797 *----------------------------------------------------------------------------*/
4799 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4805 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4807 absA
= zSign
? - a
: a
;
4808 shiftCount
= clz64(absA
);
4809 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the 64-bit two's complement integer `a' to
4815 | the quadruple-precision floating-point format. The conversion is performed
4816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4819 float128
int64_to_float128(int64_t a
, float_status
*status
)
4825 uint64_t zSig0
, zSig1
;
4827 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4829 absA
= zSign
? - a
: a
;
4830 shiftCount
= clz64(absA
) + 49;
4831 zExp
= 0x406E - shiftCount
;
4832 if ( 64 <= shiftCount
) {
4841 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4842 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4846 /*----------------------------------------------------------------------------
4847 | Returns the result of converting the 64-bit unsigned integer `a'
4848 | to the quadruple-precision floating-point format. The conversion is performed
4849 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4850 *----------------------------------------------------------------------------*/
4852 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4855 return float128_zero
;
4857 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4860 /*----------------------------------------------------------------------------
4861 | Returns the result of converting the single-precision floating-point value
4862 | `a' to the extended double-precision floating-point format. The conversion
4863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4865 *----------------------------------------------------------------------------*/
4867 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4873 a
= float32_squash_input_denormal(a
, status
);
4874 aSig
= extractFloat32Frac( a
);
4875 aExp
= extractFloat32Exp( a
);
4876 aSign
= extractFloat32Sign( a
);
4877 if ( aExp
== 0xFF ) {
4879 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4881 return floatx80_silence_nan(res
, status
);
4883 return packFloatx80(aSign
,
4884 floatx80_infinity_high
,
4885 floatx80_infinity_low
);
4888 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4889 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4892 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the single-precision floating-point value
4898 | `a' to the double-precision floating-point format. The conversion is
4899 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4901 *----------------------------------------------------------------------------*/
4903 float128
float32_to_float128(float32 a
, float_status
*status
)
4909 a
= float32_squash_input_denormal(a
, status
);
4910 aSig
= extractFloat32Frac( a
);
4911 aExp
= extractFloat32Exp( a
);
4912 aSign
= extractFloat32Sign( a
);
4913 if ( aExp
== 0xFF ) {
4915 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4917 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4920 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4921 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4924 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4928 /*----------------------------------------------------------------------------
4929 | Returns the remainder of the single-precision floating-point value `a'
4930 | with respect to the corresponding value `b'. The operation is performed
4931 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4934 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4937 int aExp
, bExp
, expDiff
;
4938 uint32_t aSig
, bSig
;
4940 uint64_t aSig64
, bSig64
, q64
;
4941 uint32_t alternateASig
;
4943 a
= float32_squash_input_denormal(a
, status
);
4944 b
= float32_squash_input_denormal(b
, status
);
4946 aSig
= extractFloat32Frac( a
);
4947 aExp
= extractFloat32Exp( a
);
4948 aSign
= extractFloat32Sign( a
);
4949 bSig
= extractFloat32Frac( b
);
4950 bExp
= extractFloat32Exp( b
);
4951 if ( aExp
== 0xFF ) {
4952 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4953 return propagateFloat32NaN(a
, b
, status
);
4955 float_raise(float_flag_invalid
, status
);
4956 return float32_default_nan(status
);
4958 if ( bExp
== 0xFF ) {
4960 return propagateFloat32NaN(a
, b
, status
);
4966 float_raise(float_flag_invalid
, status
);
4967 return float32_default_nan(status
);
4969 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4972 if ( aSig
== 0 ) return a
;
4973 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4975 expDiff
= aExp
- bExp
;
4978 if ( expDiff
< 32 ) {
4981 if ( expDiff
< 0 ) {
4982 if ( expDiff
< -1 ) return a
;
4985 q
= ( bSig
<= aSig
);
4986 if ( q
) aSig
-= bSig
;
4987 if ( 0 < expDiff
) {
4988 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4991 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4999 if ( bSig
<= aSig
) aSig
-= bSig
;
5000 aSig64
= ( (uint64_t) aSig
)<<40;
5001 bSig64
= ( (uint64_t) bSig
)<<40;
5003 while ( 0 < expDiff
) {
5004 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5005 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5006 aSig64
= - ( ( bSig
* q64
)<<38 );
5010 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5011 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5012 q
= q64
>>( 64 - expDiff
);
5014 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5017 alternateASig
= aSig
;
5020 } while ( 0 <= (int32_t) aSig
);
5021 sigMean
= aSig
+ alternateASig
;
5022 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5023 aSig
= alternateASig
;
5025 zSign
= ( (int32_t) aSig
< 0 );
5026 if ( zSign
) aSig
= - aSig
;
5027 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5032 /*----------------------------------------------------------------------------
5033 | Returns the binary exponential of the single-precision floating-point value
5034 | `a'. The operation is performed according to the IEC/IEEE Standard for
5035 | Binary Floating-Point Arithmetic.
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
5050 static const float64 float32_exp2_coefficients
[15] =
5052 const_float64( 0x3ff0000000000000ll
), /* 1 */
5053 const_float64( 0x3fe0000000000000ll
), /* 2 */
5054 const_float64( 0x3fc5555555555555ll
), /* 3 */
5055 const_float64( 0x3fa5555555555555ll
), /* 4 */
5056 const_float64( 0x3f81111111111111ll
), /* 5 */
5057 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5058 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5059 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5060 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5061 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5062 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5063 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5064 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5065 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5066 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5069 float32
float32_exp2(float32 a
, float_status
*status
)
5076 a
= float32_squash_input_denormal(a
, status
);
5078 aSig
= extractFloat32Frac( a
);
5079 aExp
= extractFloat32Exp( a
);
5080 aSign
= extractFloat32Sign( a
);
5082 if ( aExp
== 0xFF) {
5084 return propagateFloat32NaN(a
, float32_zero
, status
);
5086 return (aSign
) ? float32_zero
: a
;
5089 if (aSig
== 0) return float32_one
;
5092 float_raise(float_flag_inexact
, status
);
5094 /* ******************************* */
5095 /* using float64 for approximation */
5096 /* ******************************* */
5097 x
= float32_to_float64(a
, status
);
5098 x
= float64_mul(x
, float64_ln2
, status
);
5102 for (i
= 0 ; i
< 15 ; i
++) {
5105 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5106 r
= float64_add(r
, f
, status
);
5108 xn
= float64_mul(xn
, x
, status
);
5111 return float64_to_float32(r
, status
);
5114 /*----------------------------------------------------------------------------
5115 | Returns the binary log of the single-precision floating-point value `a'.
5116 | The operation is performed according to the IEC/IEEE Standard for Binary
5117 | Floating-Point Arithmetic.
5118 *----------------------------------------------------------------------------*/
5119 float32
float32_log2(float32 a
, float_status
*status
)
5123 uint32_t aSig
, zSig
, i
;
5125 a
= float32_squash_input_denormal(a
, status
);
5126 aSig
= extractFloat32Frac( a
);
5127 aExp
= extractFloat32Exp( a
);
5128 aSign
= extractFloat32Sign( a
);
5131 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5132 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5135 float_raise(float_flag_invalid
, status
);
5136 return float32_default_nan(status
);
5138 if ( aExp
== 0xFF ) {
5140 return propagateFloat32NaN(a
, float32_zero
, status
);
5150 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5151 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5152 if ( aSig
& 0x01000000 ) {
5161 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5164 /*----------------------------------------------------------------------------
5165 | Returns the result of converting the double-precision floating-point value
5166 | `a' to the extended double-precision floating-point format. The conversion
5167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5169 *----------------------------------------------------------------------------*/
5171 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5177 a
= float64_squash_input_denormal(a
, status
);
5178 aSig
= extractFloat64Frac( a
);
5179 aExp
= extractFloat64Exp( a
);
5180 aSign
= extractFloat64Sign( a
);
5181 if ( aExp
== 0x7FF ) {
5183 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5185 return floatx80_silence_nan(res
, status
);
5187 return packFloatx80(aSign
,
5188 floatx80_infinity_high
,
5189 floatx80_infinity_low
);
5192 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5193 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5197 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5201 /*----------------------------------------------------------------------------
5202 | Returns the result of converting the double-precision floating-point value
5203 | `a' to the quadruple-precision floating-point format. The conversion is
5204 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5206 *----------------------------------------------------------------------------*/
5208 float128
float64_to_float128(float64 a
, float_status
*status
)
5212 uint64_t aSig
, zSig0
, zSig1
;
5214 a
= float64_squash_input_denormal(a
, status
);
5215 aSig
= extractFloat64Frac( a
);
5216 aExp
= extractFloat64Exp( a
);
5217 aSign
= extractFloat64Sign( a
);
5218 if ( aExp
== 0x7FF ) {
5220 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5222 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5225 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5226 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5229 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5230 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5235 /*----------------------------------------------------------------------------
5236 | Returns the remainder of the double-precision floating-point value `a'
5237 | with respect to the corresponding value `b'. The operation is performed
5238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5239 *----------------------------------------------------------------------------*/
5241 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5244 int aExp
, bExp
, expDiff
;
5245 uint64_t aSig
, bSig
;
5246 uint64_t q
, alternateASig
;
5249 a
= float64_squash_input_denormal(a
, status
);
5250 b
= float64_squash_input_denormal(b
, status
);
5251 aSig
= extractFloat64Frac( a
);
5252 aExp
= extractFloat64Exp( a
);
5253 aSign
= extractFloat64Sign( a
);
5254 bSig
= extractFloat64Frac( b
);
5255 bExp
= extractFloat64Exp( b
);
5256 if ( aExp
== 0x7FF ) {
5257 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5258 return propagateFloat64NaN(a
, b
, status
);
5260 float_raise(float_flag_invalid
, status
);
5261 return float64_default_nan(status
);
5263 if ( bExp
== 0x7FF ) {
5265 return propagateFloat64NaN(a
, b
, status
);
5271 float_raise(float_flag_invalid
, status
);
5272 return float64_default_nan(status
);
5274 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5277 if ( aSig
== 0 ) return a
;
5278 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5280 expDiff
= aExp
- bExp
;
5281 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5282 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5283 if ( expDiff
< 0 ) {
5284 if ( expDiff
< -1 ) return a
;
5287 q
= ( bSig
<= aSig
);
5288 if ( q
) aSig
-= bSig
;
5290 while ( 0 < expDiff
) {
5291 q
= estimateDiv128To64( aSig
, 0, bSig
);
5292 q
= ( 2 < q
) ? q
- 2 : 0;
5293 aSig
= - ( ( bSig
>>2 ) * q
);
5297 if ( 0 < expDiff
) {
5298 q
= estimateDiv128To64( aSig
, 0, bSig
);
5299 q
= ( 2 < q
) ? q
- 2 : 0;
5302 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5309 alternateASig
= aSig
;
5312 } while ( 0 <= (int64_t) aSig
);
5313 sigMean
= aSig
+ alternateASig
;
5314 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5315 aSig
= alternateASig
;
5317 zSign
= ( (int64_t) aSig
< 0 );
5318 if ( zSign
) aSig
= - aSig
;
5319 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5323 /*----------------------------------------------------------------------------
5324 | Returns the binary log of the double-precision floating-point value `a'.
5325 | The operation is performed according to the IEC/IEEE Standard for Binary
5326 | Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328 float64
float64_log2(float64 a
, float_status
*status
)
5332 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5333 a
= float64_squash_input_denormal(a
, status
);
5335 aSig
= extractFloat64Frac( a
);
5336 aExp
= extractFloat64Exp( a
);
5337 aSign
= extractFloat64Sign( a
);
5340 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5341 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5344 float_raise(float_flag_invalid
, status
);
5345 return float64_default_nan(status
);
5347 if ( aExp
== 0x7FF ) {
5349 return propagateFloat64NaN(a
, float64_zero
, status
);
5355 aSig
|= UINT64_C(0x0010000000000000);
5357 zSig
= (uint64_t)aExp
<< 52;
5358 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5359 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5360 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5361 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5369 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5372 /*----------------------------------------------------------------------------
5373 | Returns the result of converting the extended double-precision floating-
5374 | point value `a' to the 32-bit two's complement integer format. The
5375 | conversion is performed according to the IEC/IEEE Standard for Binary
5376 | Floating-Point Arithmetic---which means in particular that the conversion
5377 | is rounded according to the current rounding mode. If `a' is a NaN, the
5378 | largest positive integer is returned. Otherwise, if the conversion
5379 | overflows, the largest integer with the same sign as `a' is returned.
5380 *----------------------------------------------------------------------------*/
5382 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5385 int32_t aExp
, shiftCount
;
5388 if (floatx80_invalid_encoding(a
)) {
5389 float_raise(float_flag_invalid
, status
);
5392 aSig
= extractFloatx80Frac( a
);
5393 aExp
= extractFloatx80Exp( a
);
5394 aSign
= extractFloatx80Sign( a
);
5395 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5396 shiftCount
= 0x4037 - aExp
;
5397 if ( shiftCount
<= 0 ) shiftCount
= 1;
5398 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5399 return roundAndPackInt32(aSign
, aSig
, status
);
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of converting the extended double-precision floating-
5405 | point value `a' to the 32-bit two's complement integer format. The
5406 | conversion is performed according to the IEC/IEEE Standard for Binary
5407 | Floating-Point Arithmetic, except that the conversion is always rounded
5408 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5409 | Otherwise, if the conversion overflows, the largest integer with the same
5410 | sign as `a' is returned.
5411 *----------------------------------------------------------------------------*/
5413 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5416 int32_t aExp
, shiftCount
;
5417 uint64_t aSig
, savedASig
;
5420 if (floatx80_invalid_encoding(a
)) {
5421 float_raise(float_flag_invalid
, status
);
5424 aSig
= extractFloatx80Frac( a
);
5425 aExp
= extractFloatx80Exp( a
);
5426 aSign
= extractFloatx80Sign( a
);
5427 if ( 0x401E < aExp
) {
5428 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5431 else if ( aExp
< 0x3FFF ) {
5433 float_raise(float_flag_inexact
, status
);
5437 shiftCount
= 0x403E - aExp
;
5439 aSig
>>= shiftCount
;
5441 if ( aSign
) z
= - z
;
5442 if ( ( z
< 0 ) ^ aSign
) {
5444 float_raise(float_flag_invalid
, status
);
5445 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5447 if ( ( aSig
<<shiftCount
) != savedASig
) {
5448 float_raise(float_flag_inexact
, status
);
5454 /*----------------------------------------------------------------------------
5455 | Returns the result of converting the extended double-precision floating-
5456 | point value `a' to the 64-bit two's complement integer format. The
5457 | conversion is performed according to the IEC/IEEE Standard for Binary
5458 | Floating-Point Arithmetic---which means in particular that the conversion
5459 | is rounded according to the current rounding mode. If `a' is a NaN,
5460 | the largest positive integer is returned. Otherwise, if the conversion
5461 | overflows, the largest integer with the same sign as `a' is returned.
5462 *----------------------------------------------------------------------------*/
5464 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5467 int32_t aExp
, shiftCount
;
5468 uint64_t aSig
, aSigExtra
;
5470 if (floatx80_invalid_encoding(a
)) {
5471 float_raise(float_flag_invalid
, status
);
5474 aSig
= extractFloatx80Frac( a
);
5475 aExp
= extractFloatx80Exp( a
);
5476 aSign
= extractFloatx80Sign( a
);
5477 shiftCount
= 0x403E - aExp
;
5478 if ( shiftCount
<= 0 ) {
5480 float_raise(float_flag_invalid
, status
);
5481 if (!aSign
|| floatx80_is_any_nan(a
)) {
5489 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5491 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of converting the extended double-precision floating-
5497 | point value `a' to the 64-bit two's complement integer format. The
5498 | conversion is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic, except that the conversion is always rounded
5500 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5501 | Otherwise, if the conversion overflows, the largest integer with the same
5502 | sign as `a' is returned.
5503 *----------------------------------------------------------------------------*/
5505 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5508 int32_t aExp
, shiftCount
;
5512 if (floatx80_invalid_encoding(a
)) {
5513 float_raise(float_flag_invalid
, status
);
5516 aSig
= extractFloatx80Frac( a
);
5517 aExp
= extractFloatx80Exp( a
);
5518 aSign
= extractFloatx80Sign( a
);
5519 shiftCount
= aExp
- 0x403E;
5520 if ( 0 <= shiftCount
) {
5521 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5522 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5523 float_raise(float_flag_invalid
, status
);
5524 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5530 else if ( aExp
< 0x3FFF ) {
5532 float_raise(float_flag_inexact
, status
);
5536 z
= aSig
>>( - shiftCount
);
5537 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5538 float_raise(float_flag_inexact
, status
);
5540 if ( aSign
) z
= - z
;
5545 /*----------------------------------------------------------------------------
5546 | Returns the result of converting the extended double-precision floating-
5547 | point value `a' to the single-precision floating-point format. The
5548 | conversion is performed according to the IEC/IEEE Standard for Binary
5549 | Floating-Point Arithmetic.
5550 *----------------------------------------------------------------------------*/
5552 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5558 if (floatx80_invalid_encoding(a
)) {
5559 float_raise(float_flag_invalid
, status
);
5560 return float32_default_nan(status
);
5562 aSig
= extractFloatx80Frac( a
);
5563 aExp
= extractFloatx80Exp( a
);
5564 aSign
= extractFloatx80Sign( a
);
5565 if ( aExp
== 0x7FFF ) {
5566 if ( (uint64_t) ( aSig
<<1 ) ) {
5567 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5569 return float32_silence_nan(res
, status
);
5571 return packFloat32( aSign
, 0xFF, 0 );
5573 shift64RightJamming( aSig
, 33, &aSig
);
5574 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5575 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5579 /*----------------------------------------------------------------------------
5580 | Returns the result of converting the extended double-precision floating-
5581 | point value `a' to the double-precision floating-point format. The
5582 | conversion is performed according to the IEC/IEEE Standard for Binary
5583 | Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5586 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5590 uint64_t aSig
, zSig
;
5592 if (floatx80_invalid_encoding(a
)) {
5593 float_raise(float_flag_invalid
, status
);
5594 return float64_default_nan(status
);
5596 aSig
= extractFloatx80Frac( a
);
5597 aExp
= extractFloatx80Exp( a
);
5598 aSign
= extractFloatx80Sign( a
);
5599 if ( aExp
== 0x7FFF ) {
5600 if ( (uint64_t) ( aSig
<<1 ) ) {
5601 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5603 return float64_silence_nan(res
, status
);
5605 return packFloat64( aSign
, 0x7FF, 0 );
5607 shift64RightJamming( aSig
, 1, &zSig
);
5608 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5609 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the extended double-precision floating-
5615 | point value `a' to the quadruple-precision floating-point format. The
5616 | conversion is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5620 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5624 uint64_t aSig
, zSig0
, zSig1
;
5626 if (floatx80_invalid_encoding(a
)) {
5627 float_raise(float_flag_invalid
, status
);
5628 return float128_default_nan(status
);
5630 aSig
= extractFloatx80Frac( a
);
5631 aExp
= extractFloatx80Exp( a
);
5632 aSign
= extractFloatx80Sign( a
);
5633 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5634 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5636 return float128_silence_nan(res
, status
);
5638 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5639 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5651 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5653 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5654 extractFloatx80Sign(a
),
5655 extractFloatx80Exp(a
),
5656 extractFloatx80Frac(a
), 0, status
);
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended quadruple-precision floating-point
5662 | value. The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5666 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5670 uint64_t lastBitMask
, roundBitsMask
;
5673 if (floatx80_invalid_encoding(a
)) {
5674 float_raise(float_flag_invalid
, status
);
5675 return floatx80_default_nan(status
);
5677 aExp
= extractFloatx80Exp( a
);
5678 if ( 0x403E <= aExp
) {
5679 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5680 return propagateFloatx80NaN(a
, a
, status
);
5684 if ( aExp
< 0x3FFF ) {
5686 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5689 float_raise(float_flag_inexact
, status
);
5690 aSign
= extractFloatx80Sign( a
);
5691 switch (status
->float_rounding_mode
) {
5692 case float_round_nearest_even
:
5693 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5696 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5699 case float_round_ties_away
:
5700 if (aExp
== 0x3FFE) {
5701 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5704 case float_round_down
:
5707 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5708 : packFloatx80( 0, 0, 0 );
5709 case float_round_up
:
5711 aSign
? packFloatx80( 1, 0, 0 )
5712 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5714 case float_round_to_zero
:
5717 g_assert_not_reached();
5719 return packFloatx80( aSign
, 0, 0 );
5722 lastBitMask
<<= 0x403E - aExp
;
5723 roundBitsMask
= lastBitMask
- 1;
5725 switch (status
->float_rounding_mode
) {
5726 case float_round_nearest_even
:
5727 z
.low
+= lastBitMask
>>1;
5728 if ((z
.low
& roundBitsMask
) == 0) {
5729 z
.low
&= ~lastBitMask
;
5732 case float_round_ties_away
:
5733 z
.low
+= lastBitMask
>> 1;
5735 case float_round_to_zero
:
5737 case float_round_up
:
5738 if (!extractFloatx80Sign(z
)) {
5739 z
.low
+= roundBitsMask
;
5742 case float_round_down
:
5743 if (extractFloatx80Sign(z
)) {
5744 z
.low
+= roundBitsMask
;
5750 z
.low
&= ~ roundBitsMask
;
5753 z
.low
= UINT64_C(0x8000000000000000);
5755 if (z
.low
!= a
.low
) {
5756 float_raise(float_flag_inexact
, status
);
5762 /*----------------------------------------------------------------------------
5763 | Returns the result of adding the absolute values of the extended double-
5764 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5765 | negated before being returned. `zSign' is ignored if the result is a NaN.
5766 | The addition is performed according to the IEC/IEEE Standard for Binary
5767 | Floating-Point Arithmetic.
5768 *----------------------------------------------------------------------------*/
5770 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5771 float_status
*status
)
5773 int32_t aExp
, bExp
, zExp
;
5774 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5777 aSig
= extractFloatx80Frac( a
);
5778 aExp
= extractFloatx80Exp( a
);
5779 bSig
= extractFloatx80Frac( b
);
5780 bExp
= extractFloatx80Exp( b
);
5781 expDiff
= aExp
- bExp
;
5782 if ( 0 < expDiff
) {
5783 if ( aExp
== 0x7FFF ) {
5784 if ((uint64_t)(aSig
<< 1)) {
5785 return propagateFloatx80NaN(a
, b
, status
);
5789 if ( bExp
== 0 ) --expDiff
;
5790 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5793 else if ( expDiff
< 0 ) {
5794 if ( bExp
== 0x7FFF ) {
5795 if ((uint64_t)(bSig
<< 1)) {
5796 return propagateFloatx80NaN(a
, b
, status
);
5798 return packFloatx80(zSign
,
5799 floatx80_infinity_high
,
5800 floatx80_infinity_low
);
5802 if ( aExp
== 0 ) ++expDiff
;
5803 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5807 if ( aExp
== 0x7FFF ) {
5808 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5809 return propagateFloatx80NaN(a
, b
, status
);
5814 zSig0
= aSig
+ bSig
;
5816 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5817 /* At least one of the values is a pseudo-denormal,
5818 * and there is a carry out of the result. */
5823 return packFloatx80(zSign
, 0, 0);
5825 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5831 zSig0
= aSig
+ bSig
;
5832 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5834 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5835 zSig0
|= UINT64_C(0x8000000000000000);
5838 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5839 zSign
, zExp
, zSig0
, zSig1
, status
);
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of subtracting the absolute values of the extended
5844 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5845 | difference is negated before being returned. `zSign' is ignored if the
5846 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5847 | Standard for Binary Floating-Point Arithmetic.
5848 *----------------------------------------------------------------------------*/
5850 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5851 float_status
*status
)
5853 int32_t aExp
, bExp
, zExp
;
5854 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5857 aSig
= extractFloatx80Frac( a
);
5858 aExp
= extractFloatx80Exp( a
);
5859 bSig
= extractFloatx80Frac( b
);
5860 bExp
= extractFloatx80Exp( b
);
5861 expDiff
= aExp
- bExp
;
5862 if ( 0 < expDiff
) goto aExpBigger
;
5863 if ( expDiff
< 0 ) goto bExpBigger
;
5864 if ( aExp
== 0x7FFF ) {
5865 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5866 return propagateFloatx80NaN(a
, b
, status
);
5868 float_raise(float_flag_invalid
, status
);
5869 return floatx80_default_nan(status
);
5876 if ( bSig
< aSig
) goto aBigger
;
5877 if ( aSig
< bSig
) goto bBigger
;
5878 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5880 if ( bExp
== 0x7FFF ) {
5881 if ((uint64_t)(bSig
<< 1)) {
5882 return propagateFloatx80NaN(a
, b
, status
);
5884 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5885 floatx80_infinity_low
);
5887 if ( aExp
== 0 ) ++expDiff
;
5888 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5890 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5893 goto normalizeRoundAndPack
;
5895 if ( aExp
== 0x7FFF ) {
5896 if ((uint64_t)(aSig
<< 1)) {
5897 return propagateFloatx80NaN(a
, b
, status
);
5901 if ( bExp
== 0 ) --expDiff
;
5902 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5904 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5906 normalizeRoundAndPack
:
5907 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5908 zSign
, zExp
, zSig0
, zSig1
, status
);
5911 /*----------------------------------------------------------------------------
5912 | Returns the result of adding the extended double-precision floating-point
5913 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5914 | Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5917 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5921 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5922 float_raise(float_flag_invalid
, status
);
5923 return floatx80_default_nan(status
);
5925 aSign
= extractFloatx80Sign( a
);
5926 bSign
= extractFloatx80Sign( b
);
5927 if ( aSign
== bSign
) {
5928 return addFloatx80Sigs(a
, b
, aSign
, status
);
5931 return subFloatx80Sigs(a
, b
, aSign
, status
);
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of subtracting the extended double-precision floating-
5938 | point values `a' and `b'. The operation is performed according to the
5939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5942 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5946 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5947 float_raise(float_flag_invalid
, status
);
5948 return floatx80_default_nan(status
);
5950 aSign
= extractFloatx80Sign( a
);
5951 bSign
= extractFloatx80Sign( b
);
5952 if ( aSign
== bSign
) {
5953 return subFloatx80Sigs(a
, b
, aSign
, status
);
5956 return addFloatx80Sigs(a
, b
, aSign
, status
);
5961 /*----------------------------------------------------------------------------
5962 | Returns the result of multiplying the extended double-precision floating-
5963 | point values `a' and `b'. The operation is performed according to the
5964 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5965 *----------------------------------------------------------------------------*/
5967 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5969 bool aSign
, bSign
, zSign
;
5970 int32_t aExp
, bExp
, zExp
;
5971 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5973 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5974 float_raise(float_flag_invalid
, status
);
5975 return floatx80_default_nan(status
);
5977 aSig
= extractFloatx80Frac( a
);
5978 aExp
= extractFloatx80Exp( a
);
5979 aSign
= extractFloatx80Sign( a
);
5980 bSig
= extractFloatx80Frac( b
);
5981 bExp
= extractFloatx80Exp( b
);
5982 bSign
= extractFloatx80Sign( b
);
5983 zSign
= aSign
^ bSign
;
5984 if ( aExp
== 0x7FFF ) {
5985 if ( (uint64_t) ( aSig
<<1 )
5986 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5987 return propagateFloatx80NaN(a
, b
, status
);
5989 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5990 return packFloatx80(zSign
, floatx80_infinity_high
,
5991 floatx80_infinity_low
);
5993 if ( bExp
== 0x7FFF ) {
5994 if ((uint64_t)(bSig
<< 1)) {
5995 return propagateFloatx80NaN(a
, b
, status
);
5997 if ( ( aExp
| aSig
) == 0 ) {
5999 float_raise(float_flag_invalid
, status
);
6000 return floatx80_default_nan(status
);
6002 return packFloatx80(zSign
, floatx80_infinity_high
,
6003 floatx80_infinity_low
);
6006 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6007 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6010 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6011 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6013 zExp
= aExp
+ bExp
- 0x3FFE;
6014 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6015 if ( 0 < (int64_t) zSig0
) {
6016 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6019 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6020 zSign
, zExp
, zSig0
, zSig1
, status
);
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of dividing the extended double-precision floating-point
6025 | value `a' by the corresponding value `b'. The operation is performed
6026 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6027 *----------------------------------------------------------------------------*/
6029 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6031 bool aSign
, bSign
, zSign
;
6032 int32_t aExp
, bExp
, zExp
;
6033 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6034 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6036 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6037 float_raise(float_flag_invalid
, status
);
6038 return floatx80_default_nan(status
);
6040 aSig
= extractFloatx80Frac( a
);
6041 aExp
= extractFloatx80Exp( a
);
6042 aSign
= extractFloatx80Sign( a
);
6043 bSig
= extractFloatx80Frac( b
);
6044 bExp
= extractFloatx80Exp( b
);
6045 bSign
= extractFloatx80Sign( b
);
6046 zSign
= aSign
^ bSign
;
6047 if ( aExp
== 0x7FFF ) {
6048 if ((uint64_t)(aSig
<< 1)) {
6049 return propagateFloatx80NaN(a
, b
, status
);
6051 if ( bExp
== 0x7FFF ) {
6052 if ((uint64_t)(bSig
<< 1)) {
6053 return propagateFloatx80NaN(a
, b
, status
);
6057 return packFloatx80(zSign
, floatx80_infinity_high
,
6058 floatx80_infinity_low
);
6060 if ( bExp
== 0x7FFF ) {
6061 if ((uint64_t)(bSig
<< 1)) {
6062 return propagateFloatx80NaN(a
, b
, status
);
6064 return packFloatx80( zSign
, 0, 0 );
6068 if ( ( aExp
| aSig
) == 0 ) {
6070 float_raise(float_flag_invalid
, status
);
6071 return floatx80_default_nan(status
);
6073 float_raise(float_flag_divbyzero
, status
);
6074 return packFloatx80(zSign
, floatx80_infinity_high
,
6075 floatx80_infinity_low
);
6077 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6080 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6081 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6083 zExp
= aExp
- bExp
+ 0x3FFE;
6085 if ( bSig
<= aSig
) {
6086 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6089 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6090 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6091 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6092 while ( (int64_t) rem0
< 0 ) {
6094 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6096 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6097 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6098 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6099 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6100 while ( (int64_t) rem1
< 0 ) {
6102 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6104 zSig1
|= ( ( rem1
| rem2
) != 0 );
6106 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6107 zSign
, zExp
, zSig0
, zSig1
, status
);
6110 /*----------------------------------------------------------------------------
6111 | Returns the remainder of the extended double-precision floating-point value
6112 | `a' with respect to the corresponding value `b'. The operation is performed
6113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6114 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6115 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6116 | the absolute value of the integer quotient.
6117 *----------------------------------------------------------------------------*/
6119 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6120 float_status
*status
)
6123 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6124 uint64_t aSig0
, aSig1
, bSig
;
6125 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6128 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6129 float_raise(float_flag_invalid
, status
);
6130 return floatx80_default_nan(status
);
6132 aSig0
= extractFloatx80Frac( a
);
6133 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6134 aSign
= extractFloatx80Sign( a
);
6135 bSig
= extractFloatx80Frac( b
);
6136 bExp
= extractFloatx80Exp( b
);
6137 if ( aExp
== 0x7FFF ) {
6138 if ( (uint64_t) ( aSig0
<<1 )
6139 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6140 return propagateFloatx80NaN(a
, b
, status
);
6144 if ( bExp
== 0x7FFF ) {
6145 if ((uint64_t)(bSig
<< 1)) {
6146 return propagateFloatx80NaN(a
, b
, status
);
6148 if (aExp
== 0 && aSig0
>> 63) {
6150 * Pseudo-denormal argument must be returned in normalized
6153 return packFloatx80(aSign
, 1, aSig0
);
6160 float_raise(float_flag_invalid
, status
);
6161 return floatx80_default_nan(status
);
6163 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6166 if ( aSig0
== 0 ) return a
;
6167 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6170 expDiff
= aExp
- bExp
;
6172 if ( expDiff
< 0 ) {
6173 if ( mod
|| expDiff
< -1 ) {
6174 if (aExp
== 1 && aExpOrig
== 0) {
6176 * Pseudo-denormal argument must be returned in
6179 return packFloatx80(aSign
, aExp
, aSig0
);
6183 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6186 *quotient
= q
= ( bSig
<= aSig0
);
6187 if ( q
) aSig0
-= bSig
;
6189 while ( 0 < expDiff
) {
6190 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6191 q
= ( 2 < q
) ? q
- 2 : 0;
6192 mul64To128( bSig
, q
, &term0
, &term1
);
6193 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6194 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6200 if ( 0 < expDiff
) {
6201 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6202 q
= ( 2 < q
) ? q
- 2 : 0;
6204 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6205 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6206 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6207 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6209 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6212 *quotient
<<= expDiff
;
6223 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6224 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6225 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6228 aSig0
= alternateASig0
;
6229 aSig1
= alternateASig1
;
6235 normalizeRoundAndPackFloatx80(
6236 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6240 /*----------------------------------------------------------------------------
6241 | Returns the remainder of the extended double-precision floating-point value
6242 | `a' with respect to the corresponding value `b'. The operation is performed
6243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6244 *----------------------------------------------------------------------------*/
6246 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6249 return floatx80_modrem(a
, b
, false, "ient
, status
);
6252 /*----------------------------------------------------------------------------
6253 | Returns the remainder of the extended double-precision floating-point value
6254 | `a' with respect to the corresponding value `b', with the quotient truncated
6256 *----------------------------------------------------------------------------*/
6258 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6261 return floatx80_modrem(a
, b
, true, "ient
, status
);
6264 /*----------------------------------------------------------------------------
6265 | Returns the square root of the extended double-precision floating-point
6266 | value `a'. The operation is performed according to the IEC/IEEE Standard
6267 | for Binary Floating-Point Arithmetic.
6268 *----------------------------------------------------------------------------*/
6270 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6274 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6275 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6277 if (floatx80_invalid_encoding(a
)) {
6278 float_raise(float_flag_invalid
, status
);
6279 return floatx80_default_nan(status
);
6281 aSig0
= extractFloatx80Frac( a
);
6282 aExp
= extractFloatx80Exp( a
);
6283 aSign
= extractFloatx80Sign( a
);
6284 if ( aExp
== 0x7FFF ) {
6285 if ((uint64_t)(aSig0
<< 1)) {
6286 return propagateFloatx80NaN(a
, a
, status
);
6288 if ( ! aSign
) return a
;
6292 if ( ( aExp
| aSig0
) == 0 ) return a
;
6294 float_raise(float_flag_invalid
, status
);
6295 return floatx80_default_nan(status
);
6298 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6299 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6301 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6302 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6303 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6304 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6305 doubleZSig0
= zSig0
<<1;
6306 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6307 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6308 while ( (int64_t) rem0
< 0 ) {
6311 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6313 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6314 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6315 if ( zSig1
== 0 ) zSig1
= 1;
6316 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6317 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6318 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6319 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6320 while ( (int64_t) rem1
< 0 ) {
6322 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6324 term2
|= doubleZSig0
;
6325 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6327 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6329 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6330 zSig0
|= doubleZSig0
;
6331 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6332 0, zExp
, zSig0
, zSig1
, status
);
6335 /*----------------------------------------------------------------------------
6336 | Returns the result of converting the quadruple-precision floating-point
6337 | value `a' to the 32-bit two's complement integer format. The conversion
6338 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6339 | Arithmetic---which means in particular that the conversion is rounded
6340 | according to the current rounding mode. If `a' is a NaN, the largest
6341 | positive integer is returned. Otherwise, if the conversion overflows, the
6342 | largest integer with the same sign as `a' is returned.
6343 *----------------------------------------------------------------------------*/
6345 int32_t float128_to_int32(float128 a
, float_status
*status
)
6348 int32_t aExp
, shiftCount
;
6349 uint64_t aSig0
, aSig1
;
6351 aSig1
= extractFloat128Frac1( a
);
6352 aSig0
= extractFloat128Frac0( a
);
6353 aExp
= extractFloat128Exp( a
);
6354 aSign
= extractFloat128Sign( a
);
6355 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6356 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6357 aSig0
|= ( aSig1
!= 0 );
6358 shiftCount
= 0x4028 - aExp
;
6359 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6360 return roundAndPackInt32(aSign
, aSig0
, status
);
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of converting the quadruple-precision floating-point
6366 | value `a' to the 32-bit two's complement integer format. The conversion
6367 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6368 | Arithmetic, except that the conversion is always rounded toward zero. If
6369 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6370 | conversion overflows, the largest integer with the same sign as `a' is
6372 *----------------------------------------------------------------------------*/
6374 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6377 int32_t aExp
, shiftCount
;
6378 uint64_t aSig0
, aSig1
, savedASig
;
6381 aSig1
= extractFloat128Frac1( a
);
6382 aSig0
= extractFloat128Frac0( a
);
6383 aExp
= extractFloat128Exp( a
);
6384 aSign
= extractFloat128Sign( a
);
6385 aSig0
|= ( aSig1
!= 0 );
6386 if ( 0x401E < aExp
) {
6387 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6390 else if ( aExp
< 0x3FFF ) {
6391 if (aExp
|| aSig0
) {
6392 float_raise(float_flag_inexact
, status
);
6396 aSig0
|= UINT64_C(0x0001000000000000);
6397 shiftCount
= 0x402F - aExp
;
6399 aSig0
>>= shiftCount
;
6401 if ( aSign
) z
= - z
;
6402 if ( ( z
< 0 ) ^ aSign
) {
6404 float_raise(float_flag_invalid
, status
);
6405 return aSign
? INT32_MIN
: INT32_MAX
;
6407 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6408 float_raise(float_flag_inexact
, status
);
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of converting the quadruple-precision floating-point
6416 | value `a' to the 64-bit two's complement integer format. The conversion
6417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6418 | Arithmetic---which means in particular that the conversion is rounded
6419 | according to the current rounding mode. If `a' is a NaN, the largest
6420 | positive integer is returned. Otherwise, if the conversion overflows, the
6421 | largest integer with the same sign as `a' is returned.
6422 *----------------------------------------------------------------------------*/
6424 int64_t float128_to_int64(float128 a
, float_status
*status
)
6427 int32_t aExp
, shiftCount
;
6428 uint64_t aSig0
, aSig1
;
6430 aSig1
= extractFloat128Frac1( a
);
6431 aSig0
= extractFloat128Frac0( a
);
6432 aExp
= extractFloat128Exp( a
);
6433 aSign
= extractFloat128Sign( a
);
6434 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6435 shiftCount
= 0x402F - aExp
;
6436 if ( shiftCount
<= 0 ) {
6437 if ( 0x403E < aExp
) {
6438 float_raise(float_flag_invalid
, status
);
6440 || ( ( aExp
== 0x7FFF )
6441 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6448 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6451 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6453 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6457 /*----------------------------------------------------------------------------
6458 | Returns the result of converting the quadruple-precision floating-point
6459 | value `a' to the 64-bit two's complement integer format. The conversion
6460 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6461 | Arithmetic, except that the conversion is always rounded toward zero.
6462 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6463 | the conversion overflows, the largest integer with the same sign as `a' is
6465 *----------------------------------------------------------------------------*/
6467 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6470 int32_t aExp
, shiftCount
;
6471 uint64_t aSig0
, aSig1
;
6474 aSig1
= extractFloat128Frac1( a
);
6475 aSig0
= extractFloat128Frac0( a
);
6476 aExp
= extractFloat128Exp( a
);
6477 aSign
= extractFloat128Sign( a
);
6478 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6479 shiftCount
= aExp
- 0x402F;
6480 if ( 0 < shiftCount
) {
6481 if ( 0x403E <= aExp
) {
6482 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6483 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6484 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6486 float_raise(float_flag_inexact
, status
);
6490 float_raise(float_flag_invalid
, status
);
6491 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6497 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6498 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6499 float_raise(float_flag_inexact
, status
);
6503 if ( aExp
< 0x3FFF ) {
6504 if ( aExp
| aSig0
| aSig1
) {
6505 float_raise(float_flag_inexact
, status
);
6509 z
= aSig0
>>( - shiftCount
);
6511 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6512 float_raise(float_flag_inexact
, status
);
6515 if ( aSign
) z
= - z
;
6520 /*----------------------------------------------------------------------------
6521 | Returns the result of converting the quadruple-precision floating-point value
6522 | `a' to the 64-bit unsigned integer format. The conversion is
6523 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6524 | Arithmetic---which means in particular that the conversion is rounded
6525 | according to the current rounding mode. If `a' is a NaN, the largest
6526 | positive integer is returned. If the conversion overflows, the
6527 | largest unsigned integer is returned. If 'a' is negative, the value is
6528 | rounded and zero is returned; negative values that do not round to zero
6529 | will raise the inexact exception.
6530 *----------------------------------------------------------------------------*/
6532 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6537 uint64_t aSig0
, aSig1
;
6539 aSig0
= extractFloat128Frac0(a
);
6540 aSig1
= extractFloat128Frac1(a
);
6541 aExp
= extractFloat128Exp(a
);
6542 aSign
= extractFloat128Sign(a
);
6543 if (aSign
&& (aExp
> 0x3FFE)) {
6544 float_raise(float_flag_invalid
, status
);
6545 if (float128_is_any_nan(a
)) {
6552 aSig0
|= UINT64_C(0x0001000000000000);
6554 shiftCount
= 0x402F - aExp
;
6555 if (shiftCount
<= 0) {
6556 if (0x403E < aExp
) {
6557 float_raise(float_flag_invalid
, status
);
6560 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6562 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6564 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6567 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6570 signed char current_rounding_mode
= status
->float_rounding_mode
;
6572 set_float_rounding_mode(float_round_to_zero
, status
);
6573 v
= float128_to_uint64(a
, status
);
6574 set_float_rounding_mode(current_rounding_mode
, status
);
6579 /*----------------------------------------------------------------------------
6580 | Returns the result of converting the quadruple-precision floating-point
6581 | value `a' to the 32-bit unsigned integer format. The conversion
6582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6583 | Arithmetic except that the conversion is always rounded toward zero.
6584 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6585 | if the conversion overflows, the largest unsigned integer is returned.
6586 | If 'a' is negative, the value is rounded and zero is returned; negative
6587 | values that do not round to zero will raise the inexact exception.
6588 *----------------------------------------------------------------------------*/
6590 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6594 int old_exc_flags
= get_float_exception_flags(status
);
6596 v
= float128_to_uint64_round_to_zero(a
, status
);
6597 if (v
> 0xffffffff) {
6602 set_float_exception_flags(old_exc_flags
, status
);
6603 float_raise(float_flag_invalid
, status
);
6607 /*----------------------------------------------------------------------------
6608 | Returns the result of converting the quadruple-precision floating-point value
6609 | `a' to the 32-bit unsigned integer format. The conversion is
6610 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6611 | Arithmetic---which means in particular that the conversion is rounded
6612 | according to the current rounding mode. If `a' is a NaN, the largest
6613 | positive integer is returned. If the conversion overflows, the
6614 | largest unsigned integer is returned. If 'a' is negative, the value is
6615 | rounded and zero is returned; negative values that do not round to zero
6616 | will raise the inexact exception.
6617 *----------------------------------------------------------------------------*/
6619 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6623 int old_exc_flags
= get_float_exception_flags(status
);
6625 v
= float128_to_uint64(a
, status
);
6626 if (v
> 0xffffffff) {
6631 set_float_exception_flags(old_exc_flags
, status
);
6632 float_raise(float_flag_invalid
, status
);
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of converting the quadruple-precision floating-point
6638 | value `a' to the single-precision floating-point format. The conversion
6639 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6641 *----------------------------------------------------------------------------*/
6643 float32
float128_to_float32(float128 a
, float_status
*status
)
6647 uint64_t aSig0
, aSig1
;
6650 aSig1
= extractFloat128Frac1( a
);
6651 aSig0
= extractFloat128Frac0( a
);
6652 aExp
= extractFloat128Exp( a
);
6653 aSign
= extractFloat128Sign( a
);
6654 if ( aExp
== 0x7FFF ) {
6655 if ( aSig0
| aSig1
) {
6656 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6658 return packFloat32( aSign
, 0xFF, 0 );
6660 aSig0
|= ( aSig1
!= 0 );
6661 shift64RightJamming( aSig0
, 18, &aSig0
);
6663 if ( aExp
|| zSig
) {
6667 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6671 /*----------------------------------------------------------------------------
6672 | Returns the result of converting the quadruple-precision floating-point
6673 | value `a' to the double-precision floating-point format. The conversion
6674 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6676 *----------------------------------------------------------------------------*/
6678 float64
float128_to_float64(float128 a
, float_status
*status
)
6682 uint64_t aSig0
, aSig1
;
6684 aSig1
= extractFloat128Frac1( a
);
6685 aSig0
= extractFloat128Frac0( a
);
6686 aExp
= extractFloat128Exp( a
);
6687 aSign
= extractFloat128Sign( a
);
6688 if ( aExp
== 0x7FFF ) {
6689 if ( aSig0
| aSig1
) {
6690 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6692 return packFloat64( aSign
, 0x7FF, 0 );
6694 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6695 aSig0
|= ( aSig1
!= 0 );
6696 if ( aExp
|| aSig0
) {
6697 aSig0
|= UINT64_C(0x4000000000000000);
6700 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6704 /*----------------------------------------------------------------------------
6705 | Returns the result of converting the quadruple-precision floating-point
6706 | value `a' to the extended double-precision floating-point format. The
6707 | conversion is performed according to the IEC/IEEE Standard for Binary
6708 | Floating-Point Arithmetic.
6709 *----------------------------------------------------------------------------*/
6711 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6715 uint64_t aSig0
, aSig1
;
6717 aSig1
= extractFloat128Frac1( a
);
6718 aSig0
= extractFloat128Frac0( a
);
6719 aExp
= extractFloat128Exp( a
);
6720 aSign
= extractFloat128Sign( a
);
6721 if ( aExp
== 0x7FFF ) {
6722 if ( aSig0
| aSig1
) {
6723 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6725 return floatx80_silence_nan(res
, status
);
6727 return packFloatx80(aSign
, floatx80_infinity_high
,
6728 floatx80_infinity_low
);
6731 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6732 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6735 aSig0
|= UINT64_C(0x0001000000000000);
6737 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6738 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6742 /*----------------------------------------------------------------------------
6743 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6744 | returns the result as a quadruple-precision floating-point value. The
6745 | operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6749 float128
float128_round_to_int(float128 a
, float_status
*status
)
6753 uint64_t lastBitMask
, roundBitsMask
;
6756 aExp
= extractFloat128Exp( a
);
6757 if ( 0x402F <= aExp
) {
6758 if ( 0x406F <= aExp
) {
6759 if ( ( aExp
== 0x7FFF )
6760 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6762 return propagateFloat128NaN(a
, a
, status
);
6767 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6768 roundBitsMask
= lastBitMask
- 1;
6770 switch (status
->float_rounding_mode
) {
6771 case float_round_nearest_even
:
6772 if ( lastBitMask
) {
6773 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6774 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6777 if ( (int64_t) z
.low
< 0 ) {
6779 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6783 case float_round_ties_away
:
6785 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6787 if ((int64_t) z
.low
< 0) {
6792 case float_round_to_zero
:
6794 case float_round_up
:
6795 if (!extractFloat128Sign(z
)) {
6796 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6799 case float_round_down
:
6800 if (extractFloat128Sign(z
)) {
6801 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6804 case float_round_to_odd
:
6806 * Note that if lastBitMask == 0, the last bit is the lsb
6807 * of high, and roundBitsMask == -1.
6809 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6810 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6816 z
.low
&= ~ roundBitsMask
;
6819 if ( aExp
< 0x3FFF ) {
6820 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6821 float_raise(float_flag_inexact
, status
);
6822 aSign
= extractFloat128Sign( a
);
6823 switch (status
->float_rounding_mode
) {
6824 case float_round_nearest_even
:
6825 if ( ( aExp
== 0x3FFE )
6826 && ( extractFloat128Frac0( a
)
6827 | extractFloat128Frac1( a
) )
6829 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6832 case float_round_ties_away
:
6833 if (aExp
== 0x3FFE) {
6834 return packFloat128(aSign
, 0x3FFF, 0, 0);
6837 case float_round_down
:
6839 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6840 : packFloat128( 0, 0, 0, 0 );
6841 case float_round_up
:
6843 aSign
? packFloat128( 1, 0, 0, 0 )
6844 : packFloat128( 0, 0x3FFF, 0, 0 );
6846 case float_round_to_odd
:
6847 return packFloat128(aSign
, 0x3FFF, 0, 0);
6849 case float_round_to_zero
:
6852 return packFloat128( aSign
, 0, 0, 0 );
6855 lastBitMask
<<= 0x402F - aExp
;
6856 roundBitsMask
= lastBitMask
- 1;
6859 switch (status
->float_rounding_mode
) {
6860 case float_round_nearest_even
:
6861 z
.high
+= lastBitMask
>>1;
6862 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6863 z
.high
&= ~ lastBitMask
;
6866 case float_round_ties_away
:
6867 z
.high
+= lastBitMask
>>1;
6869 case float_round_to_zero
:
6871 case float_round_up
:
6872 if (!extractFloat128Sign(z
)) {
6873 z
.high
|= ( a
.low
!= 0 );
6874 z
.high
+= roundBitsMask
;
6877 case float_round_down
:
6878 if (extractFloat128Sign(z
)) {
6879 z
.high
|= (a
.low
!= 0);
6880 z
.high
+= roundBitsMask
;
6883 case float_round_to_odd
:
6884 if ((z
.high
& lastBitMask
) == 0) {
6885 z
.high
|= (a
.low
!= 0);
6886 z
.high
+= roundBitsMask
;
6892 z
.high
&= ~ roundBitsMask
;
6894 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6895 float_raise(float_flag_inexact
, status
);
6901 /*----------------------------------------------------------------------------
6902 | Returns the result of adding the absolute values of the quadruple-precision
6903 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6904 | before being returned. `zSign' is ignored if the result is a NaN.
6905 | The addition is performed according to the IEC/IEEE Standard for Binary
6906 | Floating-Point Arithmetic.
6907 *----------------------------------------------------------------------------*/
6909 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6910 float_status
*status
)
6912 int32_t aExp
, bExp
, zExp
;
6913 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6916 aSig1
= extractFloat128Frac1( a
);
6917 aSig0
= extractFloat128Frac0( a
);
6918 aExp
= extractFloat128Exp( a
);
6919 bSig1
= extractFloat128Frac1( b
);
6920 bSig0
= extractFloat128Frac0( b
);
6921 bExp
= extractFloat128Exp( b
);
6922 expDiff
= aExp
- bExp
;
6923 if ( 0 < expDiff
) {
6924 if ( aExp
== 0x7FFF ) {
6925 if (aSig0
| aSig1
) {
6926 return propagateFloat128NaN(a
, b
, status
);
6934 bSig0
|= UINT64_C(0x0001000000000000);
6936 shift128ExtraRightJamming(
6937 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6940 else if ( expDiff
< 0 ) {
6941 if ( bExp
== 0x7FFF ) {
6942 if (bSig0
| bSig1
) {
6943 return propagateFloat128NaN(a
, b
, status
);
6945 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6951 aSig0
|= UINT64_C(0x0001000000000000);
6953 shift128ExtraRightJamming(
6954 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6958 if ( aExp
== 0x7FFF ) {
6959 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6960 return propagateFloat128NaN(a
, b
, status
);
6964 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6966 if (status
->flush_to_zero
) {
6967 if (zSig0
| zSig1
) {
6968 float_raise(float_flag_output_denormal
, status
);
6970 return packFloat128(zSign
, 0, 0, 0);
6972 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6975 zSig0
|= UINT64_C(0x0002000000000000);
6979 aSig0
|= UINT64_C(0x0001000000000000);
6980 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6982 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6985 shift128ExtraRightJamming(
6986 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6988 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6992 /*----------------------------------------------------------------------------
6993 | Returns the result of subtracting the absolute values of the quadruple-
6994 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6995 | difference is negated before being returned. `zSign' is ignored if the
6996 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6997 | Standard for Binary Floating-Point Arithmetic.
6998 *----------------------------------------------------------------------------*/
7000 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7001 float_status
*status
)
7003 int32_t aExp
, bExp
, zExp
;
7004 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7007 aSig1
= extractFloat128Frac1( a
);
7008 aSig0
= extractFloat128Frac0( a
);
7009 aExp
= extractFloat128Exp( a
);
7010 bSig1
= extractFloat128Frac1( b
);
7011 bSig0
= extractFloat128Frac0( b
);
7012 bExp
= extractFloat128Exp( b
);
7013 expDiff
= aExp
- bExp
;
7014 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7015 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7016 if ( 0 < expDiff
) goto aExpBigger
;
7017 if ( expDiff
< 0 ) goto bExpBigger
;
7018 if ( aExp
== 0x7FFF ) {
7019 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7020 return propagateFloat128NaN(a
, b
, status
);
7022 float_raise(float_flag_invalid
, status
);
7023 return float128_default_nan(status
);
7029 if ( bSig0
< aSig0
) goto aBigger
;
7030 if ( aSig0
< bSig0
) goto bBigger
;
7031 if ( bSig1
< aSig1
) goto aBigger
;
7032 if ( aSig1
< bSig1
) goto bBigger
;
7033 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7036 if ( bExp
== 0x7FFF ) {
7037 if (bSig0
| bSig1
) {
7038 return propagateFloat128NaN(a
, b
, status
);
7040 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7046 aSig0
|= UINT64_C(0x4000000000000000);
7048 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7049 bSig0
|= UINT64_C(0x4000000000000000);
7051 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7054 goto normalizeRoundAndPack
;
7056 if ( aExp
== 0x7FFF ) {
7057 if (aSig0
| aSig1
) {
7058 return propagateFloat128NaN(a
, b
, status
);
7066 bSig0
|= UINT64_C(0x4000000000000000);
7068 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7069 aSig0
|= UINT64_C(0x4000000000000000);
7071 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7073 normalizeRoundAndPack
:
7075 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7080 /*----------------------------------------------------------------------------
7081 | Returns the result of adding the quadruple-precision floating-point values
7082 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7083 | for Binary Floating-Point Arithmetic.
7084 *----------------------------------------------------------------------------*/
7086 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7090 aSign
= extractFloat128Sign( a
);
7091 bSign
= extractFloat128Sign( b
);
7092 if ( aSign
== bSign
) {
7093 return addFloat128Sigs(a
, b
, aSign
, status
);
7096 return subFloat128Sigs(a
, b
, aSign
, status
);
7101 /*----------------------------------------------------------------------------
7102 | Returns the result of subtracting the quadruple-precision floating-point
7103 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7104 | Standard for Binary Floating-Point Arithmetic.
7105 *----------------------------------------------------------------------------*/
7107 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7111 aSign
= extractFloat128Sign( a
);
7112 bSign
= extractFloat128Sign( b
);
7113 if ( aSign
== bSign
) {
7114 return subFloat128Sigs(a
, b
, aSign
, status
);
7117 return addFloat128Sigs(a
, b
, aSign
, status
);
7122 /*----------------------------------------------------------------------------
7123 | Returns the result of multiplying the quadruple-precision floating-point
7124 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7125 | Standard for Binary Floating-Point Arithmetic.
7126 *----------------------------------------------------------------------------*/
7128 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7130 bool aSign
, bSign
, zSign
;
7131 int32_t aExp
, bExp
, zExp
;
7132 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7134 aSig1
= extractFloat128Frac1( a
);
7135 aSig0
= extractFloat128Frac0( a
);
7136 aExp
= extractFloat128Exp( a
);
7137 aSign
= extractFloat128Sign( a
);
7138 bSig1
= extractFloat128Frac1( b
);
7139 bSig0
= extractFloat128Frac0( b
);
7140 bExp
= extractFloat128Exp( b
);
7141 bSign
= extractFloat128Sign( b
);
7142 zSign
= aSign
^ bSign
;
7143 if ( aExp
== 0x7FFF ) {
7144 if ( ( aSig0
| aSig1
)
7145 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7146 return propagateFloat128NaN(a
, b
, status
);
7148 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7149 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7151 if ( bExp
== 0x7FFF ) {
7152 if (bSig0
| bSig1
) {
7153 return propagateFloat128NaN(a
, b
, status
);
7155 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7157 float_raise(float_flag_invalid
, status
);
7158 return float128_default_nan(status
);
7160 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7163 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7164 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7167 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7168 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7170 zExp
= aExp
+ bExp
- 0x4000;
7171 aSig0
|= UINT64_C(0x0001000000000000);
7172 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7173 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7174 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7175 zSig2
|= ( zSig3
!= 0 );
7176 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7177 shift128ExtraRightJamming(
7178 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7181 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7185 /*----------------------------------------------------------------------------
7186 | Returns the result of dividing the quadruple-precision floating-point value
7187 | `a' by the corresponding value `b'. The operation is performed according to
7188 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7189 *----------------------------------------------------------------------------*/
7191 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7193 bool aSign
, bSign
, zSign
;
7194 int32_t aExp
, bExp
, zExp
;
7195 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7196 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7198 aSig1
= extractFloat128Frac1( a
);
7199 aSig0
= extractFloat128Frac0( a
);
7200 aExp
= extractFloat128Exp( a
);
7201 aSign
= extractFloat128Sign( a
);
7202 bSig1
= extractFloat128Frac1( b
);
7203 bSig0
= extractFloat128Frac0( b
);
7204 bExp
= extractFloat128Exp( b
);
7205 bSign
= extractFloat128Sign( b
);
7206 zSign
= aSign
^ bSign
;
7207 if ( aExp
== 0x7FFF ) {
7208 if (aSig0
| aSig1
) {
7209 return propagateFloat128NaN(a
, b
, status
);
7211 if ( bExp
== 0x7FFF ) {
7212 if (bSig0
| bSig1
) {
7213 return propagateFloat128NaN(a
, b
, status
);
7217 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7219 if ( bExp
== 0x7FFF ) {
7220 if (bSig0
| bSig1
) {
7221 return propagateFloat128NaN(a
, b
, status
);
7223 return packFloat128( zSign
, 0, 0, 0 );
7226 if ( ( bSig0
| bSig1
) == 0 ) {
7227 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7229 float_raise(float_flag_invalid
, status
);
7230 return float128_default_nan(status
);
7232 float_raise(float_flag_divbyzero
, status
);
7233 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7235 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7238 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7239 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7241 zExp
= aExp
- bExp
+ 0x3FFD;
7243 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7245 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7246 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7247 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7250 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7251 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7252 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7253 while ( (int64_t) rem0
< 0 ) {
7255 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7257 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7258 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7259 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7260 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7261 while ( (int64_t) rem1
< 0 ) {
7263 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7265 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7267 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7268 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7272 /*----------------------------------------------------------------------------
7273 | Returns the remainder of the quadruple-precision floating-point value `a'
7274 | with respect to the corresponding value `b'. The operation is performed
7275 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7276 *----------------------------------------------------------------------------*/
7278 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7281 int32_t aExp
, bExp
, expDiff
;
7282 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7283 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7286 aSig1
= extractFloat128Frac1( a
);
7287 aSig0
= extractFloat128Frac0( a
);
7288 aExp
= extractFloat128Exp( a
);
7289 aSign
= extractFloat128Sign( a
);
7290 bSig1
= extractFloat128Frac1( b
);
7291 bSig0
= extractFloat128Frac0( b
);
7292 bExp
= extractFloat128Exp( b
);
7293 if ( aExp
== 0x7FFF ) {
7294 if ( ( aSig0
| aSig1
)
7295 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7296 return propagateFloat128NaN(a
, b
, status
);
7300 if ( bExp
== 0x7FFF ) {
7301 if (bSig0
| bSig1
) {
7302 return propagateFloat128NaN(a
, b
, status
);
7307 if ( ( bSig0
| bSig1
) == 0 ) {
7309 float_raise(float_flag_invalid
, status
);
7310 return float128_default_nan(status
);
7312 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7315 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7316 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7318 expDiff
= aExp
- bExp
;
7319 if ( expDiff
< -1 ) return a
;
7321 aSig0
| UINT64_C(0x0001000000000000),
7323 15 - ( expDiff
< 0 ),
7328 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7329 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7330 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7332 while ( 0 < expDiff
) {
7333 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7334 q
= ( 4 < q
) ? q
- 4 : 0;
7335 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7336 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7337 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7338 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7341 if ( -64 < expDiff
) {
7342 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7343 q
= ( 4 < q
) ? q
- 4 : 0;
7345 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7347 if ( expDiff
< 0 ) {
7348 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7351 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7353 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7354 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7357 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7358 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7361 alternateASig0
= aSig0
;
7362 alternateASig1
= aSig1
;
7364 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7365 } while ( 0 <= (int64_t) aSig0
);
7367 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7368 if ( ( sigMean0
< 0 )
7369 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7370 aSig0
= alternateASig0
;
7371 aSig1
= alternateASig1
;
7373 zSign
= ( (int64_t) aSig0
< 0 );
7374 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7375 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7379 /*----------------------------------------------------------------------------
7380 | Returns the square root of the quadruple-precision floating-point value `a'.
7381 | The operation is performed according to the IEC/IEEE Standard for Binary
7382 | Floating-Point Arithmetic.
7383 *----------------------------------------------------------------------------*/
7385 float128
float128_sqrt(float128 a
, float_status
*status
)
7389 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7390 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7392 aSig1
= extractFloat128Frac1( a
);
7393 aSig0
= extractFloat128Frac0( a
);
7394 aExp
= extractFloat128Exp( a
);
7395 aSign
= extractFloat128Sign( a
);
7396 if ( aExp
== 0x7FFF ) {
7397 if (aSig0
| aSig1
) {
7398 return propagateFloat128NaN(a
, a
, status
);
7400 if ( ! aSign
) return a
;
7404 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7406 float_raise(float_flag_invalid
, status
);
7407 return float128_default_nan(status
);
7410 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7411 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7413 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7414 aSig0
|= UINT64_C(0x0001000000000000);
7415 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7416 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7417 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7418 doubleZSig0
= zSig0
<<1;
7419 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7420 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7421 while ( (int64_t) rem0
< 0 ) {
7424 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7426 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7427 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7428 if ( zSig1
== 0 ) zSig1
= 1;
7429 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7430 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7431 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7432 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7433 while ( (int64_t) rem1
< 0 ) {
7435 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7437 term2
|= doubleZSig0
;
7438 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7440 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7442 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7443 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7447 static inline FloatRelation
7448 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7449 float_status
*status
)
7453 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7454 float_raise(float_flag_invalid
, status
);
7455 return float_relation_unordered
;
7457 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7458 ( extractFloatx80Frac( a
)<<1 ) ) ||
7459 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7460 ( extractFloatx80Frac( b
)<<1 ) )) {
7462 floatx80_is_signaling_nan(a
, status
) ||
7463 floatx80_is_signaling_nan(b
, status
)) {
7464 float_raise(float_flag_invalid
, status
);
7466 return float_relation_unordered
;
7468 aSign
= extractFloatx80Sign( a
);
7469 bSign
= extractFloatx80Sign( b
);
7470 if ( aSign
!= bSign
) {
7472 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7473 ( ( a
.low
| b
.low
) == 0 ) ) {
7475 return float_relation_equal
;
7477 return 1 - (2 * aSign
);
7480 /* Normalize pseudo-denormals before comparison. */
7481 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7484 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7487 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7488 return float_relation_equal
;
7490 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7495 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7497 return floatx80_compare_internal(a
, b
, 0, status
);
7500 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7501 float_status
*status
)
7503 return floatx80_compare_internal(a
, b
, 1, status
);
7506 static inline FloatRelation
7507 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7508 float_status
*status
)
7512 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7513 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7514 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7515 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7517 float128_is_signaling_nan(a
, status
) ||
7518 float128_is_signaling_nan(b
, status
)) {
7519 float_raise(float_flag_invalid
, status
);
7521 return float_relation_unordered
;
7523 aSign
= extractFloat128Sign( a
);
7524 bSign
= extractFloat128Sign( b
);
7525 if ( aSign
!= bSign
) {
7526 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7528 return float_relation_equal
;
7530 return 1 - (2 * aSign
);
7533 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7534 return float_relation_equal
;
7536 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7541 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7543 return float128_compare_internal(a
, b
, 0, status
);
7546 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7547 float_status
*status
)
7549 return float128_compare_internal(a
, b
, 1, status
);
7552 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7558 if (floatx80_invalid_encoding(a
)) {
7559 float_raise(float_flag_invalid
, status
);
7560 return floatx80_default_nan(status
);
7562 aSig
= extractFloatx80Frac( a
);
7563 aExp
= extractFloatx80Exp( a
);
7564 aSign
= extractFloatx80Sign( a
);
7566 if ( aExp
== 0x7FFF ) {
7568 return propagateFloatx80NaN(a
, a
, status
);
7582 } else if (n
< -0x10000) {
7587 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7588 aSign
, aExp
, aSig
, 0, status
);
7591 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7595 uint64_t aSig0
, aSig1
;
7597 aSig1
= extractFloat128Frac1( a
);
7598 aSig0
= extractFloat128Frac0( a
);
7599 aExp
= extractFloat128Exp( a
);
7600 aSign
= extractFloat128Sign( a
);
7601 if ( aExp
== 0x7FFF ) {
7602 if ( aSig0
| aSig1
) {
7603 return propagateFloat128NaN(a
, a
, status
);
7608 aSig0
|= UINT64_C(0x0001000000000000);
7609 } else if (aSig0
== 0 && aSig1
== 0) {
7617 } else if (n
< -0x10000) {
7622 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7627 static void __attribute__((constructor
)) softfloat_init(void)
7629 union_float64 ua
, ub
, uc
, ur
;
7631 if (QEMU_NO_HARDFLOAT
) {
7635 * Test that the host's FMA is not obviously broken. For example,
7636 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7637 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7639 ua
.s
= 0x0020000000000001ULL
;
7640 ub
.s
= 0x3ca0000000000000ULL
;
7641 uc
.s
= 0x0020000000000000ULL
;
7642 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7643 if (ur
.s
!= 0x0020000000000001ULL
) {
7644 force_soft_fma
= true;