ceph/src/boost/boost/json/detail/ryu/impl/d2s.ipp

   1 // Copyright 2018 Ulf Adams
   2 //
   3 // The contents of this file may be used under the terms of the Apache License,
   4 // Version 2.0.
   5 //
   6 //    (See accompanying file LICENSE-Apache or copy at
   7 //     http://www.apache.org/licenses/LICENSE-2.0)
   8 //
   9 // Alternatively, the contents of this file may be used under the terms of
  10 // the Boost Software License, Version 1.0.
  11 //    (See accompanying file LICENSE-Boost or copy at
  12 //     https://www.boost.org/LICENSE_1_0.txt)
  13 //
  14 // Unless required by applicable law or agreed to in writing, this software
  15 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  16 // KIND, either express or implied.
  17
// Compile-time options (preprocessor defines):
// -DRYU_DEBUG Generate verbose debugging output to stdout.
//
// -DRYU_ONLY_64_BIT_OPS Avoid using uint128_t or 64-bit intrinsics. Slower,
//     depending on your compiler.
//
// -DBOOST_JSON_RYU_OPTIMIZE_SIZE Use smaller lookup tables. Instead of storing
//     every required power of 5, only store every 26th entry, and compute
//     intermediate values with a multiplication. This reduces the lookup table
//     size by about 10x (only one case, and only double) at the cost of some
//     performance. Currently requires MSVC intrinsics.
  29
  30 /*
  31     This is a derivative work
  32 */
  33
  34 #ifndef BOOST_JSON_DETAIL_RYU_IMPL_D2S_IPP
  35 #define BOOST_JSON_DETAIL_RYU_IMPL_D2S_IPP
  36
  37 #include <boost/json/detail/ryu/ryu.hpp>
  38 #include <cstdlib>
  39 #include <cstring>
  40
  41 #ifdef RYU_DEBUG
  42 #include <stdio.h>
  43 #endif
  44
  45 // ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
  46 // Let's do the same for now.
  47 #if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
  48 #define BOOST_JSON_RYU_HAS_UINT128
  49 #elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
  50 #define BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS
  51 #endif
  52
  53 #include <boost/json/detail/ryu/detail/common.hpp>
  54 #include <boost/json/detail/ryu/detail/digit_table.hpp>
  55 #include <boost/json/detail/ryu/detail/d2s.hpp>
  56 #include <boost/json/detail/ryu/detail/d2s_intrinsics.hpp>
  57
  58 BOOST_JSON_NS_BEGIN
  59 namespace detail {
  60
  61 namespace ryu {
  62 namespace detail {
  63
  64 // We need a 64x128-bit multiplication and a subsequent 128-bit shift.
  65 // Multiplication:
  66 //   The 64-bit factor is variable and passed in, the 128-bit factor comes
  67 //   from a lookup table. We know that the 64-bit factor only has 55
  68 //   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
  69 //   factor only has 124 significant bits (i.e., the 4 topmost bits are
  70 //   zeros).
  71 // Shift:
  72 //   In principle, the multiplication result requires 55 + 124 = 179 bits to
  73 //   represent. However, we then shift this value to the right by j, which is
  74 //   at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
  75 //   bits. This means that we only need the topmost 64 significant bits of
  76 //   the 64x128-bit multiplication.
  77 //
  78 // There are several ways to do this:
  79 // 1. Best case: the compiler exposes a 128-bit type.
  80 //    We perform two 64x64-bit multiplications, add the higher 64 bits of the
  81 //    lower result to the higher result, and shift by j - 64 bits.
  82 //
  83 //    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
  84 //    that these are only 64-bit inputs, and can map these to the best
  85 //    possible sequence of assembly instructions.
  86 //    x64 machines happen to have matching assembly instructions for
  87 //    64x64-bit multiplications and 128-bit shifts.
  88 //
  89 // 2. Second best case: the compiler exposes intrinsics for the x64 assembly
  90 //    instructions mentioned in 1.
  91 //
  92 // 3. We only have 64x64 bit instructions that return the lower 64 bits of
  93 //    the result, i.e., we have to use plain C.
  94 //    Our inputs are less than the full width, so we have three options:
  95 //    a. Ignore this fact and just implement the intrinsics manually.
  96 //    b. Split both into 31-bit pieces, which guarantees no internal overflow,
  97 //       but requires extra work upfront (unless we change the lookup table).
  98 //    c. Split only the first factor into 31-bit pieces, which also guarantees
  99 //       no internal overflow, but requires extra work since the intermediate
 100 //       results are not perfectly aligned.
 101 #if defined(BOOST_JSON_RYU_HAS_UINT128)
 102
 103 // Best case: use 128-bit type.
 104 inline
 105 std::uint64_t
 106     mulShift(
 107     const std::uint64_t m,
 108     const std::uint64_t* const mul,
 109     const std::int32_t j) noexcept
 110 {
 111     const uint128_t b0 = ((uint128_t) m) * mul[0];
 112     const uint128_t b2 = ((uint128_t) m) * mul[1];
 113     return (std::uint64_t) (((b0 >> 64) + b2) >> (j - 64));
 114 }
 115
 116 inline
 117 uint64_t
 118 mulShiftAll(
 119     const std::uint64_t m,
 120     const std::uint64_t* const mul,
 121     std::int32_t const j,
 122     std::uint64_t* const vp,
 123     std::uint64_t* const vm,
 124     const std::uint32_t mmShift) noexcept
 125 {
 126 //  m <<= 2;
 127 //  uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
 128 //  uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
 129 //
 130 //  uint128_t hi = (b0 >> 64) + b2;
 131 //  uint128_t lo = b0 & 0xffffffffffffffffull;
 132 //  uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
 133 //  uint128_t vpLo = lo + (factor << 1);
 134 //  *vp = (std::uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
 135 //  uint128_t vmLo = lo - (factor << mmShift);
 136 //  *vm = (std::uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
 137 //  return (std::uint64_t) (hi >> (j - 64));
 138     *vp = mulShift(4 * m + 2, mul, j);
 139     *vm = mulShift(4 * m - 1 - mmShift, mul, j);
 140     return mulShift(4 * m, mul, j);
 141 }
 142
 143 #elif defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)
 144
 145 inline
 146 std::uint64_t
 147 mulShift(
 148     const std::uint64_t m,
 149     const std::uint64_t* const mul,
 150     const std::int32_t j) noexcept
 151 {
 152     // m is maximum 55 bits
 153     std::uint64_t high1;                                   // 128
 154     std::uint64_t const low1 = umul128(m, mul[1], &high1); // 64
 155     std::uint64_t high0;                                   // 64
 156     umul128(m, mul[0], &high0);                            // 0
 157     std::uint64_t const sum = high0 + low1;
 158     if (sum < high0)
 159         ++high1; // overflow into high1
 160     return shiftright128(sum, high1, j - 64);
 161 }
 162
 163 inline
 164 std::uint64_t
 165 mulShiftAll(
 166     const std::uint64_t m,
 167     const std::uint64_t* const mul,
 168     const std::int32_t j,
 169     std::uint64_t* const vp,
 170     std::uint64_t* const vm,
 171     const std::uint32_t mmShift) noexcept
 172 {
 173     *vp = mulShift(4 * m + 2, mul, j);
 174     *vm = mulShift(4 * m - 1 - mmShift, mul, j);
 175     return mulShift(4 * m, mul, j);
 176 }
 177
 178 #else // !defined(BOOST_JSON_RYU_HAS_UINT128) && !defined(BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS)
 179
 180 inline
 181 std::uint64_t
 182 mulShiftAll(
 183     std::uint64_t m,
 184     const std::uint64_t* const mul,
 185     const std::int32_t j,
 186     std::uint64_t* const vp,
 187     std::uint64_t* const vm,
 188     const std::uint32_t mmShift)
 189 {
 190     m <<= 1;
 191     // m is maximum 55 bits
 192     std::uint64_t tmp;
 193     std::uint64_t const lo = umul128(m, mul[0], &tmp);
 194     std::uint64_t hi;
 195     std::uint64_t const mid = tmp + umul128(m, mul[1], &hi);
 196     hi += mid < tmp; // overflow into hi
 197
 198     const std::uint64_t lo2 = lo + mul[0];
 199     const std::uint64_t mid2 = mid + mul[1] + (lo2 < lo);
 200     const std::uint64_t hi2 = hi + (mid2 < mid);
 201     *vp = shiftright128(mid2, hi2, (std::uint32_t)(j - 64 - 1));
 202
 203     if (mmShift == 1)
 204     {
 205         const std::uint64_t lo3 = lo - mul[0];
 206         const std::uint64_t mid3 = mid - mul[1] - (lo3 > lo);
 207         const std::uint64_t hi3 = hi - (mid3 > mid);
 208         *vm = shiftright128(mid3, hi3, (std::uint32_t)(j - 64 - 1));
 209     }
 210     else
 211     {
 212         const std::uint64_t lo3 = lo + lo;
 213         const std::uint64_t mid3 = mid + mid + (lo3 < lo);
 214         const std::uint64_t hi3 = hi + hi + (mid3 < mid);
 215         const std::uint64_t lo4 = lo3 - mul[0];
 216         const std::uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
 217         const std::uint64_t hi4 = hi3 - (mid4 > mid3);
 218         *vm = shiftright128(mid4, hi4, (std::uint32_t)(j - 64));
 219     }
 220
 221     return shiftright128(mid, hi, (std::uint32_t)(j - 64 - 1));
 222 }
 223
 224 #endif // BOOST_JSON_RYU_HAS_64_BIT_INTRINSICS
 225
 226 inline
 227 std::uint32_t
 228 decimalLength17(
 229     const std::uint64_t v)
 230 {
 231     // This is slightly faster than a loop.
 232     // The average output length is 16.38 digits, so we check high-to-low.
 233     // Function precondition: v is not an 18, 19, or 20-digit number.
 234     // (17 digits are sufficient for round-tripping.)
 235     BOOST_ASSERT(v < 100000000000000000L);
 236     if (v >= 10000000000000000L) { return 17; }
 237     if (v >= 1000000000000000L) { return 16; }
 238     if (v >= 100000000000000L) { return 15; }
 239     if (v >= 10000000000000L) { return 14; }
 240     if (v >= 1000000000000L) { return 13; }
 241     if (v >= 100000000000L) { return 12; }
 242     if (v >= 10000000000L) { return 11; }
 243     if (v >= 1000000000L) { return 10; }
 244     if (v >= 100000000L) { return 9; }
 245     if (v >= 10000000L) { return 8; }
 246     if (v >= 1000000L) { return 7; }
 247     if (v >= 100000L) { return 6; }
 248     if (v >= 10000L) { return 5; }
 249     if (v >= 1000L) { return 4; }
 250     if (v >= 100L) { return 3; }
 251     if (v >= 10L) { return 2; }
 252     return 1;
 253 }
 254
// A floating decimal representing mantissa * 10^exponent.
// Produced by d2d / d2d_small_int and consumed by to_chars.
struct floating_decimal_64
{
    // Decimal significand (the digits to print, without a decimal point).
    std::uint64_t mantissa;
    // Decimal exponent's range is -324 to 308
    // inclusive, and can fit in a short if needed.
    std::int32_t exponent;
};
 263
 264 inline
 265 floating_decimal_64
 266 d2d(
 267     const std::uint64_t ieeeMantissa,
 268     const std::uint32_t ieeeExponent)
 269 {
 270     std::int32_t e2;
 271     std::uint64_t m2;
 272     if (ieeeExponent == 0)
 273     {
 274         // We subtract 2 so that the bounds computation has 2 additional bits.
 275         e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
 276         m2 = ieeeMantissa;
 277     }
 278     else
 279     {
 280         e2 = (std::int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
 281         m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
 282     }
 283     const bool even = (m2 & 1) == 0;
 284     const bool acceptBounds = even;
 285
 286 #ifdef RYU_DEBUG
 287     printf("-> %" PRIu64 " * 2^%d\n", m2, e2 + 2);
 288 #endif
 289
 290     // Step 2: Determine the interval of valid decimal representations.
 291     const std::uint64_t mv = 4 * m2;
 292     // Implicit bool -> int conversion. True is 1, false is 0.
 293     const std::uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
 294     // We would compute mp and mm like this:
 295     // uint64_t mp = 4 * m2 + 2;
 296     // uint64_t mm = mv - 1 - mmShift;
 297
 298     // Step 3: Convert to a decimal power base using 128-bit arithmetic.
 299     std::uint64_t vr, vp, vm;
 300     std::int32_t e10;
 301     bool vmIsTrailingZeros = false;
 302     bool vrIsTrailingZeros = false;
 303     if (e2 >= 0) {
 304         // I tried special-casing q == 0, but there was no effect on performance.
 305         // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
 306         const std::uint32_t q = log10Pow2(e2) - (e2 > 3);
 307         e10 = (std::int32_t)q;
 308         const std::int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1;
 309         const std::int32_t i = -e2 + (std::int32_t)q + k;
 310 #if defined(BOOST_JSON_RYU_OPTIMIZE_SIZE)
 311         uint64_t pow5[2];
 312         double_computeInvPow5(q, pow5);
 313         vr = mulShiftAll(m2, pow5, i, &vp, &vm, mmShift);
 314 #else
 315         vr = mulShiftAll(m2, DOUBLE_POW5_INV_SPLIT()[q], i, &vp, &vm, mmShift);
 316 #endif
 317 #ifdef RYU_DEBUG
 318         printf("%" PRIu64 " * 2^%d / 10^%u\n", mv, e2, q);
 319         printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
 320 #endif
 321         if (q <= 21)
 322         {
 323             // This should use q <= 22, but I think 21 is also safe. Smaller values
 324             // may still be safe, but it's more difficult to reason about them.
 325             // Only one of mp, mv, and mm can be a multiple of 5, if any.
 326             const std::uint32_t mvMod5 = ((std::uint32_t)mv) - 5 * ((std::uint32_t)div5(mv));
 327             if (mvMod5 == 0)
 328             {
 329                 vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
 330             }
 331             else if (acceptBounds)
 332             {
 333                 // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
 334                 // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
 335                 // <=> true && pow5Factor(mm) >= q, since e2 >= q.
 336                 vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
 337             }
 338             else
 339             {
 340                 // Same as min(e2 + 1, pow5Factor(mp)) >= q.
 341                 vp -= multipleOfPowerOf5(mv + 2, q);
 342             }
 343         }
 344     }
 345     else
 346     {
 347         // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
 348         const std::uint32_t q = log10Pow5(-e2) - (-e2 > 1);
 349         e10 = (std::int32_t)q + e2;
 350         const std::int32_t i = -e2 - (std::int32_t)q;
 351         const std::int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
 352         const std::int32_t j = (std::int32_t)q - k;
 353 #if defined(BOOST_JSON_RYU_OPTIMIZE_SIZE)
 354         std::uint64_t pow5[2];
 355         double_computePow5(i, pow5);
 356         vr = mulShiftAll(m2, pow5, j, &vp, &vm, mmShift);
 357 #else
 358         vr = mulShiftAll(m2, DOUBLE_POW5_SPLIT()[i], j, &vp, &vm, mmShift);
 359 #endif
 360 #ifdef RYU_DEBUG
 361         printf("%" PRIu64 " * 5^%d / 10^%u\n", mv, -e2, q);
 362         printf("%u %d %d %d\n", q, i, k, j);
 363         printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
 364 #endif
 365         if (q <= 1)
 366         {
 367             // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
 368             // mv = 4 * m2, so it always has at least two trailing 0 bits.
 369             vrIsTrailingZeros = true;
 370             if (acceptBounds)
 371             {
 372                 // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
 373                 vmIsTrailingZeros = mmShift == 1;
 374             }
 375             else
 376             {
 377                 // mp = mv + 2, so it always has at least one trailing 0 bit.
 378                 --vp;
 379             }
 380         }
 381         else if (q < 63)
 382         {
 383             // TODO(ulfjack): Use a tighter bound here.
 384             // We want to know if the full product has at least q trailing zeros.
 385             // We need to compute min(p2(mv), p5(mv) - e2) >= q
 386             // <=> p2(mv) >= q && p5(mv) - e2 >= q
 387             // <=> p2(mv) >= q (because -e2 >= q)
 388             vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
 389 #ifdef RYU_DEBUG
 390             printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
 391 #endif
 392         }
 393     }
 394 #ifdef RYU_DEBUG
 395     printf("e10=%d\n", e10);
 396     printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
 397     printf("vm is trailing zeros=%s\n", vmIsTrailingZeros ? "true" : "false");
 398     printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
 399 #endif
 400
 401     // Step 4: Find the shortest decimal representation in the interval of valid representations.
 402     std::int32_t removed = 0;
 403     std::uint8_t lastRemovedDigit = 0;
 404     std::uint64_t output;
 405     // On average, we remove ~2 digits.
 406     if (vmIsTrailingZeros || vrIsTrailingZeros)
 407     {
 408         // General case, which happens rarely (~0.7%).
 409         for (;;)
 410         {
 411             const std::uint64_t vpDiv10 = div10(vp);
 412             const std::uint64_t vmDiv10 = div10(vm);
 413             if (vpDiv10 <= vmDiv10)
 414                 break;
 415             const std::uint32_t vmMod10 = ((std::uint32_t)vm) - 10 * ((std::uint32_t)vmDiv10);
 416             const std::uint64_t vrDiv10 = div10(vr);
 417             const std::uint32_t vrMod10 = ((std::uint32_t)vr) - 10 * ((std::uint32_t)vrDiv10);
 418             vmIsTrailingZeros &= vmMod10 == 0;
 419             vrIsTrailingZeros &= lastRemovedDigit == 0;
 420             lastRemovedDigit = (uint8_t)vrMod10;
 421             vr = vrDiv10;
 422             vp = vpDiv10;
 423             vm = vmDiv10;
 424             ++removed;
 425         }
 426 #ifdef RYU_DEBUG
 427         printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
 428         printf("d-10=%s\n", vmIsTrailingZeros ? "true" : "false");
 429 #endif
 430         if (vmIsTrailingZeros)
 431         {
 432             for (;;)
 433             {
 434                 const std::uint64_t vmDiv10 = div10(vm);
 435                 const std::uint32_t vmMod10 = ((std::uint32_t)vm) - 10 * ((std::uint32_t)vmDiv10);
 436                 if (vmMod10 != 0)
 437                     break;
 438                 const std::uint64_t vpDiv10 = div10(vp);
 439                 const std::uint64_t vrDiv10 = div10(vr);
 440                 const std::uint32_t vrMod10 = ((std::uint32_t)vr) - 10 * ((std::uint32_t)vrDiv10);
 441                 vrIsTrailingZeros &= lastRemovedDigit == 0;
 442                 lastRemovedDigit = (uint8_t)vrMod10;
 443                 vr = vrDiv10;
 444                 vp = vpDiv10;
 445                 vm = vmDiv10;
 446                 ++removed;
 447             }
 448         }
 449 #ifdef RYU_DEBUG
 450         printf("%" PRIu64 " %d\n", vr, lastRemovedDigit);
 451         printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
 452 #endif
 453         if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0)
 454         {
 455             // Round even if the exact number is .....50..0.
 456             lastRemovedDigit = 4;
 457         }
 458         // We need to take vr + 1 if vr is outside bounds or we need to round up.
 459         output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
 460     }
 461     else
 462     {
 463         // Specialized for the common case (~99.3%). Percentages below are relative to this.
 464         bool roundUp = false;
 465         const std::uint64_t vpDiv100 = div100(vp);
 466         const std::uint64_t vmDiv100 = div100(vm);
 467         if (vpDiv100 > vmDiv100)
 468         {
 469             // Optimization: remove two digits at a time (~86.2%).
 470             const std::uint64_t vrDiv100 = div100(vr);
 471             const std::uint32_t vrMod100 = ((std::uint32_t)vr) - 100 * ((std::uint32_t)vrDiv100);
 472             roundUp = vrMod100 >= 50;
 473             vr = vrDiv100;
 474             vp = vpDiv100;
 475             vm = vmDiv100;
 476             removed += 2;
 477         }
 478         // Loop iterations below (approximately), without optimization above:
 479         // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
 480         // Loop iterations below (approximately), with optimization above:
 481         // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
 482         for (;;)
 483         {
 484             const std::uint64_t vpDiv10 = div10(vp);
 485             const std::uint64_t vmDiv10 = div10(vm);
 486             if (vpDiv10 <= vmDiv10)
 487                 break;
 488             const std::uint64_t vrDiv10 = div10(vr);
 489             const std::uint32_t vrMod10 = ((std::uint32_t)vr) - 10 * ((std::uint32_t)vrDiv10);
 490             roundUp = vrMod10 >= 5;
 491             vr = vrDiv10;
 492             vp = vpDiv10;
 493             vm = vmDiv10;
 494             ++removed;
 495         }
 496 #ifdef RYU_DEBUG
 497         printf("%" PRIu64 " roundUp=%s\n", vr, roundUp ? "true" : "false");
 498         printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
 499 #endif
 500         // We need to take vr + 1 if vr is outside bounds or we need to round up.
 501         output = vr + (vr == vm || roundUp);
 502     }
 503     const std::int32_t exp = e10 + removed;
 504
 505 #ifdef RYU_DEBUG
 506     printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
 507     printf("O=%" PRIu64 "\n", output);
 508     printf("EXP=%d\n", exp);
 509 #endif
 510
 511     floating_decimal_64 fd;
 512     fd.exponent = exp;
 513     fd.mantissa = output;
 514     return fd;
 515 }
 516
 517 inline
 518 int
 519 to_chars(
 520     const floating_decimal_64 v,
 521     const bool sign,
 522     char* const result)
 523 {
 524     // Step 5: Print the decimal representation.
 525     int index = 0;
 526     if (sign)
 527         result[index++] = '-';
 528
 529     std::uint64_t output = v.mantissa;
 530     std::uint32_t const olength = decimalLength17(output);
 531
 532 #ifdef RYU_DEBUG
 533     printf("DIGITS=%" PRIu64 "\n", v.mantissa);
 534     printf("OLEN=%u\n", olength);
 535     printf("EXP=%u\n", v.exponent + olength);
 536 #endif
 537
 538     // Print the decimal digits.
 539     // The following code is equivalent to:
 540     // for (uint32_t i = 0; i < olength - 1; ++i) {
 541     //   const uint32_t c = output % 10; output /= 10;
 542     //   result[index + olength - i] = (char) ('0' + c);
 543     // }
 544     // result[index] = '0' + output % 10;
 545
 546     std::uint32_t i = 0;
 547     // We prefer 32-bit operations, even on 64-bit platforms.
 548     // We have at most 17 digits, and uint32_t can store 9 digits.
 549     // If output doesn't fit into uint32_t, we cut off 8 digits,
 550     // so the rest will fit into uint32_t.
 551     if ((output >> 32) != 0)
 552     {
 553         // Expensive 64-bit division.
 554         std::uint64_t const q = div1e8(output);
 555         std::uint32_t output2 = ((std::uint32_t)output) - 100000000 * ((std::uint32_t)q);
 556         output = q;
 557
 558         const std::uint32_t c = output2 % 10000;
 559         output2 /= 10000;
 560         const std::uint32_t d = output2 % 10000;
 561         const std::uint32_t c0 = (c % 100) << 1;
 562         const std::uint32_t c1 = (c / 100) << 1;
 563         const std::uint32_t d0 = (d % 100) << 1;
 564         const std::uint32_t d1 = (d / 100) << 1;
 565         std::memcpy(result + index + olength - i - 1, DIGIT_TABLE() + c0, 2);
 566         std::memcpy(result + index + olength - i - 3, DIGIT_TABLE() + c1, 2);
 567         std::memcpy(result + index + olength - i - 5, DIGIT_TABLE() + d0, 2);
 568         std::memcpy(result + index + olength - i - 7, DIGIT_TABLE() + d1, 2);
 569         i += 8;
 570     }
 571     uint32_t output2 = (std::uint32_t)output;
 572     while (output2 >= 10000)
 573     {
 574 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
 575         const uint32_t c = output2 - 10000 * (output2 / 10000);
 576 #else
 577         const uint32_t c = output2 % 10000;
 578 #endif
 579         output2 /= 10000;
 580         const uint32_t c0 = (c % 100) << 1;
 581         const uint32_t c1 = (c / 100) << 1;
 582         memcpy(result + index + olength - i - 1, DIGIT_TABLE() + c0, 2);
 583         memcpy(result + index + olength - i - 3, DIGIT_TABLE() + c1, 2);
 584         i += 4;
 585     }
 586     if (output2 >= 100) {
 587         const uint32_t c = (output2 % 100) << 1;
 588         output2 /= 100;
 589         memcpy(result + index + olength - i - 1, DIGIT_TABLE() + c, 2);
 590         i += 2;
 591     }
 592     if (output2 >= 10) {
 593         const uint32_t c = output2 << 1;
 594         // We can't use memcpy here: the decimal dot goes between these two digits.
 595         result[index + olength - i] = DIGIT_TABLE()[c + 1];
 596         result[index] = DIGIT_TABLE()[c];
 597     }
 598     else {
 599         result[index] = (char)('0' + output2);
 600     }
 601
 602     // Print decimal point if needed.
 603     if (olength > 1) {
 604         result[index + 1] = '.';
 605         index += olength + 1;
 606     }
 607     else {
 608         ++index;
 609     }
 610
 611     // Print the exponent.
 612     result[index++] = 'E';
 613     int32_t exp = v.exponent + (int32_t)olength - 1;
 614     if (exp < 0) {
 615         result[index++] = '-';
 616         exp = -exp;
 617     }
 618
 619     if (exp >= 100) {
 620         const int32_t c = exp % 10;
 621         memcpy(result + index, DIGIT_TABLE() + 2 * (exp / 10), 2);
 622         result[index + 2] = (char)('0' + c);
 623         index += 3;
 624     }
 625     else if (exp >= 10) {
 626         memcpy(result + index, DIGIT_TABLE() + 2 * exp, 2);
 627         index += 2;
 628     }
 629     else {
 630         result[index++] = (char)('0' + exp);
 631     }
 632
 633     return index;
 634 }
 635
 636 static inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
 637   floating_decimal_64* const v) {
 638   const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
 639   const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
 640
 641   if (e2 > 0) {
 642     // f = m2 * 2^e2 >= 2^53 is an integer.
 643     // Ignore this case for now.
 644     return false;
 645   }
 646
 647   if (e2 < -52) {
 648     // f < 1.
 649     return false;
 650   }
 651
 652   // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
 653   // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
 654   const uint64_t mask = (1ull << -e2) - 1;
 655   const uint64_t fraction = m2 & mask;
 656   if (fraction != 0) {
 657     return false;
 658   }
 659
 660   // f is an integer in the range [1, 2^53).
 661   // Note: mantissa might contain trailing (decimal) 0's.
 662   // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
 663   v->mantissa = m2 >> -e2;
 664   v->exponent = 0;
 665   return true;
 666 }
 667
 668 } // detail
 669
 670 int
 671 d2s_buffered_n(
 672     double f,
 673     char* result) noexcept
 674 {
 675     using namespace detail;
 676     // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
 677     std::uint64_t const bits = double_to_bits(f);
 678
 679 #ifdef RYU_DEBUG
 680     printf("IN=");
 681     for (std::int32_t bit = 63; bit >= 0; --bit) {
 682         printf("%d", (int)((bits >> bit) & 1));
 683     }
 684     printf("\n");
 685 #endif
 686
 687     // Decode bits into sign, mantissa, and exponent.
 688     const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
 689     const std::uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
 690     const std::uint32_t ieeeExponent = (std::uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
 691     // Case distinction; exit early for the easy cases.
 692     if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
 693         return copy_special_str(result, ieeeSign, ieeeExponent != 0, ieeeMantissa != 0);
 694     }
 695
 696     floating_decimal_64 v;
 697     const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
 698     if (isSmallInt) {
 699         // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
 700         // For scientific notation we need to move these zeros into the exponent.
 701         // (This is not needed for fixed-point notation, so it might be beneficial to trim
 702         // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
 703         for (;;) {
 704             std::uint64_t const q = div10(v.mantissa);
 705             std::uint32_t const r = ((std::uint32_t) v.mantissa) - 10 * ((std::uint32_t) q);
 706             if (r != 0)
 707                 break;
 708             v.mantissa = q;
 709             ++v.exponent;
 710         }
 711     }
 712     else {
 713         v = d2d(ieeeMantissa, ieeeExponent);
 714     }
 715
 716     return to_chars(v, ieeeSign, result);
 717 }
 718
 719 } // ryu
 720
 721 } // detail
 722 BOOST_JSON_NS_END
 723
 724 #endif