// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! The various algorithms from the paper.
e9174d1e SL |
13 | use cmp::min; |
14 | use cmp::Ordering::{Less, Equal, Greater}; | |
b039eaaf SL |
15 | use num::diy_float::Fp; |
16 | use num::dec2flt::table; | |
17 | use num::dec2flt::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float}; | |
18 | use num::dec2flt::num::{self, Big}; | |
e9174d1e SL |
19 | |
20 | /// Number of significand bits in Fp | |
21 | const P: u32 = 64; | |
22 | ||
23 | // We simply store the best approximation for *all* exponents, so the variable "h" and the | |
24 | // associated conditions can be omitted. This trades performance for a couple kilobytes of space. | |
25 | ||
26 | fn power_of_ten(e: i16) -> Fp { | |
27 | assert!(e >= table::MIN_E); | |
28 | let i = e - table::MIN_E; | |
29 | let sig = table::POWERS.0[i as usize]; | |
30 | let exp = table::POWERS.1[i as usize]; | |
31 | Fp { f: sig, e: exp } | |
32 | } | |
33 | ||
a7813a04 XL |
// In most architectures, floating point operations have an explicit bit size, therefore the
// precision of the computation is determined on a per-operation basis.
#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
mod fpu_precision {
    /// No-op on targets where rounding precision is implicit in each instruction;
    /// there is no global precision state to set or restore.
    pub fn set_precision<T>() { }
}
40 | ||
// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
// The x87 FPU operates with 80 bits of precision by default, which means that operations will
// round to 80 bits causing double rounding to happen when values are eventually represented as
// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
// computations are performed in the desired precision.
#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
mod fpu_precision {
    use mem::size_of;

    /// A structure used to preserve the original value of the FPU control word, so that it can be
    /// restored when the structure is dropped.
    ///
    /// The x87 FPU is a 16-bits register whose fields are as follows:
    ///
    /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
    /// |       |  RC   | PC  |     | PM | UM | OM | ZM | DM | IM |
    ///
    /// The documentation for all of the fields is available in the IA-32 Architectures Software
    /// Developer's Manual (Volume 1).
    ///
    /// The only field which is relevant for the following code is PC, Precision Control. This
    /// field determines the precision of the operations performed by the FPU. It can be set to:
    ///  - 0b00, single precision i.e. 32-bits
    ///  - 0b10, double precision i.e. 64-bits
    ///  - 0b11, double extended precision i.e. 80-bits (default state)
    /// The 0b01 value is reserved and should not be used.
    pub struct FPUControlWord(u16);

    /// Loads `cw` into the FPU control word register.
    fn set_cw(cw: u16) {
        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
    }

    /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
    /// that restores the previous control word when dropped.
    pub fn set_precision<T>() -> FPUControlWord {
        let cw = 0u16;

        // Pick the Precision Control bit pattern matching the size of `T`.
        let cw_precision = match size_of::<T>() {
            4 => 0x0000, // 32 bits
            8 => 0x0200, // 64 bits
            _ => 0x0300, // default, 80 bits
        };

        // Read the current control word so it can be restored later, when the
        // `FPUControlWord` structure is dropped.
        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }

        // Install the desired precision: mask away the old PC field (bits 8 and 9,
        // 0x300) and or in the precision flag computed above.
        set_cw((cw & 0xFCFF) | cw_precision);

        FPUControlWord(cw)
    }

    impl Drop for FPUControlWord {
        fn drop(&mut self) {
            set_cw(self.0)
        }
    }
}
102 | ||
e9174d1e SL |
103 | /// The fast path of Bellerophon using machine-sized integers and floats. |
104 | /// | |
105 | /// This is extracted into a separate function so that it can be attempted before constructing | |
106 | /// a bignum. | |
e9174d1e SL |
107 | pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> { |
108 | let num_digits = integral.len() + fractional.len(); | |
cc61c64b | 109 | // log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end, |
e9174d1e SL |
110 | // this is just a quick, cheap rejection (and also frees the rest of the code from |
111 | // worrying about underflow). | |
112 | if num_digits > 16 { | |
113 | return None; | |
114 | } | |
cc61c64b | 115 | if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 { |
e9174d1e SL |
116 | return None; |
117 | } | |
118 | let f = num::from_str_unchecked(integral.iter().chain(fractional.iter())); | |
cc61c64b | 119 | if f > T::MAX_SIG { |
e9174d1e SL |
120 | return None; |
121 | } | |
a7813a04 XL |
122 | |
123 | // The fast path crucially depends on arithmetic being rounded to the correct number of bits | |
124 | // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision | |
125 | // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit. | |
126 | // The `set_precision` function takes care of setting the precision on architectures which | |
127 | // require setting it by changing the global state (like the control word of the x87 FPU). | |
128 | let _cw = fpu_precision::set_precision::<T>(); | |
129 | ||
e9174d1e SL |
130 | // The case e < 0 cannot be folded into the other branch. Negative powers result in |
131 | // a repeating fractional part in binary, which are rounded, which causes real | |
a7813a04 | 132 | // (and occasionally quite significant!) errors in the final result. |
9cc50fc6 SL |
133 | if e >= 0 { |
134 | Some(T::from_int(f) * T::short_fast_pow10(e as usize)) | |
e9174d1e | 135 | } else { |
9cc50fc6 | 136 | Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize)) |
e9174d1e SL |
137 | } |
138 | } | |
139 | ||
140 | /// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis. | |
141 | /// | |
142 | /// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation | |
143 | /// of `10^e` (in the same floating point format). This is often enough to get the correct result. | |
cc61c64b | 144 | /// However, when the result is close to halfway between two adjacent (ordinary) floats, the |
e9174d1e SL |
145 | /// compound rounding error from multiplying two approximation means the result may be off by a |
146 | /// few bits. When this happens, the iterative Algorithm R fixes things up. | |
147 | /// | |
148 | /// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper. | |
149 | /// In the words of Clinger: | |
150 | /// | |
151 | /// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error | |
152 | /// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is | |
153 | /// > not a bound for the true error, but bounds the difference between the approximation z and | |
154 | /// > the best possible approximation that uses p bits of significand.) | |
155 | pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T { | |
156 | let slop; | |
cc61c64b | 157 | if f <= &Big::from_u64(T::MAX_SIG) { |
e9174d1e SL |
158 | // The cases abs(e) < log5(2^N) are in fast_path() |
159 | slop = if e >= 0 { 0 } else { 3 }; | |
160 | } else { | |
161 | slop = if e >= 0 { 1 } else { 4 }; | |
162 | } | |
163 | let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize(); | |
cc61c64b | 164 | let exp_p_n = 1 << (P - T::SIG_BITS as u32); |
e9174d1e SL |
165 | let lowbits: i64 = (z.f % exp_p_n) as i64; |
166 | // Is the slop large enough to make a difference when | |
167 | // rounding to n bits? | |
168 | if (lowbits - exp_p_n as i64 / 2).abs() <= slop { | |
169 | algorithm_r(f, e, fp_to_float(z)) | |
170 | } else { | |
171 | fp_to_float(z) | |
172 | } | |
173 | } | |
174 | ||
175 | /// An iterative algorithm that improves a floating point approximation of `f * 10^e`. | |
176 | /// | |
177 | /// Each iteration gets one unit in the last place closer, which of course takes terribly long to | |
178 | /// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the | |
179 | /// starting approximation is off by at most one ULP. | |
180 | fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T { | |
181 | let mut z = z0; | |
182 | loop { | |
183 | let raw = z.unpack(); | |
184 | let (m, k) = (raw.sig, raw.k); | |
185 | let mut x = f.clone(); | |
186 | let mut y = Big::from_u64(m); | |
187 | ||
188 | // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`. | |
189 | // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the | |
190 | // power of two common to `10^e` and `2^k` to make the numbers smaller. | |
191 | make_ratio(&mut x, &mut y, e, k); | |
192 | ||
193 | let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32]; | |
194 | // This is written a bit awkwardly because our bignums don't support | |
195 | // negative numbers, so we use the absolute value + sign information. | |
196 | // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that | |
7453a54e | 197 | // we need to worry about overflow, then they are also large enough that `make_ratio` has |
e9174d1e SL |
198 | // reduced the fraction by a factor of 2^64 or more. |
199 | let (d2, d_negative) = if x >= y { | |
200 | // Don't need x any more, save a clone(). | |
201 | x.sub(&y).mul_pow2(1).mul_digits(&m_digits); | |
202 | (x, false) | |
203 | } else { | |
204 | // Still need y - make a copy. | |
205 | let mut y = y.clone(); | |
206 | y.sub(&x).mul_pow2(1).mul_digits(&m_digits); | |
207 | (y, true) | |
208 | }; | |
209 | ||
210 | if d2 < y { | |
211 | let mut d2_double = d2; | |
212 | d2_double.mul_pow2(1); | |
cc61c64b | 213 | if m == T::MIN_SIG && d_negative && d2_double > y { |
e9174d1e SL |
214 | z = prev_float(z); |
215 | } else { | |
216 | return z; | |
217 | } | |
218 | } else if d2 == y { | |
219 | if m % 2 == 0 { | |
cc61c64b | 220 | if m == T::MIN_SIG && d_negative { |
e9174d1e SL |
221 | z = prev_float(z); |
222 | } else { | |
223 | return z; | |
224 | } | |
225 | } else if d_negative { | |
226 | z = prev_float(z); | |
227 | } else { | |
228 | z = next_float(z); | |
229 | } | |
230 | } else if d_negative { | |
231 | z = prev_float(z); | |
232 | } else { | |
233 | z = next_float(z); | |
234 | } | |
235 | } | |
236 | } | |
237 | ||
238 | /// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the | |
239 | /// significand of a floating point approximation, make the ratio `x / y` equal to | |
240 | /// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common. | |
241 | fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) { | |
242 | let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize); | |
243 | if e >= 0 { | |
244 | if k >= 0 { | |
245 | // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two. | |
246 | let common = min(e_abs, k_abs); | |
247 | x.mul_pow5(e_abs).mul_pow2(e_abs - common); | |
248 | y.mul_pow2(k_abs - common); | |
249 | } else { | |
250 | // x = f * 10^e * 2^abs(k), y = m | |
251 | // This can't overflow because it requires positive `e` and negative `k`, which can | |
252 | // only happen for values extremely close to 1, which means that `e` and `k` will be | |
253 | // comparatively tiny. | |
254 | x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs); | |
255 | } | |
256 | } else { | |
257 | if k >= 0 { | |
258 | // x = f, y = m * 10^abs(e) * 2^k | |
259 | // This can't overflow either, see above. | |
260 | y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs); | |
261 | } else { | |
262 | // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two. | |
263 | let common = min(e_abs, k_abs); | |
264 | x.mul_pow2(k_abs - common); | |
265 | y.mul_pow5(e_abs).mul_pow2(e_abs - common); | |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | /// Conceptually, Algorithm M is the simplest way to convert a decimal to a float. | |
271 | /// | |
272 | /// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives | |
273 | /// a valid float significand. The binary exponent `k` is the number of times we multiplied | |
274 | /// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`. | |
275 | /// When we have found out significand, we only need to round by inspecting the remainder of the | |
276 | /// division, which is done in helper functions further below. | |
277 | /// | |
278 | /// This algorithm is super slow, even with the optimization described in `quick_start()`. | |
279 | /// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal | |
280 | /// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed. | |
281 | /// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand, | |
282 | /// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return | |
283 | /// infinity. | |
284 | /// | |
285 | /// Handling underflow and subnormals is trickier. One big problem is that, with the minimum | |
286 | /// exponent, the ratio might still be too large for a significand. See underflow() for details. | |
287 | pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T { | |
288 | let mut u; | |
289 | let mut v; | |
290 | let e_abs = e.abs() as usize; | |
291 | let mut k = 0; | |
292 | if e < 0 { | |
293 | u = f.clone(); | |
294 | v = Big::from_small(1); | |
295 | v.mul_pow5(e_abs).mul_pow2(e_abs); | |
296 | } else { | |
297 | // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of | |
298 | // fp_to_float(big_to_fp(u)) here, only without the double rounding. | |
299 | u = f.clone(); | |
300 | u.mul_pow5(e_abs).mul_pow2(e_abs); | |
301 | v = Big::from_small(1); | |
302 | } | |
303 | quick_start::<T>(&mut u, &mut v, &mut k); | |
304 | let mut rem = Big::from_small(0); | |
305 | let mut x = Big::from_small(0); | |
cc61c64b XL |
306 | let min_sig = Big::from_u64(T::MIN_SIG); |
307 | let max_sig = Big::from_u64(T::MAX_SIG); | |
e9174d1e SL |
308 | loop { |
309 | u.div_rem(&v, &mut x, &mut rem); | |
cc61c64b XL |
310 | if k == T::MIN_EXP_INT { |
311 | // We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`, | |
e9174d1e SL |
312 | // then we'd be off by a factor of two. Unfortunately this means we have to special- |
313 | // case normal numbers with the minimum exponent. | |
314 | // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure | |
315 | // that it's actually correct! | |
316 | if x >= min_sig && x <= max_sig { | |
317 | break; | |
318 | } | |
319 | return underflow(x, v, rem); | |
320 | } | |
cc61c64b XL |
321 | if k > T::MAX_EXP_INT { |
322 | return T::INFINITY; | |
e9174d1e SL |
323 | } |
324 | if x < min_sig { | |
325 | u.mul_pow2(1); | |
326 | k -= 1; | |
327 | } else if x > max_sig { | |
328 | v.mul_pow2(1); | |
329 | k += 1; | |
330 | } else { | |
331 | break; | |
332 | } | |
333 | } | |
334 | let q = num::to_u64(&x); | |
335 | let z = rawfp::encode_normal(Unpacked::new(q, k)); | |
336 | round_by_remainder(v, rem, q, z) | |
337 | } | |
338 | ||
339 | /// Skip over most AlgorithmM iterations by checking the bit length. | |
340 | fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) { | |
341 | // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v). | |
342 | // The estimate is off by at most 1, but always an under-estimate, so the error on log(u) | |
343 | // and log(v) are of the same sign and cancel out (if both are large). Therefore the error | |
344 | // for log(u / v) is at most one as well. | |
345 | // The target ratio is one where u/v is in an in-range significand. Thus our termination | |
346 | // condition is log2(u / v) being the significand bits, plus/minus one. | |
347 | // FIXME Looking at the second bit could improve the estimate and avoid some more divisions. | |
cc61c64b | 348 | let target_ratio = T::SIG_BITS as i16; |
e9174d1e SL |
349 | let log2_u = u.bit_length() as i16; |
350 | let log2_v = v.bit_length() as i16; | |
351 | let mut u_shift: i16 = 0; | |
352 | let mut v_shift: i16 = 0; | |
353 | assert!(*k == 0); | |
354 | loop { | |
cc61c64b | 355 | if *k == T::MIN_EXP_INT { |
e9174d1e SL |
356 | // Underflow or subnormal. Leave it to the main function. |
357 | break; | |
358 | } | |
cc61c64b | 359 | if *k == T::MAX_EXP_INT { |
e9174d1e SL |
360 | // Overflow. Leave it to the main function. |
361 | break; | |
362 | } | |
363 | let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift); | |
364 | if log2_ratio < target_ratio - 1 { | |
365 | u_shift += 1; | |
366 | *k -= 1; | |
367 | } else if log2_ratio > target_ratio + 1 { | |
368 | v_shift += 1; | |
369 | *k += 1; | |
370 | } else { | |
371 | break; | |
372 | } | |
373 | } | |
374 | u.mul_pow2(u_shift as usize); | |
375 | v.mul_pow2(v_shift as usize); | |
376 | } | |
377 | ||
378 | fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T { | |
cc61c64b | 379 | if x < Big::from_u64(T::MIN_SIG) { |
e9174d1e SL |
380 | let q = num::to_u64(&x); |
381 | let z = rawfp::encode_subnormal(q); | |
382 | return round_by_remainder(v, rem, q, z); | |
383 | } | |
384 | // Ratio isn't an in-range significand with the minimum exponent, so we need to round off | |
385 | // excess bits and adjust the exponent accordingly. The real value now looks like this: | |
386 | // | |
387 | // x lsb | |
388 | // /--------------\/ | |
389 | // 1010101010101010.10101010101010 * 2^k | |
390 | // \-----/\-------/ \------------/ | |
391 | // q trunc. (represented by rem) | |
392 | // | |
393 | // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding | |
394 | // on their own. When they are equal and the remainder is non-zero, the value still | |
cc61c64b | 395 | // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder |
e9174d1e SL |
396 | // is zero, we have a half-to-even situation. |
397 | let bits = x.bit_length(); | |
cc61c64b | 398 | let lsb = bits - T::SIG_BITS as usize; |
e9174d1e | 399 | let q = num::get_bits(&x, lsb, bits); |
cc61c64b | 400 | let k = T::MIN_EXP_INT + lsb as i16; |
e9174d1e SL |
401 | let z = rawfp::encode_normal(Unpacked::new(q, k)); |
402 | let q_even = q % 2 == 0; | |
403 | match num::compare_with_half_ulp(&x, lsb) { | |
404 | Greater => next_float(z), | |
405 | Less => z, | |
406 | Equal if rem.is_zero() && q_even => z, | |
407 | Equal => next_float(z), | |
408 | } | |
409 | } | |
410 | ||
411 | /// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division. | |
412 | fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T { | |
413 | let mut v_minus_r = v; | |
414 | v_minus_r.sub(&r); | |
415 | if r < v_minus_r { | |
416 | z | |
417 | } else if r > v_minus_r { | |
418 | next_float(z) | |
419 | } else if q % 2 == 0 { | |
420 | z | |
421 | } else { | |
422 | next_float(z) | |
423 | } | |
424 | } |