]> git.proxmox.com Git - rustc.git/blame - src/libcore/num/dec2flt/algorithm.rs
New upstream version 1.18.0+dfsg1
[rustc.git] / src / libcore / num / dec2flt / algorithm.rs
CommitLineData
e9174d1e
SL
1// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! The various algorithms from the paper.
12
e9174d1e
SL
13use cmp::min;
14use cmp::Ordering::{Less, Equal, Greater};
b039eaaf
SL
15use num::diy_float::Fp;
16use num::dec2flt::table;
17use num::dec2flt::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float};
18use num::dec2flt::num::{self, Big};
e9174d1e
SL
19
20/// Number of significand bits in Fp
21const P: u32 = 64;
22
23// We simply store the best approximation for *all* exponents, so the variable "h" and the
24// associated conditions can be omitted. This trades performance for a couple kilobytes of space.
25
26fn power_of_ten(e: i16) -> Fp {
27 assert!(e >= table::MIN_E);
28 let i = e - table::MIN_E;
29 let sig = table::POWERS.0[i as usize];
30 let exp = table::POWERS.1[i as usize];
31 Fp { f: sig, e: exp }
32}
33
a7813a04
XL
// In most architectures, floating point operations have an explicit bit size, therefore the
// precision of the computation is determined on a per-operation basis.
#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
mod fpu_precision {
    // No-op on these targets: every FP operation already rounds to the width of its
    // type, so there is no global precision state to set (and nothing to restore —
    // hence no return value, unlike the x87 variant of this module).
    pub fn set_precision<T>() { }
}
40
// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
// The x87 FPU operates with 80 bits of precision by default, which means that operations will
// round to 80 bits causing double rounding to happen when values are eventually represented as
// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
// computations are performed in the desired precision.
#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
mod fpu_precision {
    use mem::size_of;

    /// A structure used to preserve the original value of the FPU control word, so that it can be
    /// restored when the structure is dropped.
    ///
    /// The x87 FPU is a 16-bits register whose fields are as follows:
    ///
    /// | 12-15 | 10-11 | 8-9 | 6-7 | 5 | 4 | 3 | 2 | 1 | 0 |
    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
    /// |       | RC    | PC  |     | PM | UM | OM | ZM | DM | IM |
    ///
    /// The documentation for all of the fields is available in the IA-32 Architectures Software
    /// Developer's Manual (Volume 1).
    ///
    /// The only field which is relevant for the following code is PC, Precision Control. This
    /// field determines the precision of the operations performed by the FPU. It can be set to:
    ///  - 0b00, single precision i.e. 32-bits
    ///  - 0b10, double precision i.e. 64-bits
    ///  - 0b11, double extended precision i.e. 80-bits (default state)
    /// The 0b01 value is reserved and should not be used.
    pub struct FPUControlWord(u16);

    // Load a new value into the FPU control word register.
    fn set_cw(cw: u16) {
        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
    }

    /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
    pub fn set_precision<T>() -> FPUControlWord {
        // NOTE(review): `cw` is written through the "=*m" (&cw) output of the
        // `fnstcw` asm below, i.e. via a pointer rather than a normal assignment,
        // which is why it is not declared `mut`. The initial 0 is overwritten.
        let cw = 0u16;

        // Compute the value for the Precision Control field that is appropriate for `T`.
        // Selection is by size only: 4 bytes -> single, 8 bytes -> double, anything
        // else falls back to the 80-bit default.
        let cw_precision = match size_of::<T>() {
            4 => 0x0000, // 32 bits
            8 => 0x0200, // 64 bits
            _ => 0x0300, // default, 80 bits
        };

        // Get the original value of the control word to restore it later, when the
        // `FPUControlWord` structure is dropped
        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }

        // Set the control word to the desired precision. This is achieved by masking away the old
        // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
        set_cw((cw & 0xFCFF) | cw_precision);

        // The saved word is the *original* one, so dropping this restores the
        // caller's precision setting.
        FPUControlWord(cw)
    }

    impl Drop for FPUControlWord {
        fn drop(&mut self) {
            set_cw(self.0)
        }
    }
}
102
e9174d1e
SL
/// The fast path of Bellerophon using machine-sized integers and floats.
///
/// This is extracted into a separate function so that it can be attempted before constructing
/// a bignum.
///
/// Returns `None` when the input is outside the region where a plain
/// multiply/divide of exactly-representable values is guaranteed correct;
/// the caller then falls back to the slower bignum-based algorithms.
pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
    let num_digits = integral.len() + fractional.len();
    // log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end,
    // this is just a quick, cheap rejection (and also frees the rest of the code from
    // worrying about underflow).
    if num_digits > 16 {
        return None;
    }
    // Reject exponents whose power of ten cannot be represented exactly in T.
    if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 {
        return None;
    }
    // Parse all digits into a machine integer; the digit-count check above
    // ensures this cannot overflow a u64.
    let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
    if f > T::MAX_SIG {
        return None;
    }

    // The fast path crucially depends on arithmetic being rounded to the correct number of bits
    // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
    // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
    // The `set_precision` function takes care of setting the precision on architectures which
    // require setting it by changing the global state (like the control word of the x87 FPU).
    let _cw = fpu_precision::set_precision::<T>();

    // The case e < 0 cannot be folded into the other branch. Negative powers result in
    // a repeating fractional part in binary, which are rounded, which causes real
    // (and occasionally quite significant!) errors in the final result.
    if e >= 0 {
        Some(T::from_int(f) * T::short_fast_pow10(e as usize))
    } else {
        Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize))
    }
}
139
/// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis.
///
/// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation
/// of `10^e` (in the same floating point format). This is often enough to get the correct result.
/// However, when the result is close to halfway between two adjacent (ordinary) floats, the
/// compound rounding error from multiplying two approximation means the result may be off by a
/// few bits. When this happens, the iterative Algorithm R fixes things up.
///
/// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper.
/// In the words of Clinger:
///
/// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error
/// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is
/// > not a bound for the true error, but bounds the difference between the approximation z and
/// > the best possible approximation that uses p bits of significand.)
pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
    // Slop depends on whether `f` itself fits the target significand exactly
    // and on the sign of `e` — the four constants come from the paper's analysis.
    let slop;
    if f <= &Big::from_u64(T::MAX_SIG) {
        // The cases abs(e) < log5(2^N) are in fast_path()
        slop = if e >= 0 { 0 } else { 3 };
    } else {
        slop = if e >= 0 { 1 } else { 4 };
    }
    // Approximate f * 10^e in 64-bit extended precision.
    let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
    // exp_p_n = 2^(P - SIG_BITS): the number of low-order bits of the 64-bit
    // significand that will be rounded away when narrowing to T.
    let exp_p_n = 1 << (P - T::SIG_BITS as u32);
    let lowbits: i64 = (z.f % exp_p_n) as i64;
    // Is the slop large enough to make a difference when
    // rounding to n bits? (i.e., are the rounded-off bits within `slop` of the
    // halfway point?) If so, fall back to the exact iterative fixup.
    if (lowbits - exp_p_n as i64 / 2).abs() <= slop {
        algorithm_r(f, e, fp_to_float(z))
    } else {
        fp_to_float(z)
    }
}
174
/// An iterative algorithm that improves a floating point approximation of `f * 10^e`.
///
/// Each iteration gets one unit in the last place closer, which of course takes terribly long to
/// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the
/// starting approximation is off by at most one ULP.
fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T {
    let mut z = z0;
    loop {
        // Decompose the current candidate into significand `m` and exponent `k`.
        let raw = z.unpack();
        let (m, k) = (raw.sig, raw.k);
        let mut x = f.clone();
        let mut y = Big::from_u64(m);

        // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`.
        // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the
        // power of two common to `10^e` and `2^k` to make the numbers smaller.
        make_ratio(&mut x, &mut y, e, k);

        // `m` split into two 32-bit digits for the bignum multiplication.
        let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32];
        // This is written a bit awkwardly because our bignums don't support
        // negative numbers, so we use the absolute value + sign information.
        // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that
        // we need to worry about overflow, then they are also large enough that `make_ratio` has
        // reduced the fraction by a factor of 2^64 or more.
        // d2 = 2 * m * |x - y|, d_negative records the sign of (x - y).
        let (d2, d_negative) = if x >= y {
            // Don't need x any more, save a clone().
            x.sub(&y).mul_pow2(1).mul_digits(&m_digits);
            (x, false)
        } else {
            // Still need y - make a copy.
            let mut y = y.clone();
            y.sub(&x).mul_pow2(1).mul_digits(&m_digits);
            (y, true)
        };

        // Compare the (scaled) error d2 against y, i.e. the error against half an
        // ULP of the candidate; adjust z by one ULP in the right direction or stop.
        if d2 < y {
            // Error is below half an ULP — z is correct, except possibly at the
            // boundary where the ULP size shrinks (m == MIN_SIG, stepping down).
            let mut d2_double = d2;
            d2_double.mul_pow2(1);
            if m == T::MIN_SIG && d_negative && d2_double > y {
                z = prev_float(z);
            } else {
                return z;
            }
        } else if d2 == y {
            // Exactly halfway: round to even, again special-casing the MIN_SIG
            // boundary where the neighbor below has a smaller ULP.
            if m % 2 == 0 {
                if m == T::MIN_SIG && d_negative {
                    z = prev_float(z);
                } else {
                    return z;
                }
            } else if d_negative {
                z = prev_float(z);
            } else {
                z = next_float(z);
            }
        } else if d_negative {
            // More than half an ULP too large — step down and iterate.
            z = prev_float(z);
        } else {
            // More than half an ULP too small — step up and iterate.
            z = next_float(z);
        }
    }
}
237
238/// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the
239/// significand of a floating point approximation, make the ratio `x / y` equal to
240/// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common.
241fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) {
242 let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize);
243 if e >= 0 {
244 if k >= 0 {
245 // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two.
246 let common = min(e_abs, k_abs);
247 x.mul_pow5(e_abs).mul_pow2(e_abs - common);
248 y.mul_pow2(k_abs - common);
249 } else {
250 // x = f * 10^e * 2^abs(k), y = m
251 // This can't overflow because it requires positive `e` and negative `k`, which can
252 // only happen for values extremely close to 1, which means that `e` and `k` will be
253 // comparatively tiny.
254 x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs);
255 }
256 } else {
257 if k >= 0 {
258 // x = f, y = m * 10^abs(e) * 2^k
259 // This can't overflow either, see above.
260 y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs);
261 } else {
262 // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two.
263 let common = min(e_abs, k_abs);
264 x.mul_pow2(k_abs - common);
265 y.mul_pow5(e_abs).mul_pow2(e_abs - common);
266 }
267 }
268}
269
/// Conceptually, Algorithm M is the simplest way to convert a decimal to a float.
///
/// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives
/// a valid float significand. The binary exponent `k` is the number of times we multiplied
/// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`.
/// When we have found out significand, we only need to round by inspecting the remainder of the
/// division, which is done in helper functions further below.
///
/// This algorithm is super slow, even with the optimization described in `quick_start()`.
/// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal
/// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed.
/// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand,
/// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return
/// infinity.
///
/// Handling underflow and subnormals is trickier. One big problem is that, with the minimum
/// exponent, the ratio might still be too large for a significand. See underflow() for details.
pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
    let mut u;
    let mut v;
    let e_abs = e.abs() as usize;
    let mut k = 0;
    // Set up u / v == f * 10^e: fold the power of ten into the denominator when
    // e < 0, into the numerator otherwise.
    if e < 0 {
        u = f.clone();
        v = Big::from_small(1);
        v.mul_pow5(e_abs).mul_pow2(e_abs);
    } else {
        // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of
        // fp_to_float(big_to_fp(u)) here, only without the double rounding.
        u = f.clone();
        u.mul_pow5(e_abs).mul_pow2(e_abs);
        v = Big::from_small(1);
    }
    // Jump close to the target ratio in one step instead of shifting bit by bit.
    quick_start::<T>(&mut u, &mut v, &mut k);
    let mut rem = Big::from_small(0);
    let mut x = Big::from_small(0);
    let min_sig = Big::from_u64(T::MIN_SIG);
    let max_sig = Big::from_u64(T::MAX_SIG);
    loop {
        u.div_rem(&v, &mut x, &mut rem);
        if k == T::MIN_EXP_INT {
            // We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`,
            // then we'd be off by a factor of two. Unfortunately this means we have to special-
            // case normal numbers with the minimum exponent.
            // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
            // that it's actually correct!
            if x >= min_sig && x <= max_sig {
                break;
            }
            return underflow(x, v, rem);
        }
        if k > T::MAX_EXP_INT {
            return T::INFINITY;
        }
        // Quotient too small/large for a significand: double the numerator or
        // denominator respectively and adjust the binary exponent to compensate.
        if x < min_sig {
            u.mul_pow2(1);
            k -= 1;
        } else if x > max_sig {
            v.mul_pow2(1);
            k += 1;
        } else {
            break;
        }
    }
    // x is now an in-range significand; encode it and round by the remainder.
    let q = num::to_u64(&x);
    let z = rawfp::encode_normal(Unpacked::new(q, k));
    round_by_remainder(v, rem, q, z)
}
338
339/// Skip over most AlgorithmM iterations by checking the bit length.
340fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
341 // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v).
342 // The estimate is off by at most 1, but always an under-estimate, so the error on log(u)
343 // and log(v) are of the same sign and cancel out (if both are large). Therefore the error
344 // for log(u / v) is at most one as well.
345 // The target ratio is one where u/v is in an in-range significand. Thus our termination
346 // condition is log2(u / v) being the significand bits, plus/minus one.
347 // FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
cc61c64b 348 let target_ratio = T::SIG_BITS as i16;
e9174d1e
SL
349 let log2_u = u.bit_length() as i16;
350 let log2_v = v.bit_length() as i16;
351 let mut u_shift: i16 = 0;
352 let mut v_shift: i16 = 0;
353 assert!(*k == 0);
354 loop {
cc61c64b 355 if *k == T::MIN_EXP_INT {
e9174d1e
SL
356 // Underflow or subnormal. Leave it to the main function.
357 break;
358 }
cc61c64b 359 if *k == T::MAX_EXP_INT {
e9174d1e
SL
360 // Overflow. Leave it to the main function.
361 break;
362 }
363 let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift);
364 if log2_ratio < target_ratio - 1 {
365 u_shift += 1;
366 *k -= 1;
367 } else if log2_ratio > target_ratio + 1 {
368 v_shift += 1;
369 *k += 1;
370 } else {
371 break;
372 }
373 }
374 u.mul_pow2(u_shift as usize);
375 v.mul_pow2(v_shift as usize);
376}
377
// Finish Algorithm M for results at the minimum exponent: either encode a
// subnormal, or (when the ratio is still too wide) truncate excess bits and
// round, bumping the exponent accordingly.
fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
    if x < Big::from_u64(T::MIN_SIG) {
        // Genuine subnormal: the quotient fits below the normal range.
        let q = num::to_u64(&x);
        let z = rawfp::encode_subnormal(q);
        return round_by_remainder(v, rem, q, z);
    }
    // Ratio isn't an in-range significand with the minimum exponent, so we need to round off
    // excess bits and adjust the exponent accordingly. The real value now looks like this:
    //
    //        x lsb
    // /--------------\/
    // 1010101010101010.10101010101010 * 2^k
    // \-----/\-------/ \------------/
    //    q     trunc.  (represented by rem)
    //
    // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding
    // on their own. When they are equal and the remainder is non-zero, the value still
    // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder
    // is zero, we have a half-to-even situation.
    let bits = x.bit_length();
    // Keep the top SIG_BITS bits as the significand q; everything below `lsb`
    // is the truncated part that drives the rounding decision.
    let lsb = bits - T::SIG_BITS as usize;
    let q = num::get_bits(&x, lsb, bits);
    let k = T::MIN_EXP_INT + lsb as i16;
    let z = rawfp::encode_normal(Unpacked::new(q, k));
    let q_even = q % 2 == 0;
    match num::compare_with_half_ulp(&x, lsb) {
        Greater => next_float(z),
        Less => z,
        // Exactly half an ULP and nothing left in the remainder: ties-to-even.
        Equal if rem.is_zero() && q_even => z,
        Equal => next_float(z),
    }
}
410
411/// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division.
412fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T {
413 let mut v_minus_r = v;
414 v_minus_r.sub(&r);
415 if r < v_minus_r {
416 z
417 } else if r > v_minus_r {
418 next_float(z)
419 } else if q % 2 == 0 {
420 z
421 } else {
422 next_float(z)
423 }
424}