// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! The various algorithms from the paper.
e9174d1e SL |
13 | use cmp::min; |
14 | use cmp::Ordering::{Less, Equal, Greater}; | |
b039eaaf SL |
15 | use num::diy_float::Fp; |
16 | use num::dec2flt::table; | |
17 | use num::dec2flt::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float}; | |
18 | use num::dec2flt::num::{self, Big}; | |
e9174d1e SL |
19 | |
20 | /// Number of significand bits in Fp | |
21 | const P: u32 = 64; | |
22 | ||
23 | // We simply store the best approximation for *all* exponents, so the variable "h" and the | |
24 | // associated conditions can be omitted. This trades performance for a couple kilobytes of space. | |
25 | ||
26 | fn power_of_ten(e: i16) -> Fp { | |
27 | assert!(e >= table::MIN_E); | |
28 | let i = e - table::MIN_E; | |
29 | let sig = table::POWERS.0[i as usize]; | |
30 | let exp = table::POWERS.1[i as usize]; | |
31 | Fp { f: sig, e: exp } | |
32 | } | |
33 | ||
a7813a04 XL |
// In most architectures, floating point operations have an explicit bit size, therefore the
// precision of the computation is determined on a per-operation basis.
#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
mod fpu_precision {
    /// No-op on targets where rounding precision is implicit in each instruction;
    /// there is no global precision state to set or restore.
    pub fn set_precision<T>() { }
}
40 | ||
// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
// The x87 FPU operates with 80 bits of precision by default, which means that operations will
// round to 80 bits causing double rounding to happen when values are eventually represented as
// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
// computations are performed in the desired precision.
#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
mod fpu_precision {
    use mem::size_of;

    /// A structure used to preserve the original value of the FPU control word, so that it can be
    /// restored when the structure is dropped.
    ///
    /// The x87 FPU is a 16-bits register whose fields are as follows:
    ///
    /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
    /// |       |  RC   | PC  |     | PM | UM | OM | ZM | DM | IM |
    ///
    /// The documentation for all of the fields is available in the IA-32 Architectures Software
    /// Developer's Manual (Volume 1).
    ///
    /// The only field which is relevant for the following code is PC, Precision Control. This
    /// field determines the precision of the operations performed by the FPU. It can be set to:
    ///  - 0b00, single precision i.e. 32-bits
    ///  - 0b10, double precision i.e. 64-bits
    ///  - 0b11, double extended precision i.e. 80-bits (default state)
    /// The 0b01 value is reserved and should not be used.
    pub struct FPUControlWord(u16);

    /// Loads `cw` into the FPU control word register.
    fn set_cw(cw: u16) {
        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
    }

    /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
    /// that restores the previous control word when dropped.
    pub fn set_precision<T>() -> FPUControlWord {
        let cw = 0u16;

        // Pick the Precision Control bit pattern matching the size of `T`.
        let cw_precision = match size_of::<T>() {
            4 => 0x0000, // 32 bits
            8 => 0x0200, // 64 bits
            _ => 0x0300, // default, 80 bits
        };

        // Read the current control word so it can be restored later, when the
        // `FPUControlWord` structure is dropped.
        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }

        // Install the desired precision: mask away the old PC field (bits 8 and 9,
        // 0x300) and or in the precision flag computed above.
        set_cw((cw & 0xFCFF) | cw_precision);

        FPUControlWord(cw)
    }

    impl Drop for FPUControlWord {
        fn drop(&mut self) {
            set_cw(self.0)
        }
    }
}
102 | ||
e9174d1e SL |
103 | /// The fast path of Bellerophon using machine-sized integers and floats. |
104 | /// | |
105 | /// This is extracted into a separate function so that it can be attempted before constructing | |
106 | /// a bignum. | |
e9174d1e SL |
107 | pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> { |
108 | let num_digits = integral.len() + fractional.len(); | |
cc61c64b | 109 | // log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end, |
e9174d1e SL |
110 | // this is just a quick, cheap rejection (and also frees the rest of the code from |
111 | // worrying about underflow). | |
112 | if num_digits > 16 { | |
113 | return None; | |
114 | } | |
cc61c64b | 115 | if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 { |
e9174d1e SL |
116 | return None; |
117 | } | |
118 | let f = num::from_str_unchecked(integral.iter().chain(fractional.iter())); | |
cc61c64b | 119 | if f > T::MAX_SIG { |
e9174d1e SL |
120 | return None; |
121 | } | |
a7813a04 XL |
122 | |
123 | // The fast path crucially depends on arithmetic being rounded to the correct number of bits | |
124 | // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision | |
125 | // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit. | |
126 | // The `set_precision` function takes care of setting the precision on architectures which | |
127 | // require setting it by changing the global state (like the control word of the x87 FPU). | |
128 | let _cw = fpu_precision::set_precision::<T>(); | |
129 | ||
e9174d1e SL |
130 | // The case e < 0 cannot be folded into the other branch. Negative powers result in |
131 | // a repeating fractional part in binary, which are rounded, which causes real | |
a7813a04 | 132 | // (and occasionally quite significant!) errors in the final result. |
9cc50fc6 SL |
133 | if e >= 0 { |
134 | Some(T::from_int(f) * T::short_fast_pow10(e as usize)) | |
e9174d1e | 135 | } else { |
9cc50fc6 | 136 | Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize)) |
e9174d1e SL |
137 | } |
138 | } | |
139 | ||
140 | /// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis. | |
141 | /// | |
142 | /// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation | |
143 | /// of `10^e` (in the same floating point format). This is often enough to get the correct result. | |
cc61c64b | 144 | /// However, when the result is close to halfway between two adjacent (ordinary) floats, the |
e9174d1e SL |
145 | /// compound rounding error from multiplying two approximation means the result may be off by a |
146 | /// few bits. When this happens, the iterative Algorithm R fixes things up. | |
147 | /// | |
148 | /// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper. | |
149 | /// In the words of Clinger: | |
150 | /// | |
151 | /// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error | |
152 | /// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is | |
153 | /// > not a bound for the true error, but bounds the difference between the approximation z and | |
154 | /// > the best possible approximation that uses p bits of significand.) | |
155 | pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T { | |
156 | let slop; | |
cc61c64b | 157 | if f <= &Big::from_u64(T::MAX_SIG) { |
e9174d1e SL |
158 | // The cases abs(e) < log5(2^N) are in fast_path() |
159 | slop = if e >= 0 { 0 } else { 3 }; | |
160 | } else { | |
161 | slop = if e >= 0 { 1 } else { 4 }; | |
162 | } | |
163 | let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize(); | |
cc61c64b | 164 | let exp_p_n = 1 << (P - T::SIG_BITS as u32); |
e9174d1e SL |
165 | let lowbits: i64 = (z.f % exp_p_n) as i64; |
166 | // Is the slop large enough to make a difference when | |
167 | // rounding to n bits? | |
168 | if (lowbits - exp_p_n as i64 / 2).abs() <= slop { | |
169 | algorithm_r(f, e, fp_to_float(z)) | |
170 | } else { | |
171 | fp_to_float(z) | |
172 | } | |
173 | } | |
174 | ||
175 | /// An iterative algorithm that improves a floating point approximation of `f * 10^e`. | |
176 | /// | |
177 | /// Each iteration gets one unit in the last place closer, which of course takes terribly long to | |
178 | /// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the | |
179 | /// starting approximation is off by at most one ULP. | |
180 | fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T { | |
181 | let mut z = z0; | |
182 | loop { | |
183 | let raw = z.unpack(); | |
184 | let (m, k) = (raw.sig, raw.k); | |
185 | let mut x = f.clone(); | |
186 | let mut y = Big::from_u64(m); | |
187 | ||
188 | // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`. | |
189 | // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the | |
190 | // power of two common to `10^e` and `2^k` to make the numbers smaller. | |
191 | make_ratio(&mut x, &mut y, e, k); | |
192 | ||
193 | let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32]; | |
194 | // This is written a bit awkwardly because our bignums don't support | |
195 | // negative numbers, so we use the absolute value + sign information. | |
196 | // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that | |
7453a54e | 197 | // we need to worry about overflow, then they are also large enough that `make_ratio` has |
e9174d1e SL |
198 | // reduced the fraction by a factor of 2^64 or more. |
199 | let (d2, d_negative) = if x >= y { | |
200 | // Don't need x any more, save a clone(). | |
201 | x.sub(&y).mul_pow2(1).mul_digits(&m_digits); | |
202 | (x, false) | |
203 | } else { | |
204 | // Still need y - make a copy. | |
205 | let mut y = y.clone(); | |
206 | y.sub(&x).mul_pow2(1).mul_digits(&m_digits); | |
207 | (y, true) | |
208 | }; | |
209 | ||
210 | if d2 < y { | |
211 | let mut d2_double = d2; | |
212 | d2_double.mul_pow2(1); | |
cc61c64b | 213 | if m == T::MIN_SIG && d_negative && d2_double > y { |
e9174d1e SL |
214 | z = prev_float(z); |
215 | } else { | |
216 | return z; | |
217 | } | |
218 | } else if d2 == y { | |
219 | if m % 2 == 0 { | |
cc61c64b | 220 | if m == T::MIN_SIG && d_negative { |
e9174d1e SL |
221 | z = prev_float(z); |
222 | } else { | |
223 | return z; | |
224 | } | |
225 | } else if d_negative { | |
226 | z = prev_float(z); | |
227 | } else { | |
228 | z = next_float(z); | |
229 | } | |
230 | } else if d_negative { | |
231 | z = prev_float(z); | |
232 | } else { | |
233 | z = next_float(z); | |
234 | } | |
235 | } | |
236 | } | |
237 | ||
238 | /// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the | |
239 | /// significand of a floating point approximation, make the ratio `x / y` equal to | |
240 | /// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common. | |
241 | fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) { | |
242 | let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize); | |
243 | if e >= 0 { | |
244 | if k >= 0 { | |
245 | // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two. | |
246 | let common = min(e_abs, k_abs); | |
247 | x.mul_pow5(e_abs).mul_pow2(e_abs - common); | |
248 | y.mul_pow2(k_abs - common); | |
249 | } else { | |
250 | // x = f * 10^e * 2^abs(k), y = m | |
251 | // This can't overflow because it requires positive `e` and negative `k`, which can | |
252 | // only happen for values extremely close to 1, which means that `e` and `k` will be | |
253 | // comparatively tiny. | |
254 | x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs); | |
255 | } | |
256 | } else { | |
257 | if k >= 0 { | |
258 | // x = f, y = m * 10^abs(e) * 2^k | |
259 | // This can't overflow either, see above. | |
260 | y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs); | |
261 | } else { | |
262 | // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two. | |
263 | let common = min(e_abs, k_abs); | |
264 | x.mul_pow2(k_abs - common); | |
265 | y.mul_pow5(e_abs).mul_pow2(e_abs - common); | |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | /// Conceptually, Algorithm M is the simplest way to convert a decimal to a float. | |
271 | /// | |
272 | /// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives | |
273 | /// a valid float significand. The binary exponent `k` is the number of times we multiplied | |
274 | /// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`. | |
275 | /// When we have found out significand, we only need to round by inspecting the remainder of the | |
276 | /// division, which is done in helper functions further below. | |
277 | /// | |
278 | /// This algorithm is super slow, even with the optimization described in `quick_start()`. | |
279 | /// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal | |
280 | /// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed. | |
281 | /// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand, | |
282 | /// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return | |
283 | /// infinity. | |
284 | /// | |
285 | /// Handling underflow and subnormals is trickier. One big problem is that, with the minimum | |
286 | /// exponent, the ratio might still be too large for a significand. See underflow() for details. | |
287 | pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T { | |
288 | let mut u; | |
289 | let mut v; | |
290 | let e_abs = e.abs() as usize; | |
291 | let mut k = 0; | |
292 | if e < 0 { | |
293 | u = f.clone(); | |
294 | v = Big::from_small(1); | |
295 | v.mul_pow5(e_abs).mul_pow2(e_abs); | |
296 | } else { | |
297 | // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of | |
298 | // fp_to_float(big_to_fp(u)) here, only without the double rounding. | |
299 | u = f.clone(); | |
300 | u.mul_pow5(e_abs).mul_pow2(e_abs); | |
301 | v = Big::from_small(1); | |
302 | } | |
303 | quick_start::<T>(&mut u, &mut v, &mut k); | |
304 | let mut rem = Big::from_small(0); | |
305 | let mut x = Big::from_small(0); | |
cc61c64b XL |
306 | let min_sig = Big::from_u64(T::MIN_SIG); |
307 | let max_sig = Big::from_u64(T::MAX_SIG); | |
e9174d1e SL |
308 | loop { |
309 | u.div_rem(&v, &mut x, &mut rem); | |
cc61c64b XL |
310 | if k == T::MIN_EXP_INT { |
311 | // We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`, | |
e9174d1e SL |
312 | // then we'd be off by a factor of two. Unfortunately this means we have to special- |
313 | // case normal numbers with the minimum exponent. | |
314 | // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure | |
315 | // that it's actually correct! | |
316 | if x >= min_sig && x <= max_sig { | |
317 | break; | |
318 | } | |
319 | return underflow(x, v, rem); | |
320 | } | |
cc61c64b XL |
321 | if k > T::MAX_EXP_INT { |
322 | return T::INFINITY; | |
e9174d1e SL |
323 | } |
324 | if x < min_sig { | |
325 | u.mul_pow2(1); | |
326 | k -= 1; | |
327 | } else if x > max_sig { | |
328 | v.mul_pow2(1); | |
329 | k += 1; | |
330 | } else { | |
331 | break; | |
332 | } | |
333 | } | |
334 | let q = num::to_u64(&x); | |
335 | let z = rawfp::encode_normal(Unpacked::new(q, k)); | |
336 | round_by_remainder(v, rem, q, z) | |
337 | } | |
338 | ||
339 | /// Skip over most AlgorithmM iterations by checking the bit length. | |
340 | fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) { | |
341 | // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v). | |
342 | // The estimate is off by at most 1, but always an under-estimate, so the error on log(u) | |
343 | // and log(v) are of the same sign and cancel out (if both are large). Therefore the error | |
344 | // for log(u / v) is at most one as well. | |
345 | // The target ratio is one where u/v is in an in-range significand. Thus our termination | |
346 | // condition is log2(u / v) being the significand bits, plus/minus one. | |
347 | // FIXME Looking at the second bit could improve the estimate and avoid some more divisions. | |
cc61c64b | 348 | let target_ratio = T::SIG_BITS as i16; |
e9174d1e SL |
349 | let log2_u = u.bit_length() as i16; |
350 | let log2_v = v.bit_length() as i16; | |
351 | let mut u_shift: i16 = 0; | |
352 | let mut v_shift: i16 = 0; | |
353 | assert!(*k == 0); | |
354 | loop { | |
cc61c64b | 355 | if *k == T::MIN_EXP_INT { |
e9174d1e SL |
356 | // Underflow or subnormal. Leave it to the main function. |
357 | break; | |
358 | } | |
cc61c64b | 359 | if *k == T::MAX_EXP_INT { |
e9174d1e SL |
360 | // Overflow. Leave it to the main function. |
361 | break; | |
362 | } | |
363 | let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift); | |
364 | if log2_ratio < target_ratio - 1 { | |
365 | u_shift += 1; | |
366 | *k -= 1; | |
367 | } else if log2_ratio > target_ratio + 1 { | |
368 | v_shift += 1; | |
369 | *k += 1; | |
370 | } else { | |
371 | break; | |
372 | } | |
373 | } | |
374 | u.mul_pow2(u_shift as usize); | |
375 | v.mul_pow2(v_shift as usize); | |
376 | } | |
377 | ||
378 | fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T { | |
cc61c64b | 379 | if x < Big::from_u64(T::MIN_SIG) { |
e9174d1e SL |
380 | let q = num::to_u64(&x); |
381 | let z = rawfp::encode_subnormal(q); | |
382 | return round_by_remainder(v, rem, q, z); | |
383 | } | |
384 | // Ratio isn't an in-range significand with the minimum exponent, so we need to round off | |
385 | // excess bits and adjust the exponent accordingly. The real value now looks like this: | |
386 | // | |
387 | // x lsb | |
388 | // /--------------\/ | |
389 | // 1010101010101010.10101010101010 * 2^k | |
390 | // \-----/\-------/ \------------/ | |
391 | // q trunc. (represented by rem) | |
392 | // | |
393 | // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding | |
394 | // on their own. When they are equal and the remainder is non-zero, the value still | |
cc61c64b | 395 | // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder |
e9174d1e SL |
396 | // is zero, we have a half-to-even situation. |
397 | let bits = x.bit_length(); | |
cc61c64b | 398 | let lsb = bits - T::SIG_BITS as usize; |
e9174d1e | 399 | let q = num::get_bits(&x, lsb, bits); |
cc61c64b | 400 | let k = T::MIN_EXP_INT + lsb as i16; |
e9174d1e SL |
401 | let z = rawfp::encode_normal(Unpacked::new(q, k)); |
402 | let q_even = q % 2 == 0; | |
403 | match num::compare_with_half_ulp(&x, lsb) { | |
404 | Greater => next_float(z), | |
405 | Less => z, | |
406 | Equal if rem.is_zero() && q_even => z, | |
407 | Equal => next_float(z), | |
408 | } | |
409 | } | |
410 | ||
411 | /// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division. | |
412 | fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T { | |
413 | let mut v_minus_r = v; | |
414 | v_minus_r.sub(&r); | |
415 | if r < v_minus_r { | |
416 | z | |
417 | } else if r > v_minus_r { | |
418 | next_float(z) | |
419 | } else if q % 2 == 0 { | |
420 | z | |
421 | } else { | |
422 | next_float(z) | |
423 | } | |
424 | } |