]>
Commit | Line | Data |
---|---|---|
f035d41b XL |
1 | // Adapted from https://github.com/Alexhuszagh/rust-lexical. |
2 | ||
3 | //! Estimate the error in an 80-bit approximation of a float. | |
4 | //! | |
5 | //! This estimates the error in a floating-point representation. | |
6 | //! | |
7 | //! This implementation is loosely based off the Golang implementation, | |
add651ee | 8 | //! found here: <https://golang.org/src/strconv/atof.go> |
f035d41b XL |
9 | |
10 | use super::float::*; | |
11 | use super::num::*; | |
12 | use super::rounding::*; | |
13 | ||
14 | pub(crate) trait FloatErrors { | |
15 | /// Get the full error scale. | |
16 | fn error_scale() -> u32; | |
17 | /// Get the half error scale. | |
18 | fn error_halfscale() -> u32; | |
19 | /// Determine if the number of errors is tolerable for float precision. | |
20 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool; | |
21 | } | |
22 | ||
23 | /// Check if the error is accurate with a round-nearest rounding scheme. | |
24 | #[inline] | |
25 | fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool { | |
26 | // Round-to-nearest, need to use the halfway point. | |
27 | if extrabits == 65 { | |
28 | // Underflow, we have a shift larger than the mantissa. | |
29 | // Representation is valid **only** if the value is close enough | |
30 | // overflow to the next bit within errors. If it overflows, | |
31 | // the representation is **not** valid. | |
32 | !fp.mant.overflowing_add(errors).1 | |
33 | } else { | |
34 | let mask: u64 = lower_n_mask(extrabits); | |
35 | let extra: u64 = fp.mant & mask; | |
36 | ||
37 | // Round-to-nearest, need to check if we're close to halfway. | |
38 | // IE, b10100 | 100000, where `|` signifies the truncation point. | |
39 | let halfway: u64 = lower_n_halfway(extrabits); | |
40 | let cmp1 = halfway.wrapping_sub(errors) < extra; | |
41 | let cmp2 = extra < halfway.wrapping_add(errors); | |
42 | ||
43 | // If both comparisons are true, we have significant rounding error, | |
44 | // and the value cannot be exactly represented. Otherwise, the | |
45 | // representation is valid. | |
46 | !(cmp1 && cmp2) | |
47 | } | |
48 | } | |
49 | ||
50 | impl FloatErrors for u64 { | |
51 | #[inline] | |
52 | fn error_scale() -> u32 { | |
53 | 8 | |
54 | } | |
55 | ||
56 | #[inline] | |
57 | fn error_halfscale() -> u32 { | |
58 | u64::error_scale() / 2 | |
59 | } | |
60 | ||
61 | #[inline] | |
62 | fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool { | |
63 | // Determine if extended-precision float is a good approximation. | |
64 | // If the error has affected too many units, the float will be | |
65 | // inaccurate, or if the representation is too close to halfway | |
66 | // that any operations could affect this halfway representation. | |
67 | // See the documentation for dtoa for more information. | |
68 | let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE); | |
69 | let denormal_exp = bias - 63; | |
70 | // This is always a valid u32, since (denormal_exp - fp.exp) | |
71 | // will always be positive and the significand size is {23, 52}. | |
72 | let extrabits = if fp.exp <= denormal_exp { | |
73 | 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp | |
74 | } else { | |
75 | 63 - F::MANTISSA_SIZE | |
76 | }; | |
77 | ||
78 | // Our logic is as follows: we want to determine if the actual | |
79 | // mantissa and the errors during calculation differ significantly | |
80 | // from the rounding point. The rounding point for round-nearest | |
81 | // is the halfway point, IE, this when the truncated bits start | |
82 | // with b1000..., while the rounding point for the round-toward | |
83 | // is when the truncated bits are equal to 0. | |
84 | // To do so, we can check whether the rounding point +/- the error | |
85 | // are >/< the actual lower n bits. | |
86 | // | |
87 | // For whether we need to use signed or unsigned types for this | |
88 | // analysis, see this example, using u8 rather than u64 to simplify | |
89 | // things. | |
90 | // | |
91 | // # Comparisons | |
92 | // cmp1 = (halfway - errors) < extra | |
93 | // cmp1 = extra < (halfway + errors) | |
94 | // | |
95 | // # Large Extrabits, Low Errors | |
96 | // | |
97 | // extrabits = 8 | |
98 | // halfway = 0b10000000 | |
99 | // extra = 0b10000010 | |
100 | // errors = 0b00000100 | |
101 | // halfway - errors = 0b01111100 | |
102 | // halfway + errors = 0b10000100 | |
103 | // | |
104 | // Unsigned: | |
105 | // halfway - errors = 124 | |
106 | // halfway + errors = 132 | |
107 | // extra = 130 | |
108 | // cmp1 = true | |
109 | // cmp2 = true | |
110 | // Signed: | |
111 | // halfway - errors = 124 | |
112 | // halfway + errors = -124 | |
113 | // extra = -126 | |
114 | // cmp1 = false | |
115 | // cmp2 = true | |
116 | // | |
117 | // # Conclusion | |
118 | // | |
119 | // Since errors will always be small, and since we want to detect | |
120 | // if the representation is accurate, we need to use an **unsigned** | |
121 | // type for comparisons. | |
122 | ||
123 | let extrabits = extrabits as u64; | |
124 | let errors = count as u64; | |
125 | if extrabits > 65 { | |
126 | // Underflow, we have a literal 0. | |
127 | return true; | |
128 | } | |
129 | ||
130 | nearest_error_is_accurate(errors, fp, extrabits) | |
131 | } | |
132 | } |