]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | #![allow(missing_docs)] |
2 | #![allow(deprecated)] // Float | |
3 | ||
4 | use std::cmp::Ordering::{self, Equal, Greater, Less}; | |
5 | use std::mem; | |
6 | ||
7 | #[cfg(test)] | |
8 | mod tests; | |
9 | ||
10 | fn local_cmp(x: f64, y: f64) -> Ordering { | |
11 | // arbitrarily decide that NaNs are larger than everything. | |
12 | if y.is_nan() { | |
13 | Less | |
14 | } else if x.is_nan() { | |
15 | Greater | |
16 | } else if x < y { | |
17 | Less | |
18 | } else if x == y { | |
19 | Equal | |
20 | } else { | |
21 | Greater | |
22 | } | |
23 | } | |
24 | ||
25 | fn local_sort(v: &mut [f64]) { | |
26 | v.sort_by(|x: &f64, y: &f64| local_cmp(*x, *y)); | |
27 | } | |
28 | ||
29 | /// Trait that provides simple descriptive statistics on a univariate set of numeric samples. | |
30 | pub trait Stats { | |
31 | /// Sum of the samples. | |
32 | /// | |
33 | /// Note: this method sacrifices performance at the altar of accuracy | |
34 | /// Depends on IEEE-754 arithmetic guarantees. See proof of correctness at: | |
35 | /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric | |
36 | /// Predicates"][paper] | |
37 | /// | |
38 | /// [paper]: http://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps | |
39 | fn sum(&self) -> f64; | |
40 | ||
41 | /// Minimum value of the samples. | |
42 | fn min(&self) -> f64; | |
43 | ||
44 | /// Maximum value of the samples. | |
45 | fn max(&self) -> f64; | |
46 | ||
47 | /// Arithmetic mean (average) of the samples: sum divided by sample-count. | |
48 | /// | |
49 | /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean> | |
50 | fn mean(&self) -> f64; | |
51 | ||
52 | /// Median of the samples: value separating the lower half of the samples from the higher half. | |
53 | /// Equal to `self.percentile(50.0)`. | |
54 | /// | |
55 | /// See: <https://en.wikipedia.org/wiki/Median> | |
56 | fn median(&self) -> f64; | |
57 | ||
58 | /// Variance of the samples: bias-corrected mean of the squares of the differences of each | |
59 | /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the | |
60 | /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n` | |
61 | /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather | |
62 | /// than `n`. | |
63 | /// | |
64 | /// See: <https://en.wikipedia.org/wiki/Variance> | |
65 | fn var(&self) -> f64; | |
66 | ||
67 | /// Standard deviation: the square root of the sample variance. | |
68 | /// | |
69 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the | |
70 | /// `median_abs_dev` for unknown distributions. | |
71 | /// | |
72 | /// See: <https://en.wikipedia.org/wiki/Standard_deviation> | |
73 | fn std_dev(&self) -> f64; | |
74 | ||
75 | /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`. | |
76 | /// | |
77 | /// Note: this is not a robust statistic for non-normal distributions. Prefer the | |
78 | /// `median_abs_dev_pct` for unknown distributions. | |
79 | fn std_dev_pct(&self) -> f64; | |
80 | ||
81 | /// Scaled median of the absolute deviations of each sample from the sample median. This is a | |
82 | /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to | |
83 | /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled | |
84 | /// by the constant `1.4826` to allow its use as a consistent estimator for the standard | |
85 | /// deviation. | |
86 | /// | |
87 | /// See: <https://en.wikipedia.org/wiki/Median_absolute_deviation> | |
88 | fn median_abs_dev(&self) -> f64; | |
89 | ||
90 | /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`. | |
91 | fn median_abs_dev_pct(&self) -> f64; | |
92 | ||
93 | /// Percentile: the value below which `pct` percent of the values in `self` fall. For example, | |
94 | /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self` | |
95 | /// satisfy `s <= v`. | |
96 | /// | |
97 | /// Calculated by linear interpolation between closest ranks. | |
98 | /// | |
99 | /// See: <https://en.wikipedia.org/wiki/Percentile> | |
100 | fn percentile(&self, pct: f64) -> f64; | |
101 | ||
102 | /// Quartiles of the sample: three values that divide the sample into four equal groups, each | |
103 | /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This | |
104 | /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but | |
105 | /// is otherwise equivalent. | |
106 | /// | |
107 | /// See also: <https://en.wikipedia.org/wiki/Quartile> | |
108 | fn quartiles(&self) -> (f64, f64, f64); | |
109 | ||
110 | /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th | |
111 | /// percentile (3rd quartile). See `quartiles`. | |
112 | /// | |
113 | /// See also: <https://en.wikipedia.org/wiki/Interquartile_range> | |
114 | fn iqr(&self) -> f64; | |
115 | } | |
116 | ||
117 | /// Extracted collection of all the summary statistics of a sample set. | |
118 | #[derive(Debug, Clone, PartialEq, Copy)] | |
119 | #[allow(missing_docs)] | |
120 | pub struct Summary { | |
121 | pub sum: f64, | |
122 | pub min: f64, | |
123 | pub max: f64, | |
124 | pub mean: f64, | |
125 | pub median: f64, | |
126 | pub var: f64, | |
127 | pub std_dev: f64, | |
128 | pub std_dev_pct: f64, | |
129 | pub median_abs_dev: f64, | |
130 | pub median_abs_dev_pct: f64, | |
131 | pub quartiles: (f64, f64, f64), | |
132 | pub iqr: f64, | |
133 | } | |
134 | ||
135 | impl Summary { | |
136 | /// Construct a new summary of a sample set. | |
137 | pub fn new(samples: &[f64]) -> Summary { | |
138 | Summary { | |
139 | sum: samples.sum(), | |
140 | min: samples.min(), | |
141 | max: samples.max(), | |
142 | mean: samples.mean(), | |
143 | median: samples.median(), | |
144 | var: samples.var(), | |
145 | std_dev: samples.std_dev(), | |
146 | std_dev_pct: samples.std_dev_pct(), | |
147 | median_abs_dev: samples.median_abs_dev(), | |
148 | median_abs_dev_pct: samples.median_abs_dev_pct(), | |
149 | quartiles: samples.quartiles(), | |
150 | iqr: samples.iqr(), | |
151 | } | |
152 | } | |
153 | } | |
154 | ||
155 | impl Stats for [f64] { | |
156 | // FIXME #11059 handle NaN, inf and overflow | |
157 | fn sum(&self) -> f64 { | |
158 | let mut partials = vec![]; | |
159 | ||
160 | for &x in self { | |
161 | let mut x = x; | |
162 | let mut j = 0; | |
163 | // This inner loop applies `hi`/`lo` summation to each | |
164 | // partial so that the list of partial sums remains exact. | |
165 | for i in 0..partials.len() { | |
166 | let mut y: f64 = partials[i]; | |
167 | if x.abs() < y.abs() { | |
168 | mem::swap(&mut x, &mut y); | |
169 | } | |
170 | // Rounded `x+y` is stored in `hi` with round-off stored in | |
171 | // `lo`. Together `hi+lo` are exactly equal to `x+y`. | |
172 | let hi = x + y; | |
173 | let lo = y - (hi - x); | |
174 | if lo != 0.0 { | |
175 | partials[j] = lo; | |
176 | j += 1; | |
177 | } | |
178 | x = hi; | |
179 | } | |
180 | if j >= partials.len() { | |
181 | partials.push(x); | |
182 | } else { | |
183 | partials[j] = x; | |
184 | partials.truncate(j + 1); | |
185 | } | |
186 | } | |
187 | let zero: f64 = 0.0; | |
188 | partials.iter().fold(zero, |p, q| p + *q) | |
189 | } | |
190 | ||
191 | fn min(&self) -> f64 { | |
192 | assert!(!self.is_empty()); | |
193 | self.iter().fold(self[0], |p, q| p.min(*q)) | |
194 | } | |
195 | ||
196 | fn max(&self) -> f64 { | |
197 | assert!(!self.is_empty()); | |
198 | self.iter().fold(self[0], |p, q| p.max(*q)) | |
199 | } | |
200 | ||
201 | fn mean(&self) -> f64 { | |
202 | assert!(!self.is_empty()); | |
203 | self.sum() / (self.len() as f64) | |
204 | } | |
205 | ||
206 | fn median(&self) -> f64 { | |
207 | self.percentile(50_f64) | |
208 | } | |
209 | ||
210 | fn var(&self) -> f64 { | |
211 | if self.len() < 2 { | |
212 | 0.0 | |
213 | } else { | |
214 | let mean = self.mean(); | |
215 | let mut v: f64 = 0.0; | |
216 | for s in self { | |
217 | let x = *s - mean; | |
218 | v += x * x; | |
219 | } | |
220 | // N.B., this is _supposed to be_ len-1, not len. If you | |
221 | // change it back to len, you will be calculating a | |
222 | // population variance, not a sample variance. | |
223 | let denom = (self.len() - 1) as f64; | |
224 | v / denom | |
225 | } | |
226 | } | |
227 | ||
228 | fn std_dev(&self) -> f64 { | |
229 | self.var().sqrt() | |
230 | } | |
231 | ||
232 | fn std_dev_pct(&self) -> f64 { | |
233 | let hundred = 100_f64; | |
234 | (self.std_dev() / self.mean()) * hundred | |
235 | } | |
236 | ||
237 | fn median_abs_dev(&self) -> f64 { | |
238 | let med = self.median(); | |
239 | let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect(); | |
240 | // This constant is derived by smarter statistics brains than me, but it is | |
241 | // consistent with how R and other packages treat the MAD. | |
242 | let number = 1.4826; | |
243 | abs_devs.median() * number | |
244 | } | |
245 | ||
246 | fn median_abs_dev_pct(&self) -> f64 { | |
247 | let hundred = 100_f64; | |
248 | (self.median_abs_dev() / self.median()) * hundred | |
249 | } | |
250 | ||
251 | fn percentile(&self, pct: f64) -> f64 { | |
252 | let mut tmp = self.to_vec(); | |
253 | local_sort(&mut tmp); | |
254 | percentile_of_sorted(&tmp, pct) | |
255 | } | |
256 | ||
257 | fn quartiles(&self) -> (f64, f64, f64) { | |
258 | let mut tmp = self.to_vec(); | |
259 | local_sort(&mut tmp); | |
260 | let first = 25_f64; | |
261 | let a = percentile_of_sorted(&tmp, first); | |
262 | let second = 50_f64; | |
263 | let b = percentile_of_sorted(&tmp, second); | |
264 | let third = 75_f64; | |
265 | let c = percentile_of_sorted(&tmp, third); | |
266 | (a, b, c) | |
267 | } | |
268 | ||
269 | fn iqr(&self) -> f64 { | |
270 | let (a, _, c) = self.quartiles(); | |
271 | c - a | |
272 | } | |
273 | } | |
274 | ||
275 | // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using | |
276 | // linear interpolation. If samples are not sorted, return nonsensical value. | |
277 | fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 { | |
278 | assert!(!sorted_samples.is_empty()); | |
279 | if sorted_samples.len() == 1 { | |
280 | return sorted_samples[0]; | |
281 | } | |
282 | let zero: f64 = 0.0; | |
283 | assert!(zero <= pct); | |
284 | let hundred = 100_f64; | |
285 | assert!(pct <= hundred); | |
286 | if pct == hundred { | |
287 | return sorted_samples[sorted_samples.len() - 1]; | |
288 | } | |
289 | let length = (sorted_samples.len() - 1) as f64; | |
290 | let rank = (pct / hundred) * length; | |
291 | let lrank = rank.floor(); | |
292 | let d = rank - lrank; | |
293 | let n = lrank as usize; | |
294 | let lo = sorted_samples[n]; | |
295 | let hi = sorted_samples[n + 1]; | |
296 | lo + (hi - lo) * d | |
297 | } | |
298 | ||
299 | /// Winsorize a set of samples, replacing values above the `100-pct` percentile | |
300 | /// and below the `pct` percentile with those percentiles themselves. This is a | |
301 | /// way of minimizing the effect of outliers, at the cost of biasing the sample. | |
302 | /// It differs from trimming in that it does not change the number of samples, | |
303 | /// just changes the values of those that are outliers. | |
304 | /// | |
305 | /// See: <https://en.wikipedia.org/wiki/Winsorising> | |
306 | pub fn winsorize(samples: &mut [f64], pct: f64) { | |
307 | let mut tmp = samples.to_vec(); | |
308 | local_sort(&mut tmp); | |
309 | let lo = percentile_of_sorted(&tmp, pct); | |
310 | let hundred = 100_f64; | |
311 | let hi = percentile_of_sorted(&tmp, hundred - pct); | |
312 | for samp in samples { | |
313 | if *samp > hi { | |
314 | *samp = hi | |
315 | } else if *samp < lo { | |
316 | *samp = lo | |
317 | } | |
318 | } | |
319 | } |