src/libtest/stats.rs

   1 #![allow(missing_docs)]
   2 #![allow(deprecated)] // Float
   3
   4 use std::cmp::Ordering::{self, Equal, Greater, Less};
   5 use std::mem;
   6
   7 fn local_cmp(x: f64, y: f64) -> Ordering {
   8     // arbitrarily decide that NaNs are larger than everything.
   9     if y.is_nan() {
  10         Less
  11     } else if x.is_nan() {
  12         Greater
  13     } else if x < y {
  14         Less
  15     } else if x == y {
  16         Equal
  17     } else {
  18         Greater
  19     }
  20 }
  21
  22 fn local_sort(v: &mut [f64]) {
  23     v.sort_by(|x: &f64, y: &f64| local_cmp(*x, *y));
  24 }
  25
  26 /// Trait that provides simple descriptive statistics on a univariate set of numeric samples.
  27 pub trait Stats {
  28     /// Sum of the samples.
  29     ///
  30     /// Note: this method sacrifices performance at the altar of accuracy
  31     /// Depends on IEEE-754 arithmetic guarantees. See proof of correctness at:
  32     /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric
  33     /// Predicates"][paper]
  34     ///
  35     /// [paper]: http://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps
  36     fn sum(&self) -> f64;
  37
  38     /// Minimum value of the samples.
  39     fn min(&self) -> f64;
  40
  41     /// Maximum value of the samples.
  42     fn max(&self) -> f64;
  43
  44     /// Arithmetic mean (average) of the samples: sum divided by sample-count.
  45     ///
  46     /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean>
  47     fn mean(&self) -> f64;
  48
  49     /// Median of the samples: value separating the lower half of the samples from the higher half.
  50     /// Equal to `self.percentile(50.0)`.
  51     ///
  52     /// See: <https://en.wikipedia.org/wiki/Median>
  53     fn median(&self) -> f64;
  54
  55     /// Variance of the samples: bias-corrected mean of the squares of the differences of each
  56     /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the
  57     /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n`
  58     /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather
  59     /// than `n`.
  60     ///
  61     /// See: <https://en.wikipedia.org/wiki/Variance>
  62     fn var(&self) -> f64;
  63
  64     /// Standard deviation: the square root of the sample variance.
  65     ///
  66     /// Note: this is not a robust statistic for non-normal distributions. Prefer the
  67     /// `median_abs_dev` for unknown distributions.
  68     ///
  69     /// See: <https://en.wikipedia.org/wiki/Standard_deviation>
  70     fn std_dev(&self) -> f64;
  71
  72     /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`.
  73     ///
  74     /// Note: this is not a robust statistic for non-normal distributions. Prefer the
  75     /// `median_abs_dev_pct` for unknown distributions.
  76     fn std_dev_pct(&self) -> f64;
  77
  78     /// Scaled median of the absolute deviations of each sample from the sample median. This is a
  79     /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to
  80     /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled
  81     /// by the constant `1.4826` to allow its use as a consistent estimator for the standard
  82     /// deviation.
  83     ///
  84     /// See: <http://en.wikipedia.org/wiki/Median_absolute_deviation>
  85     fn median_abs_dev(&self) -> f64;
  86
  87     /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`.
  88     fn median_abs_dev_pct(&self) -> f64;
  89
  90     /// Percentile: the value below which `pct` percent of the values in `self` fall. For example,
  91     /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self`
  92     /// satisfy `s <= v`.
  93     ///
  94     /// Calculated by linear interpolation between closest ranks.
  95     ///
  96     /// See: <http://en.wikipedia.org/wiki/Percentile>
  97     fn percentile(&self, pct: f64) -> f64;
  98
  99     /// Quartiles of the sample: three values that divide the sample into four equal groups, each
 100     /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This
 101     /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but
 102     /// is otherwise equivalent.
 103     ///
 104     /// See also: <https://en.wikipedia.org/wiki/Quartile>
 105     fn quartiles(&self) -> (f64, f64, f64);
 106
 107     /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th
 108     /// percentile (3rd quartile). See `quartiles`.
 109     ///
 110     /// See also: <https://en.wikipedia.org/wiki/Interquartile_range>
 111     fn iqr(&self) -> f64;
 112 }
 113
 114 /// Extracted collection of all the summary statistics of a sample set.
 115 #[derive(Clone, PartialEq, Copy)]
 116 #[allow(missing_docs)]
 117 pub struct Summary {
 118     pub sum: f64,
 119     pub min: f64,
 120     pub max: f64,
 121     pub mean: f64,
 122     pub median: f64,
 123     pub var: f64,
 124     pub std_dev: f64,
 125     pub std_dev_pct: f64,
 126     pub median_abs_dev: f64,
 127     pub median_abs_dev_pct: f64,
 128     pub quartiles: (f64, f64, f64),
 129     pub iqr: f64,
 130 }
 131
 132 impl Summary {
 133     /// Construct a new summary of a sample set.
 134     pub fn new(samples: &[f64]) -> Summary {
 135         Summary {
 136             sum: samples.sum(),
 137             min: samples.min(),
 138             max: samples.max(),
 139             mean: samples.mean(),
 140             median: samples.median(),
 141             var: samples.var(),
 142             std_dev: samples.std_dev(),
 143             std_dev_pct: samples.std_dev_pct(),
 144             median_abs_dev: samples.median_abs_dev(),
 145             median_abs_dev_pct: samples.median_abs_dev_pct(),
 146             quartiles: samples.quartiles(),
 147             iqr: samples.iqr(),
 148         }
 149     }
 150 }
 151
 152 impl Stats for [f64] {
 153     // FIXME #11059 handle NaN, inf and overflow
 154     fn sum(&self) -> f64 {
 155         let mut partials = vec![];
 156
 157         for &x in self {
 158             let mut x = x;
 159             let mut j = 0;
 160             // This inner loop applies `hi`/`lo` summation to each
 161             // partial so that the list of partial sums remains exact.
 162             for i in 0..partials.len() {
 163                 let mut y: f64 = partials[i];
 164                 if x.abs() < y.abs() {
 165                     mem::swap(&mut x, &mut y);
 166                 }
 167                 // Rounded `x+y` is stored in `hi` with round-off stored in
 168                 // `lo`. Together `hi+lo` are exactly equal to `x+y`.
 169                 let hi = x + y;
 170                 let lo = y - (hi - x);
 171                 if lo != 0.0 {
 172                     partials[j] = lo;
 173                     j += 1;
 174                 }
 175                 x = hi;
 176             }
 177             if j >= partials.len() {
 178                 partials.push(x);
 179             } else {
 180                 partials[j] = x;
 181                 partials.truncate(j + 1);
 182             }
 183         }
 184         let zero: f64 = 0.0;
 185         partials.iter().fold(zero, |p, q| p + *q)
 186     }
 187
 188     fn min(&self) -> f64 {
 189         assert!(!self.is_empty());
 190         self.iter().fold(self[0], |p, q| p.min(*q))
 191     }
 192
 193     fn max(&self) -> f64 {
 194         assert!(!self.is_empty());
 195         self.iter().fold(self[0], |p, q| p.max(*q))
 196     }
 197
 198     fn mean(&self) -> f64 {
 199         assert!(!self.is_empty());
 200         self.sum() / (self.len() as f64)
 201     }
 202
 203     fn median(&self) -> f64 {
 204         self.percentile(50 as f64)
 205     }
 206
 207     fn var(&self) -> f64 {
 208         if self.len() < 2 {
 209             0.0
 210         } else {
 211             let mean = self.mean();
 212             let mut v: f64 = 0.0;
 213             for s in self {
 214                 let x = *s - mean;
 215                 v = v + x * x;
 216             }
 217             // N.B., this is _supposed to be_ len-1, not len. If you
 218             // change it back to len, you will be calculating a
 219             // population variance, not a sample variance.
 220             let denom = (self.len() - 1) as f64;
 221             v / denom
 222         }
 223     }
 224
 225     fn std_dev(&self) -> f64 {
 226         self.var().sqrt()
 227     }
 228
 229     fn std_dev_pct(&self) -> f64 {
 230         let hundred = 100 as f64;
 231         (self.std_dev() / self.mean()) * hundred
 232     }
 233
 234     fn median_abs_dev(&self) -> f64 {
 235         let med = self.median();
 236         let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect();
 237         // This constant is derived by smarter statistics brains than me, but it is
 238         // consistent with how R and other packages treat the MAD.
 239         let number = 1.4826;
 240         abs_devs.median() * number
 241     }
 242
 243     fn median_abs_dev_pct(&self) -> f64 {
 244         let hundred = 100 as f64;
 245         (self.median_abs_dev() / self.median()) * hundred
 246     }
 247
 248     fn percentile(&self, pct: f64) -> f64 {
 249         let mut tmp = self.to_vec();
 250         local_sort(&mut tmp);
 251         percentile_of_sorted(&tmp, pct)
 252     }
 253
 254     fn quartiles(&self) -> (f64, f64, f64) {
 255         let mut tmp = self.to_vec();
 256         local_sort(&mut tmp);
 257         let first = 25f64;
 258         let a = percentile_of_sorted(&tmp, first);
 259         let second = 50f64;
 260         let b = percentile_of_sorted(&tmp, second);
 261         let third = 75f64;
 262         let c = percentile_of_sorted(&tmp, third);
 263         (a, b, c)
 264     }
 265
 266     fn iqr(&self) -> f64 {
 267         let (a, _, c) = self.quartiles();
 268         c - a
 269     }
 270 }
 271
 272 // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using
 273 // linear interpolation. If samples are not sorted, return nonsensical value.
 274 fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 {
 275     assert!(!sorted_samples.is_empty());
 276     if sorted_samples.len() == 1 {
 277         return sorted_samples[0];
 278     }
 279     let zero: f64 = 0.0;
 280     assert!(zero <= pct);
 281     let hundred = 100f64;
 282     assert!(pct <= hundred);
 283     if pct == hundred {
 284         return sorted_samples[sorted_samples.len() - 1];
 285     }
 286     let length = (sorted_samples.len() - 1) as f64;
 287     let rank = (pct / hundred) * length;
 288     let lrank = rank.floor();
 289     let d = rank - lrank;
 290     let n = lrank as usize;
 291     let lo = sorted_samples[n];
 292     let hi = sorted_samples[n + 1];
 293     lo + (hi - lo) * d
 294 }
 295
 296 /// Winsorize a set of samples, replacing values above the `100-pct` percentile
 297 /// and below the `pct` percentile with those percentiles themselves. This is a
 298 /// way of minimizing the effect of outliers, at the cost of biasing the sample.
 299 /// It differs from trimming in that it does not change the number of samples,
 300 /// just changes the values of those that are outliers.
 301 ///
 302 /// See: <http://en.wikipedia.org/wiki/Winsorising>
 303 pub fn winsorize(samples: &mut [f64], pct: f64) {
 304     let mut tmp = samples.to_vec();
 305     local_sort(&mut tmp);
 306     let lo = percentile_of_sorted(&tmp, pct);
 307     let hundred = 100 as f64;
 308     let hi = percentile_of_sorted(&tmp, hundred - pct);
 309     for samp in samples {
 310         if *samp > hi {
 311             *samp = hi
 312         } else if *samp < lo {
 313             *samp = lo
 314         }
 315     }
 316 }
 317
 318 // Test vectors generated from R, using the script src/etc/stat-test-vectors.r.
 319
 320 #[cfg(test)]
 321 mod tests;
 322
 323 #[cfg(test)]
 324 mod bench {
 325     extern crate test;
 326     use self::test::Bencher;
 327     use crate::stats::Stats;
 328
 329     #[bench]
 330     pub fn sum_three_items(b: &mut Bencher) {
 331         b.iter(|| {
 332             [1e20f64, 1.5f64, -1e20f64].sum();
 333         })
 334     }
 335     #[bench]
 336     pub fn sum_many_f64(b: &mut Bencher) {
 337         let nums = [-1e30f64, 1e60, 1e30, 1.0, -1e60];
 338         let v = (0..500).map(|i| nums[i % 5]).collect::<Vec<_>>();
 339
 340         b.iter(|| {
 341             v.sum();
 342         })
 343     }
 344
 345     #[bench]
 346     pub fn no_iter(_: &mut Bencher) {}
 347 }