src/libtest/stats.rs

   1 #![allow(missing_docs)]
   2 #![allow(deprecated)] // Float
   3
   4 use std::cmp::Ordering::{self, Equal, Greater, Less};
   5 use std::mem;
   6
   7 #[cfg(test)]
   8 mod tests;
   9
  10 fn local_cmp(x: f64, y: f64) -> Ordering {
  11     // arbitrarily decide that NaNs are larger than everything.
  12     if y.is_nan() {
  13         Less
  14     } else if x.is_nan() {
  15         Greater
  16     } else if x < y {
  17         Less
  18     } else if x == y {
  19         Equal
  20     } else {
  21         Greater
  22     }
  23 }
  24
  25 fn local_sort(v: &mut [f64]) {
  26     v.sort_by(|x: &f64, y: &f64| local_cmp(*x, *y));
  27 }
  28
  29 /// Trait that provides simple descriptive statistics on a univariate set of numeric samples.
  30 pub trait Stats {
  31     /// Sum of the samples.
  32     ///
  33     /// Note: this method sacrifices performance at the altar of accuracy
  34     /// Depends on IEEE-754 arithmetic guarantees. See proof of correctness at:
  35     /// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric
  36     /// Predicates"][paper]
  37     ///
  38     /// [paper]: http://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps
  39     fn sum(&self) -> f64;
  40
  41     /// Minimum value of the samples.
  42     fn min(&self) -> f64;
  43
  44     /// Maximum value of the samples.
  45     fn max(&self) -> f64;
  46
  47     /// Arithmetic mean (average) of the samples: sum divided by sample-count.
  48     ///
  49     /// See: <https://en.wikipedia.org/wiki/Arithmetic_mean>
  50     fn mean(&self) -> f64;
  51
  52     /// Median of the samples: value separating the lower half of the samples from the higher half.
  53     /// Equal to `self.percentile(50.0)`.
  54     ///
  55     /// See: <https://en.wikipedia.org/wiki/Median>
  56     fn median(&self) -> f64;
  57
  58     /// Variance of the samples: bias-corrected mean of the squares of the differences of each
  59     /// sample from the sample mean. Note that this calculates the _sample variance_ rather than the
  60     /// population variance, which is assumed to be unknown. It therefore corrects the `(n-1)/n`
  61     /// bias that would appear if we calculated a population variance, by dividing by `(n-1)` rather
  62     /// than `n`.
  63     ///
  64     /// See: <https://en.wikipedia.org/wiki/Variance>
  65     fn var(&self) -> f64;
  66
  67     /// Standard deviation: the square root of the sample variance.
  68     ///
  69     /// Note: this is not a robust statistic for non-normal distributions. Prefer the
  70     /// `median_abs_dev` for unknown distributions.
  71     ///
  72     /// See: <https://en.wikipedia.org/wiki/Standard_deviation>
  73     fn std_dev(&self) -> f64;
  74
  75     /// Standard deviation as a percent of the mean value. See `std_dev` and `mean`.
  76     ///
  77     /// Note: this is not a robust statistic for non-normal distributions. Prefer the
  78     /// `median_abs_dev_pct` for unknown distributions.
  79     fn std_dev_pct(&self) -> f64;
  80
  81     /// Scaled median of the absolute deviations of each sample from the sample median. This is a
  82     /// robust (distribution-agnostic) estimator of sample variability. Use this in preference to
  83     /// `std_dev` if you cannot assume your sample is normally distributed. Note that this is scaled
  84     /// by the constant `1.4826` to allow its use as a consistent estimator for the standard
  85     /// deviation.
  86     ///
  87     /// See: <http://en.wikipedia.org/wiki/Median_absolute_deviation>
  88     fn median_abs_dev(&self) -> f64;
  89
  90     /// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`.
  91     fn median_abs_dev_pct(&self) -> f64;
  92
  93     /// Percentile: the value below which `pct` percent of the values in `self` fall. For example,
  94     /// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self`
  95     /// satisfy `s <= v`.
  96     ///
  97     /// Calculated by linear interpolation between closest ranks.
  98     ///
  99     /// See: <http://en.wikipedia.org/wiki/Percentile>
 100     fn percentile(&self, pct: f64) -> f64;
 101
 102     /// Quartiles of the sample: three values that divide the sample into four equal groups, each
 103     /// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This
 104     /// function may calculate the 3 quartiles more efficiently than 3 calls to `percentile`, but
 105     /// is otherwise equivalent.
 106     ///
 107     /// See also: <https://en.wikipedia.org/wiki/Quartile>
 108     fn quartiles(&self) -> (f64, f64, f64);
 109
 110     /// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th
 111     /// percentile (3rd quartile). See `quartiles`.
 112     ///
 113     /// See also: <https://en.wikipedia.org/wiki/Interquartile_range>
 114     fn iqr(&self) -> f64;
 115 }
 116
 117 /// Extracted collection of all the summary statistics of a sample set.
 118 #[derive(Debug, Clone, PartialEq, Copy)]
 119 #[allow(missing_docs)]
 120 pub struct Summary {
 121     pub sum: f64,
 122     pub min: f64,
 123     pub max: f64,
 124     pub mean: f64,
 125     pub median: f64,
 126     pub var: f64,
 127     pub std_dev: f64,
 128     pub std_dev_pct: f64,
 129     pub median_abs_dev: f64,
 130     pub median_abs_dev_pct: f64,
 131     pub quartiles: (f64, f64, f64),
 132     pub iqr: f64,
 133 }
 134
 135 impl Summary {
 136     /// Construct a new summary of a sample set.
 137     pub fn new(samples: &[f64]) -> Summary {
 138         Summary {
 139             sum: samples.sum(),
 140             min: samples.min(),
 141             max: samples.max(),
 142             mean: samples.mean(),
 143             median: samples.median(),
 144             var: samples.var(),
 145             std_dev: samples.std_dev(),
 146             std_dev_pct: samples.std_dev_pct(),
 147             median_abs_dev: samples.median_abs_dev(),
 148             median_abs_dev_pct: samples.median_abs_dev_pct(),
 149             quartiles: samples.quartiles(),
 150             iqr: samples.iqr(),
 151         }
 152     }
 153 }
 154
 155 impl Stats for [f64] {
 156     // FIXME #11059 handle NaN, inf and overflow
 157     fn sum(&self) -> f64 {
 158         let mut partials = vec![];
 159
 160         for &x in self {
 161             let mut x = x;
 162             let mut j = 0;
 163             // This inner loop applies `hi`/`lo` summation to each
 164             // partial so that the list of partial sums remains exact.
 165             for i in 0..partials.len() {
 166                 let mut y: f64 = partials[i];
 167                 if x.abs() < y.abs() {
 168                     mem::swap(&mut x, &mut y);
 169                 }
 170                 // Rounded `x+y` is stored in `hi` with round-off stored in
 171                 // `lo`. Together `hi+lo` are exactly equal to `x+y`.
 172                 let hi = x + y;
 173                 let lo = y - (hi - x);
 174                 if lo != 0.0 {
 175                     partials[j] = lo;
 176                     j += 1;
 177                 }
 178                 x = hi;
 179             }
 180             if j >= partials.len() {
 181                 partials.push(x);
 182             } else {
 183                 partials[j] = x;
 184                 partials.truncate(j + 1);
 185             }
 186         }
 187         let zero: f64 = 0.0;
 188         partials.iter().fold(zero, |p, q| p + *q)
 189     }
 190
 191     fn min(&self) -> f64 {
 192         assert!(!self.is_empty());
 193         self.iter().fold(self[0], |p, q| p.min(*q))
 194     }
 195
 196     fn max(&self) -> f64 {
 197         assert!(!self.is_empty());
 198         self.iter().fold(self[0], |p, q| p.max(*q))
 199     }
 200
 201     fn mean(&self) -> f64 {
 202         assert!(!self.is_empty());
 203         self.sum() / (self.len() as f64)
 204     }
 205
 206     fn median(&self) -> f64 {
 207         self.percentile(50_f64)
 208     }
 209
 210     fn var(&self) -> f64 {
 211         if self.len() < 2 {
 212             0.0
 213         } else {
 214             let mean = self.mean();
 215             let mut v: f64 = 0.0;
 216             for s in self {
 217                 let x = *s - mean;
 218                 v = v + x * x;
 219             }
 220             // N.B., this is _supposed to be_ len-1, not len. If you
 221             // change it back to len, you will be calculating a
 222             // population variance, not a sample variance.
 223             let denom = (self.len() - 1) as f64;
 224             v / denom
 225         }
 226     }
 227
 228     fn std_dev(&self) -> f64 {
 229         self.var().sqrt()
 230     }
 231
 232     fn std_dev_pct(&self) -> f64 {
 233         let hundred = 100_f64;
 234         (self.std_dev() / self.mean()) * hundred
 235     }
 236
 237     fn median_abs_dev(&self) -> f64 {
 238         let med = self.median();
 239         let abs_devs: Vec<f64> = self.iter().map(|&v| (med - v).abs()).collect();
 240         // This constant is derived by smarter statistics brains than me, but it is
 241         // consistent with how R and other packages treat the MAD.
 242         let number = 1.4826;
 243         abs_devs.median() * number
 244     }
 245
 246     fn median_abs_dev_pct(&self) -> f64 {
 247         let hundred = 100_f64;
 248         (self.median_abs_dev() / self.median()) * hundred
 249     }
 250
 251     fn percentile(&self, pct: f64) -> f64 {
 252         let mut tmp = self.to_vec();
 253         local_sort(&mut tmp);
 254         percentile_of_sorted(&tmp, pct)
 255     }
 256
 257     fn quartiles(&self) -> (f64, f64, f64) {
 258         let mut tmp = self.to_vec();
 259         local_sort(&mut tmp);
 260         let first = 25_f64;
 261         let a = percentile_of_sorted(&tmp, first);
 262         let second = 50_f64;
 263         let b = percentile_of_sorted(&tmp, second);
 264         let third = 75_f64;
 265         let c = percentile_of_sorted(&tmp, third);
 266         (a, b, c)
 267     }
 268
 269     fn iqr(&self) -> f64 {
 270         let (a, _, c) = self.quartiles();
 271         c - a
 272     }
 273 }
 274
 275 // Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using
 276 // linear interpolation. If samples are not sorted, return nonsensical value.
 277 fn percentile_of_sorted(sorted_samples: &[f64], pct: f64) -> f64 {
 278     assert!(!sorted_samples.is_empty());
 279     if sorted_samples.len() == 1 {
 280         return sorted_samples[0];
 281     }
 282     let zero: f64 = 0.0;
 283     assert!(zero <= pct);
 284     let hundred = 100_f64;
 285     assert!(pct <= hundred);
 286     if pct == hundred {
 287         return sorted_samples[sorted_samples.len() - 1];
 288     }
 289     let length = (sorted_samples.len() - 1) as f64;
 290     let rank = (pct / hundred) * length;
 291     let lrank = rank.floor();
 292     let d = rank - lrank;
 293     let n = lrank as usize;
 294     let lo = sorted_samples[n];
 295     let hi = sorted_samples[n + 1];
 296     lo + (hi - lo) * d
 297 }
 298
 299 /// Winsorize a set of samples, replacing values above the `100-pct` percentile
 300 /// and below the `pct` percentile with those percentiles themselves. This is a
 301 /// way of minimizing the effect of outliers, at the cost of biasing the sample.
 302 /// It differs from trimming in that it does not change the number of samples,
 303 /// just changes the values of those that are outliers.
 304 ///
 305 /// See: <http://en.wikipedia.org/wiki/Winsorising>
 306 pub fn winsorize(samples: &mut [f64], pct: f64) {
 307     let mut tmp = samples.to_vec();
 308     local_sort(&mut tmp);
 309     let lo = percentile_of_sorted(&tmp, pct);
 310     let hundred = 100_f64;
 311     let hi = percentile_of_sorted(&tmp, hundred - pct);
 312     for samp in samples {
 313         if *samp > hi {
 314             *samp = hi
 315         } else if *samp < lo {
 316             *samp = lo
 317         }
 318     }
 319 }