]>
Commit | Line | Data |
---|---|---|
92f5a8d4 TL |
1 | // (C) Copyright Nick Thompson 2018. |
2 | // Use, modification and distribution are subject to the | |
3 | // Boost Software License, Version 1.0. (See accompanying file | |
4 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
5 | ||
6 | #ifndef BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP | |
7 | #define BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP | |
8 | ||
9 | #include <algorithm> | |
10 | #include <iterator> | |
f67539c2 TL |
11 | #include <tuple> |
12 | #include <cmath> | |
20effc67 | 13 | #include <vector> |
92f5a8d4 TL |
14 | #include <boost/assert.hpp> |
15 | ||
16 | namespace boost::math::statistics { | |
17 | ||
18 | template<class ForwardIterator> | |
19 | auto mean(ForwardIterator first, ForwardIterator last) | |
20 | { | |
21 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
22 | BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean."); | |
23 | if constexpr (std::is_integral<Real>::value) | |
24 | { | |
25 | double mu = 0; | |
26 | double i = 1; | |
27 | for(auto it = first; it != last; ++it) { | |
28 | mu = mu + (*it - mu)/i; | |
29 | i += 1; | |
30 | } | |
31 | return mu; | |
32 | } | |
33 | else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>) | |
34 | { | |
35 | size_t elements = std::distance(first, last); | |
36 | Real mu0 = 0; | |
37 | Real mu1 = 0; | |
38 | Real mu2 = 0; | |
39 | Real mu3 = 0; | |
40 | Real i = 1; | |
41 | auto end = last - (elements % 4); | |
42 | for(auto it = first; it != end; it += 4) { | |
43 | Real inv = Real(1)/i; | |
44 | Real tmp0 = (*it - mu0); | |
45 | Real tmp1 = (*(it+1) - mu1); | |
46 | Real tmp2 = (*(it+2) - mu2); | |
47 | Real tmp3 = (*(it+3) - mu3); | |
48 | // please generate a vectorized fma here | |
49 | mu0 += tmp0*inv; | |
50 | mu1 += tmp1*inv; | |
51 | mu2 += tmp2*inv; | |
52 | mu3 += tmp3*inv; | |
53 | i += 1; | |
54 | } | |
55 | Real num1 = Real(elements - (elements %4))/Real(4); | |
56 | Real num2 = num1 + Real(elements % 4); | |
57 | ||
58 | for (auto it = end; it != last; ++it) | |
59 | { | |
60 | mu3 += (*it-mu3)/i; | |
61 | i += 1; | |
62 | } | |
63 | ||
64 | return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements); | |
65 | } | |
66 | else | |
67 | { | |
68 | auto it = first; | |
69 | Real mu = *it; | |
70 | Real i = 2; | |
71 | while(++it != last) | |
72 | { | |
73 | mu += (*it - mu)/i; | |
74 | i += 1; | |
75 | } | |
76 | return mu; | |
77 | } | |
78 | } | |
79 | ||
80 | template<class Container> | |
81 | inline auto mean(Container const & v) | |
82 | { | |
83 | return mean(v.cbegin(), v.cend()); | |
84 | } | |
85 | ||
86 | template<class ForwardIterator> | |
87 | auto variance(ForwardIterator first, ForwardIterator last) | |
88 | { | |
89 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
90 | BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance."); | |
91 | // Higham, Accuracy and Stability, equation 1.6a and 1.6b: | |
92 | if constexpr (std::is_integral<Real>::value) | |
93 | { | |
94 | double M = *first; | |
95 | double Q = 0; | |
96 | double k = 2; | |
97 | for (auto it = std::next(first); it != last; ++it) | |
98 | { | |
99 | double tmp = *it - M; | |
100 | Q = Q + ((k-1)*tmp*tmp)/k; | |
101 | M = M + tmp/k; | |
102 | k += 1; | |
103 | } | |
104 | return Q/(k-1); | |
105 | } | |
106 | else | |
107 | { | |
108 | Real M = *first; | |
109 | Real Q = 0; | |
110 | Real k = 2; | |
111 | for (auto it = std::next(first); it != last; ++it) | |
112 | { | |
113 | Real tmp = (*it - M)/k; | |
114 | Q += k*(k-1)*tmp*tmp; | |
115 | M += tmp; | |
116 | k += 1; | |
117 | } | |
118 | return Q/(k-1); | |
119 | } | |
120 | } | |
121 | ||
122 | template<class Container> | |
123 | inline auto variance(Container const & v) | |
124 | { | |
125 | return variance(v.cbegin(), v.cend()); | |
126 | } | |
127 | ||
128 | template<class ForwardIterator> | |
129 | auto sample_variance(ForwardIterator first, ForwardIterator last) | |
130 | { | |
131 | size_t n = std::distance(first, last); | |
132 | BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance."); | |
133 | return n*variance(first, last)/(n-1); | |
134 | } | |
135 | ||
136 | template<class Container> | |
137 | inline auto sample_variance(Container const & v) | |
138 | { | |
139 | return sample_variance(v.cbegin(), v.cend()); | |
140 | } | |
141 | ||
142 | template<class ForwardIterator> | |
143 | auto mean_and_sample_variance(ForwardIterator first, ForwardIterator last) | |
144 | { | |
145 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
146 | BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance."); | |
147 | // Higham, Accuracy and Stability, equation 1.6a and 1.6b: | |
148 | if constexpr (std::is_integral<Real>::value) | |
149 | { | |
150 | double M = *first; | |
151 | double Q = 0; | |
152 | double k = 2; | |
153 | for (auto it = std::next(first); it != last; ++it) | |
154 | { | |
155 | double tmp = *it - M; | |
156 | Q = Q + ((k-1)*tmp*tmp)/k; | |
157 | M = M + tmp/k; | |
158 | k += 1; | |
159 | } | |
160 | return std::pair<double, double>{M, Q/(k-2)}; | |
161 | } | |
162 | else | |
163 | { | |
164 | Real M = *first; | |
165 | Real Q = 0; | |
166 | Real k = 2; | |
167 | for (auto it = std::next(first); it != last; ++it) | |
168 | { | |
169 | Real tmp = (*it - M)/k; | |
170 | Q += k*(k-1)*tmp*tmp; | |
171 | M += tmp; | |
172 | k += 1; | |
173 | } | |
174 | return std::pair<Real, Real>{M, Q/(k-2)}; | |
175 | } | |
176 | } | |
177 | ||
178 | template<class Container> | |
179 | auto mean_and_sample_variance(Container const & v) | |
180 | { | |
181 | return mean_and_sample_variance(v.begin(), v.end()); | |
182 | } | |
183 | ||
184 | // Follows equation 1.5 of: | |
185 | // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf | |
186 | template<class ForwardIterator> | |
187 | auto skewness(ForwardIterator first, ForwardIterator last) | |
188 | { | |
189 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
f67539c2 | 190 | using std::sqrt; |
92f5a8d4 TL |
191 | BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness."); |
192 | if constexpr (std::is_integral<Real>::value) | |
193 | { | |
194 | double M1 = *first; | |
195 | double M2 = 0; | |
196 | double M3 = 0; | |
197 | double n = 2; | |
198 | for (auto it = std::next(first); it != last; ++it) | |
199 | { | |
200 | double delta21 = *it - M1; | |
201 | double tmp = delta21/n; | |
202 | M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); | |
203 | M2 = M2 + tmp*(n-1)*delta21; | |
204 | M1 = M1 + tmp; | |
205 | n += 1; | |
206 | } | |
207 | ||
208 | double var = M2/(n-1); | |
209 | if (var == 0) | |
210 | { | |
211 | // The limit is technically undefined, but the interpretation here is clear: | |
212 | // A constant dataset has no skewness. | |
213 | return double(0); | |
214 | } | |
215 | double skew = M3/(M2*sqrt(var)); | |
216 | return skew; | |
217 | } | |
218 | else | |
219 | { | |
220 | Real M1 = *first; | |
221 | Real M2 = 0; | |
222 | Real M3 = 0; | |
223 | Real n = 2; | |
224 | for (auto it = std::next(first); it != last; ++it) | |
225 | { | |
226 | Real delta21 = *it - M1; | |
227 | Real tmp = delta21/n; | |
228 | M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); | |
229 | M2 += tmp*(n-1)*delta21; | |
230 | M1 += tmp; | |
231 | n += 1; | |
232 | } | |
233 | ||
234 | Real var = M2/(n-1); | |
235 | if (var == 0) | |
236 | { | |
237 | // The limit is technically undefined, but the interpretation here is clear: | |
238 | // A constant dataset has no skewness. | |
239 | return Real(0); | |
240 | } | |
241 | Real skew = M3/(M2*sqrt(var)); | |
242 | return skew; | |
243 | } | |
244 | } | |
245 | ||
246 | template<class Container> | |
247 | inline auto skewness(Container const & v) | |
248 | { | |
249 | return skewness(v.cbegin(), v.cend()); | |
250 | } | |
251 | ||
252 | // Follows equation 1.5/1.6 of: | |
253 | // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf | |
254 | template<class ForwardIterator> | |
255 | auto first_four_moments(ForwardIterator first, ForwardIterator last) | |
256 | { | |
257 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
258 | BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments."); | |
259 | if constexpr (std::is_integral<Real>::value) | |
260 | { | |
261 | double M1 = *first; | |
262 | double M2 = 0; | |
263 | double M3 = 0; | |
264 | double M4 = 0; | |
265 | double n = 2; | |
266 | for (auto it = std::next(first); it != last; ++it) | |
267 | { | |
268 | double delta21 = *it - M1; | |
269 | double tmp = delta21/n; | |
270 | M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3); | |
271 | M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); | |
272 | M2 = M2 + tmp*(n-1)*delta21; | |
273 | M1 = M1 + tmp; | |
274 | n += 1; | |
275 | } | |
276 | ||
277 | return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1)); | |
278 | } | |
279 | else | |
280 | { | |
281 | Real M1 = *first; | |
282 | Real M2 = 0; | |
283 | Real M3 = 0; | |
284 | Real M4 = 0; | |
285 | Real n = 2; | |
286 | for (auto it = std::next(first); it != last; ++it) | |
287 | { | |
288 | Real delta21 = *it - M1; | |
289 | Real tmp = delta21/n; | |
290 | M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3); | |
291 | M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); | |
292 | M2 = M2 + tmp*(n-1)*delta21; | |
293 | M1 = M1 + tmp; | |
294 | n += 1; | |
295 | } | |
296 | ||
297 | return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1)); | |
298 | } | |
299 | } | |
300 | ||
301 | template<class Container> | |
302 | inline auto first_four_moments(Container const & v) | |
303 | { | |
304 | return first_four_moments(v.cbegin(), v.cend()); | |
305 | } | |
306 | ||
307 | ||
308 | // Follows equation 1.6 of: | |
309 | // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf | |
310 | template<class ForwardIterator> | |
311 | auto kurtosis(ForwardIterator first, ForwardIterator last) | |
312 | { | |
313 | auto [M1, M2, M3, M4] = first_four_moments(first, last); | |
314 | if (M2 == 0) | |
315 | { | |
316 | return M2; | |
317 | } | |
318 | return M4/(M2*M2); | |
319 | } | |
320 | ||
321 | template<class Container> | |
322 | inline auto kurtosis(Container const & v) | |
323 | { | |
324 | return kurtosis(v.cbegin(), v.cend()); | |
325 | } | |
326 | ||
327 | template<class ForwardIterator> | |
328 | auto excess_kurtosis(ForwardIterator first, ForwardIterator last) | |
329 | { | |
330 | return kurtosis(first, last) - 3; | |
331 | } | |
332 | ||
333 | template<class Container> | |
334 | inline auto excess_kurtosis(Container const & v) | |
335 | { | |
336 | return excess_kurtosis(v.cbegin(), v.cend()); | |
337 | } | |
338 | ||
339 | ||
340 | template<class RandomAccessIterator> | |
341 | auto median(RandomAccessIterator first, RandomAccessIterator last) | |
342 | { | |
343 | size_t num_elems = std::distance(first, last); | |
344 | BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined."); | |
345 | if (num_elems & 1) | |
346 | { | |
347 | auto middle = first + (num_elems - 1)/2; | |
348 | std::nth_element(first, middle, last); | |
349 | return *middle; | |
350 | } | |
351 | else | |
352 | { | |
353 | auto middle = first + num_elems/2 - 1; | |
354 | std::nth_element(first, middle, last); | |
355 | std::nth_element(middle, middle+1, last); | |
356 | return (*middle + *(middle+1))/2; | |
357 | } | |
358 | } | |
359 | ||
360 | ||
361 | template<class RandomAccessContainer> | |
362 | inline auto median(RandomAccessContainer & v) | |
363 | { | |
364 | return median(v.begin(), v.end()); | |
365 | } | |
366 | ||
367 | template<class RandomAccessIterator> | |
368 | auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last) | |
369 | { | |
370 | using Real = typename std::iterator_traits<RandomAccessIterator>::value_type; | |
371 | BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); | |
372 | ||
373 | std::sort(first, last); | |
374 | if constexpr (std::is_integral<Real>::value) | |
375 | { | |
376 | double i = 1; | |
377 | double num = 0; | |
378 | double denom = 0; | |
379 | for (auto it = first; it != last; ++it) | |
380 | { | |
381 | num += *it*i; | |
382 | denom += *it; | |
383 | ++i; | |
384 | } | |
385 | ||
386 | // If the l1 norm is zero, all elements are zero, so every element is the same. | |
387 | if (denom == 0) | |
388 | { | |
389 | return double(0); | |
390 | } | |
391 | ||
392 | return ((2*num)/denom - i)/(i-1); | |
393 | } | |
394 | else | |
395 | { | |
396 | Real i = 1; | |
397 | Real num = 0; | |
398 | Real denom = 0; | |
399 | for (auto it = first; it != last; ++it) | |
400 | { | |
401 | num += *it*i; | |
402 | denom += *it; | |
403 | ++i; | |
404 | } | |
405 | ||
406 | // If the l1 norm is zero, all elements are zero, so every element is the same. | |
407 | if (denom == 0) | |
408 | { | |
409 | return Real(0); | |
410 | } | |
411 | ||
412 | return ((2*num)/denom - i)/(i-1); | |
413 | } | |
414 | } | |
415 | ||
416 | template<class RandomAccessContainer> | |
417 | inline auto gini_coefficient(RandomAccessContainer & v) | |
418 | { | |
419 | return gini_coefficient(v.begin(), v.end()); | |
420 | } | |
421 | ||
422 | template<class RandomAccessIterator> | |
423 | inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last) | |
424 | { | |
425 | size_t n = std::distance(first, last); | |
426 | return n*gini_coefficient(first, last)/(n-1); | |
427 | } | |
428 | ||
429 | template<class RandomAccessContainer> | |
430 | inline auto sample_gini_coefficient(RandomAccessContainer & v) | |
431 | { | |
432 | return sample_gini_coefficient(v.begin(), v.end()); | |
433 | } | |
434 | ||
435 | template<class RandomAccessIterator> | |
436 | auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN()) | |
437 | { | |
438 | using std::abs; | |
439 | using Real = typename std::iterator_traits<RandomAccessIterator>::value_type; | |
440 | using std::isnan; | |
441 | if (isnan(center)) | |
442 | { | |
443 | center = boost::math::statistics::median(first, last); | |
444 | } | |
445 | size_t num_elems = std::distance(first, last); | |
446 | BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined."); | |
447 | auto comparator = [¢er](Real a, Real b) { return abs(a-center) < abs(b-center);}; | |
448 | if (num_elems & 1) | |
449 | { | |
450 | auto middle = first + (num_elems - 1)/2; | |
451 | std::nth_element(first, middle, last, comparator); | |
452 | return abs(*middle); | |
453 | } | |
454 | else | |
455 | { | |
456 | auto middle = first + num_elems/2 - 1; | |
457 | std::nth_element(first, middle, last, comparator); | |
458 | std::nth_element(middle, middle+1, last, comparator); | |
459 | return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2)); | |
460 | } | |
461 | } | |
462 | ||
463 | template<class RandomAccessContainer> | |
464 | inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN()) | |
465 | { | |
466 | return median_absolute_deviation(v.begin(), v.end(), center); | |
467 | } | |
468 | ||
f67539c2 TL |
469 | template<class ForwardIterator> |
470 | auto interquartile_range(ForwardIterator first, ForwardIterator last) | |
471 | { | |
472 | using Real = typename std::iterator_traits<ForwardIterator>::value_type; | |
473 | static_assert(!std::is_integral<Real>::value, "Integer values have not yet been implemented."); | |
474 | auto m = std::distance(first,last); | |
475 | BOOST_ASSERT_MSG(m >= 3, "At least 3 samples are required to compute the interquartile range."); | |
476 | auto k = m/4; | |
477 | auto j = m - (4*k); | |
478 | // m = 4k+j. | |
479 | // If j = 0 or j = 1, then there are an even number of samples below the median, and an even number above the median. | |
480 | // Then we must average adjacent elements to get the quartiles. | |
481 | // If j = 2 or j = 3, there are an odd number of samples above and below the median, these elements may be directly extracted to get the quartiles. | |
482 | ||
483 | if (j==2 || j==3) | |
484 | { | |
485 | auto q1 = first + k; | |
486 | auto q3 = first + 3*k + j - 1; | |
487 | std::nth_element(first, q1, last); | |
488 | Real Q1 = *q1; | |
489 | std::nth_element(q1, q3, last); | |
490 | Real Q3 = *q3; | |
491 | return Q3 - Q1; | |
492 | } else { | |
493 | // j == 0 or j==1: | |
494 | auto q1 = first + k - 1; | |
495 | auto q3 = first + 3*k - 1 + j; | |
496 | std::nth_element(first, q1, last); | |
497 | Real a = *q1; | |
498 | std::nth_element(q1, q1 + 1, last); | |
499 | Real b = *(q1 + 1); | |
500 | Real Q1 = (a+b)/2; | |
501 | std::nth_element(q1, q3, last); | |
502 | a = *q3; | |
503 | std::nth_element(q3, q3 + 1, last); | |
504 | b = *(q3 + 1); | |
505 | Real Q3 = (a+b)/2; | |
506 | return Q3 - Q1; | |
507 | } | |
508 | } | |
509 | ||
510 | template<class RandomAccessContainer> | |
511 | inline auto interquartile_range(RandomAccessContainer & v) | |
512 | { | |
513 | return interquartile_range(v.begin(), v.end()); | |
514 | } | |
515 | ||
20effc67 TL |
516 | template<class ForwardIterator, class OutputIterator> |
517 | auto sorted_mode(ForwardIterator first, ForwardIterator last, OutputIterator output) -> decltype(output) | |
518 | { | |
519 | using Z = typename std::iterator_traits<ForwardIterator>::value_type; | |
520 | static_assert(std::is_integral<Z>::value, "Floating point values have not yet been implemented."); | |
521 | using Size = typename std::iterator_traits<ForwardIterator>::difference_type; | |
522 | ||
523 | std::vector<Z> modes {}; | |
524 | modes.reserve(16); | |
525 | Size max_counter {0}; | |
526 | ||
527 | while(first != last) | |
528 | { | |
529 | Size current_count {0}; | |
530 | auto end_it {first}; | |
531 | while(end_it != last && *end_it == *first) | |
532 | { | |
533 | ++current_count; | |
534 | ++end_it; | |
535 | } | |
536 | ||
537 | if(current_count > max_counter) | |
538 | { | |
539 | modes.resize(1); | |
540 | modes[0] = *first; | |
541 | max_counter = current_count; | |
542 | } | |
543 | ||
544 | else if(current_count == max_counter) | |
545 | { | |
546 | modes.emplace_back(*first); | |
547 | } | |
548 | ||
549 | first = end_it; | |
550 | } | |
551 | ||
552 | return std::move(modes.begin(), modes.end(), output); | |
553 | } | |
554 | ||
555 | template<class Container, class OutputIterator> | |
556 | inline auto sorted_mode(Container & v, OutputIterator output) -> decltype(output) | |
557 | { | |
558 | return sorted_mode(v.begin(), v.end(), output); | |
559 | } | |
560 | ||
561 | template<class RandomAccessIterator, class OutputIterator> | |
562 | auto mode(RandomAccessIterator first, RandomAccessIterator last, OutputIterator output) -> decltype(output) | |
563 | { | |
564 | std::sort(first, last); | |
565 | return sorted_mode(first, last, output); | |
566 | } | |
567 | ||
568 | template<class RandomAccessContainer, class OutputIterator> | |
569 | inline auto mode(RandomAccessContainer & v, OutputIterator output) -> decltype(output) | |
570 | { | |
571 | return mode(v.begin(), v.end(), output); | |
572 | } | |
f67539c2 | 573 | |
92f5a8d4 TL |
574 | } |
575 | #endif |