]>
git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/math/example/normal_misc_examples.cpp
1 // normal_misc_examples.cpp
3 // Copyright Paul A. Bristow 2007, 2010.
5 // Use, modification and distribution are subject to the
6 // Boost Software License, Version 1.0.
7 // (See accompanying file LICENSE_1_0.txt
8 // or copy at http://www.boost.org/LICENSE_1_0.txt)
10 // Example of using normal distribution.
12 // Note that this file contains Quickbook mark-up as well as code
13 // and comments, don't change any of the special comment mark-ups!
17 First we need some includes to access the normal distribution
18 (and some std output of course).
21 #include <boost/math/distributions/normal.hpp> // for normal_distribution
22 using boost::math::normal
; // typedef provides default type is double.
25 using std::cout
; using std::endl
; using std::left
; using std::showpoint
; using std::noshowpoint
;
27 using std::setw
; using std::setprecision
;
29 using std::numeric_limits
;
33 cout
<< "Example: Normal distribution, Miscellaneous Applications.";
37 { // Traditional tables and values.
38 /*`Let's start by printing some traditional tables.
40 double step
= 1.; // in z
41 double range
= 4; // min and max z = -range to +range.
42 int precision
= 17; // traditional tables are only computed to much lower precision.
43 // but std::numeric_limits<double>::max_digits10; on new Standard Libraries gives
44 // 17, the maximum number of digits that can possibly be significant.
45 // std::numeric_limits<double>::digits10; == 15 is number of guaranteed digits,
46 // the other two digits being 'noisy'.
48 // Construct a standard normal distribution s
49 normal s
; // (default mean = zero, and standard deviation = unity)
50 cout
<< "Standard normal distribution, mean = "<< s
.mean()
51 << ", standard deviation = " << s
.standard_deviation() << endl
;
53 /*` First the probability distribution function (pdf).
55 cout
<< "Probability distribution function values" << endl
;
56 cout
<< " z " " pdf " << endl
;
58 for (double z
= -range
; z
< range
+ step
; z
+= step
)
60 cout
<< left
<< setprecision(3) << setw(6) << z
<< " "
61 << setprecision(precision
) << setw(12) << pdf(s
, z
) << endl
;
63 cout
.precision(6); // default
64 /*`And the area under the normal curve from -[infin] up to z,
65 the cumulative distribution function (cdf).
67 // For a standard normal distribution
68 cout
<< "Standard normal mean = "<< s
.mean()
69 << ", standard deviation = " << s
.standard_deviation() << endl
;
70 cout
<< "Integral (area under the curve) from - infinity up to z " << endl
;
71 cout
<< " z " " cdf " << endl
;
72 for (double z
= -range
; z
< range
+ step
; z
+= step
)
74 cout
<< left
<< setprecision(3) << setw(6) << z
<< " "
75 << setprecision(precision
) << setw(12) << cdf(s
, z
) << endl
;
77 cout
.precision(6); // default
79 /*`And all this you can do with a nanoscopic amount of work compared to
80 the team of *human computers* toiling with Milton Abramovitz and Irene Stegen
81 at the US National Bureau of Standards (now [@http://www.nist.gov NIST]).
82 Starting in 1938, their "Handbook of Mathematical Functions with Formulas, Graphs and Mathematical Tables",
83 was eventually published in 1964, and has been reprinted numerous times since.
84 (A major replacement is planned at [@http://dlmf.nist.gov Digital Library of Mathematical Functions]).
86 Pretty-printing a traditional 2-dimensional table is left as an exercise for the student,
87 but why bother now that the Math Toolkit lets you write
90 cout
<< "Area for z = " << z
<< " is " << cdf(s
, z
) << endl
; // to get the area for z.
92 Correspondingly, we can obtain the traditional 'critical' values for significance levels.
93 For the 95% confidence level, the significance level usually called alpha,
94 is 0.05 = 1 - 0.95 (for a one-sided test), so we can write
96 cout
<< "95% of area has a z below " << quantile(s
, 0.95) << endl
;
97 // 95% of area has a z below 1.64485
98 /*`and a two-sided test (a comparison between two levels, rather than a one-sided test)
101 cout
<< "95% of area has a z between " << quantile(s
, 0.975)
102 << " and " << -quantile(s
, 0.975) << endl
;
103 // 95% of area has a z between 1.95996 and -1.95996
106 First, define a table of significance levels: these are the probabilities
107 that the true occurrence frequency lies outside the calculated interval.
109 It is convenient to have an alpha level for the probability that z lies outside just one standard deviation.
110 This will not be some nice neat number like 0.05, but we can easily calculate it,
112 double alpha1
= cdf(s
, -1) * 2; // 0.3173105078629142
113 cout
<< setprecision(17) << "Significance level for z == 1 is " << alpha1
<< endl
;
115 and place in our array of favorite alpha values.
117 double alpha
[] = {0.3173105078629142, // z for 1 standard deviation.
118 0.20, 0.1, 0.05, 0.01, 0.001, 0.0001, 0.00001 };
121 Confidence value as % is (1 - alpha) * 100 (so alpha 0.05 == 95% confidence)
122 that the true occurrence frequency lies *inside* the calculated interval.
125 cout
<< "level of significance (alpha)" << setprecision(4) << endl
;
126 cout
<< "2-sided 1 -sided z(alpha) " << endl
;
127 for (int i
= 0; i
< sizeof(alpha
)/sizeof(alpha
[0]); ++i
)
129 cout
<< setw(15) << alpha
[i
] << setw(15) << alpha
[i
] /2 << setw(10) << quantile(complement(s
, alpha
[i
]/2)) << endl
;
130 // Use quantile(complement(s, alpha[i]/2)) to avoid potential loss of accuracy from quantile(s, 1 - alpha[i]/2)
134 /*`Notice the distinction between one-sided (also called one-tailed)
135 where we are using a > *or* < test (and not both)
136 and considering the area of the tail (integral) from z up to +[infin],
137 and a two-sided test where we are using two > *and* < tests, and thus considering two tails,
138 from -[infin] up to z low and z high up to +[infin].
140 So the 2-sided values alpha[i] are calculated using alpha[i]/2.
142 If we consider a simple example of alpha = 0.05, then for a two-sided test,
143 the lower tail area from -[infin] up to -1.96 is 0.025 (alpha/2)
144 and the upper tail area from +z up to +1.96 is also 0.025 (alpha/2),
145 and the area between -1.96 up to 12.96 is alpha = 0.95.
146 and the sum of the two tails is 0.025 + 0.025 = 0.05,
149 //] [/[normal_basic1]
153 /*`Armed with the cumulative distribution function, we can easily calculate the
154 easy to remember proportion of values that lie within 1, 2 and 3 standard deviations from the mean.
158 cout
<< showpoint
<< "cdf(s, s.standard_deviation()) = "
159 << cdf(s
, s
.standard_deviation()) << endl
; // from -infinity to 1 sd
160 cout
<< "cdf(complement(s, s.standard_deviation())) = "
161 << cdf(complement(s
, s
.standard_deviation())) << endl
;
162 cout
<< "Fraction 1 standard deviation within either side of mean is "
163 << 1 - cdf(complement(s
, s
.standard_deviation())) * 2 << endl
;
164 cout
<< "Fraction 2 standard deviations within either side of mean is "
165 << 1 - cdf(complement(s
, 2 * s
.standard_deviation())) * 2 << endl
;
166 cout
<< "Fraction 3 standard deviations within either side of mean is "
167 << 1 - cdf(complement(s
, 3 * s
.standard_deviation())) * 2 << endl
;
170 To a useful precision, the 1, 2 & 3 percentages are 68, 95 and 99.7,
171 and these are worth memorising as useful 'rules of thumb', as, for example, in
172 [@http://en.wikipedia.org/wiki/Standard_deviation standard deviation]:
175 Fraction 1 standard deviation within either side of mean is 0.683
176 Fraction 2 standard deviations within either side of mean is 0.954
177 Fraction 3 standard deviations within either side of mean is 0.997
180 We could of course get some really accurate values for these
181 [@http://en.wikipedia.org/wiki/Confidence_interval confidence intervals]
182 by using cout.precision(15);
185 Fraction 1 standard deviation within either side of mean is 0.682689492137086
186 Fraction 2 standard deviations within either side of mean is 0.954499736103642
187 Fraction 3 standard deviations within either side of mean is 0.997300203936740
190 But before you get too excited about this impressive precision,
191 don't forget that the *confidence intervals of the standard deviation* are surprisingly wide,
192 especially if you have estimated the standard deviation from only a few measurements.
194 //] [/[normal_basic2]
197 //[normal_bulbs_example1
199 Examples from K. Krishnamoorthy, Handbook of Statistical Distributions with Applications,
200 ISBN 1 58488 635 8, page 125... implemented using the Math Toolkit library.
202 A few very simple examples are shown here:
204 // K. Krishnamoorthy, Handbook of Statistical Distributions with Applications,
205 // ISBN 1 58488 635 8, page 125, example 10.3.5
206 /*`Mean lifespan of 100 W bulbs is 1100 h with standard deviation of 100 h.
207 Assuming, perhaps with little evidence and much faith, that the distribution is normal,
208 we construct a normal distribution called /bulbs/ with these values:
210 double mean_life
= 1100.;
211 double life_standard_deviation
= 100.;
212 normal
bulbs(mean_life
, life_standard_deviation
);
213 double expected_life
= 1000.;
215 /*`The we can use the Cumulative distribution function to predict fractions
216 (or percentages, if * 100) that will last various lifetimes.
218 cout
<< "Fraction of bulbs that will last at best (<=) " // P(X <= 1000)
219 << expected_life
<< " is "<< cdf(bulbs
, expected_life
) << endl
;
220 cout
<< "Fraction of bulbs that will last at least (>) " // P(X > 1000)
221 << expected_life
<< " is "<< cdf(complement(bulbs
, expected_life
)) << endl
;
222 double min_life
= 900;
223 double max_life
= 1200;
224 cout
<< "Fraction of bulbs that will last between "
225 << min_life
<< " and " << max_life
<< " is "
226 << cdf(bulbs
, max_life
) // P(X <= 1200)
227 - cdf(bulbs
, min_life
) << endl
; // P(X <= 900)
229 [note Real-life failures are often very ab-normal,
230 with a significant number that 'dead-on-arrival' or suffer failure very early in their life:
231 the lifetime of the survivors of 'early mortality' may be well described by the normal distribution.]
233 //] [/normal_bulbs_example1 Quickbook end]
236 // K. Krishnamoorthy, Handbook of Statistical Distributions with Applications,
237 // ISBN 1 58488 635 8, page 125, Example 10.3.6
239 //[normal_bulbs_example3
240 /*`Weekly demand for 5 lb sacks of onions at a store is normally distributed with mean 140 sacks and standard deviation 10.
242 double mean
= 140.; // sacks per week.
243 double standard_deviation
= 10;
244 normal
sacks(mean
, standard_deviation
);
246 double stock
= 160.; // per week.
247 cout
<< "Percentage of weeks overstocked "
248 << cdf(sacks
, stock
) * 100. << endl
; // P(X <=160)
249 // Percentage of weeks overstocked 97.7
251 /*`So there will be lots of mouldy onions!
252 So we should be able to say what stock level will meet demand 95% of the weeks.
254 double stock_95
= quantile(sacks
, 0.95);
255 cout
<< "Store should stock " << int(stock_95
) << " sacks to meet 95% of demands." << endl
;
256 /*`And it is easy to estimate how to meet 80% of demand, and waste even less.
258 double stock_80
= quantile(sacks
, 0.80);
259 cout
<< "Store should stock " << int(stock_80
) << " sacks to meet 8 out of 10 demands." << endl
;
260 //] [/normal_bulbs_example3 Quickbook end]
262 { // K. Krishnamoorthy, Handbook of Statistical Distributions with Applications,
263 // ISBN 1 58488 635 8, page 125, Example 10.3.7
265 //[normal_bulbs_example4
267 /*`A machine is set to pack 3 kg of ground beef per pack.
268 Over a long period of time it is found that the average packed was 3 kg
269 with a standard deviation of 0.1 kg.
270 Assuming the packing is normally distributed,
271 we can find the fraction (or %) of packages that weigh more than 3.1 kg.
274 double mean
= 3.; // kg
275 double standard_deviation
= 0.1; // kg
276 normal
packs(mean
, standard_deviation
);
278 double max_weight
= 3.1; // kg
279 cout
<< "Percentage of packs > " << max_weight
<< " is "
280 << cdf(complement(packs
, max_weight
)) << endl
; // P(X > 3.1)
282 double under_weight
= 2.9;
283 cout
<<"fraction of packs <= " << under_weight
<< " with a mean of " << mean
284 << " is " << cdf(complement(packs
, under_weight
)) << endl
;
285 // fraction of packs <= 2.9 with a mean of 3 is 0.841345
286 // This is 0.84 - more than the target 0.95
287 // Want 95% to be over this weight, so what should we set the mean weight to be?
289 double over_mean
= 3.0664;
290 normal
xpacks(over_mean
, standard_deviation
);
291 cout
<< "fraction of packs >= " << under_weight
292 << " with a mean of " << xpacks
.mean()
293 << " is " << cdf(complement(xpacks
, under_weight
)) << endl
;
294 // fraction of packs >= 2.9 with a mean of 3.06449 is 0.950005
295 double under_fraction
= 0.05; // so 95% are above the minimum weight mean - sd = 2.9
296 double low_limit
= standard_deviation
;
297 double offset
= mean
- low_limit
- quantile(packs
, under_fraction
);
298 double nominal_mean
= mean
+ offset
;
300 normal
nominal_packs(nominal_mean
, standard_deviation
);
301 cout
<< "Setting the packer to " << nominal_mean
<< " will mean that "
302 << "fraction of packs >= " << under_weight
303 << " is " << cdf(complement(nominal_packs
, under_weight
)) << endl
;
306 Setting the packer to 3.06449 will mean that fraction of packs >= 2.9 is 0.95.
308 Setting the packer to 3.13263 will mean that fraction of packs >= 2.9 is 0.99,
309 but will more than double the mean loss from 0.0644 to 0.133.
311 Alternatively, we could invest in a better (more precise) packer with a lower standard deviation.
313 To estimate how much better (how much smaller standard deviation) it would have to be,
314 we need to get the 5% quantile to be located at the under_weight limit, 2.9
316 double p
= 0.05; // wanted p th quantile.
317 cout
<< "Quantile of " << p
<< " = " << quantile(packs
, p
)
318 << ", mean = " << packs
.mean() << ", sd = " << packs
.standard_deviation() << endl
; //
320 Quantile of 0.05 = 2.83551, mean = 3, sd = 0.1
322 With the current packer (mean = 3, sd = 0.1), the 5% quantile is at 2.8551 kg,
323 a little below our target of 2.9 kg.
324 So we know that the standard deviation is going to have to be smaller.
326 Let's start by guessing that it (now 0.1) needs to be halved, to a standard deviation of 0.05
328 normal
pack05(mean
, 0.05);
329 cout
<< "Quantile of " << p
<< " = " << quantile(pack05
, p
)
330 << ", mean = " << pack05
.mean() << ", sd = " << pack05
.standard_deviation() << endl
;
332 cout
<<"Fraction of packs >= " << under_weight
<< " with a mean of " << mean
333 << " and standard deviation of " << pack05
.standard_deviation()
334 << " is " << cdf(complement(pack05
, under_weight
)) << endl
;
337 Fraction of packs >= 2.9 with a mean of 3 and standard deviation of 0.05 is 0.9772
339 So 0.05 was quite a good guess, but we are a little over the 2.9 target,
340 so the standard deviation could be a tiny bit more. So we could do some
341 more guessing to get closer, say by increasing to 0.06
344 normal
pack06(mean
, 0.06);
345 cout
<< "Quantile of " << p
<< " = " << quantile(pack06
, p
)
346 << ", mean = " << pack06
.mean() << ", sd = " << pack06
.standard_deviation() << endl
;
348 cout
<<"Fraction of packs >= " << under_weight
<< " with a mean of " << mean
349 << " and standard deviation of " << pack06
.standard_deviation()
350 << " is " << cdf(complement(pack06
, under_weight
)) << endl
;
352 Fraction of packs >= 2.9 with a mean of 3 and standard deviation of 0.06 is 0.9522
354 Now we are getting really close, but to do the job properly,
355 we could use root finding method, for example the tools provided, and used elsewhere,
356 in the Math Toolkit, see __root_finding_without_derivatives.
358 But in this normal distribution case, we could be even smarter and make a direct calculation.
361 normal s
; // For standard normal distribution,
363 double x
= 2.9; // Our required limit.
364 // then probability p = N((x - mean) / sd)
365 // So if we want to find the standard deviation that would be required to meet this limit,
366 // so that the p th quantile is located at x,
367 // in this case the 0.95 (95%) quantile at 2.9 kg pack weight, when the mean is 3 kg.
369 double prob
= pdf(s
, (x
- mean
) / sd
);
370 double qp
= quantile(s
, 0.95);
371 cout
<< "prob = " << prob
<< ", quantile(p) " << qp
<< endl
; // p = 0.241971, quantile(p) 1.64485
372 // Rearranging, we can directly calculate the required standard deviation:
373 double sd95
= std::abs((x
- mean
)) / qp
;
375 cout
<< "If we want the "<< p
<< " th quantile to be located at "
376 << x
<< ", would need a standard deviation of " << sd95
<< endl
;
378 normal
pack95(mean
, sd95
); // Distribution of the 'ideal better' packer.
379 cout
<<"Fraction of packs >= " << under_weight
<< " with a mean of " << mean
380 << " and standard deviation of " << pack95
.standard_deviation()
381 << " is " << cdf(complement(pack95
, under_weight
)) << endl
;
383 // Fraction of packs >= 2.9 with a mean of 3 and standard deviation of 0.0608 is 0.95
385 /*`Notice that these two deceptively simple questions
386 (do we over-fill or measure better) are actually very common.
387 The weight of beef might be replaced by a measurement of more or less anything.
388 But the calculations rely on the accuracy of the standard deviation - something
389 that is almost always less good than we might wish,
390 especially if based on a few measurements.
393 //] [/normal_bulbs_example4 Quickbook end]
396 { // K. Krishnamoorthy, Handbook of Statistical Distributions with Applications,
397 // ISBN 1 58488 635 8, page 125, example 10.3.8
398 //[normal_bulbs_example5
399 /*`A bolt is usable if between 3.9 and 4.1 long.
400 From a large batch of bolts, a sample of 50 show a
401 mean length of 3.95 with standard deviation 0.1.
402 Assuming a normal distribution, what proportion is usable?
403 The true sample mean is unknown,
404 but we can use the sample mean and standard deviation to find approximate solutions.
407 normal
bolts(3.95, 0.1);
411 cout
<< "Fraction long enough [ P(X <= " << top
<< ") ] is " << cdf(bolts
, top
) << endl
;
412 cout
<< "Fraction too short [ P(X <= " << bottom
<< ") ] is " << cdf(bolts
, bottom
) << endl
;
413 cout
<< "Fraction OK -between " << bottom
<< " and " << top
414 << "[ P(X <= " << top
<< ") - P(X<= " << bottom
<< " ) ] is "
415 << cdf(bolts
, top
) - cdf(bolts
, bottom
) << endl
;
417 cout
<< "Fraction too long [ P(X > " << top
<< ") ] is "
418 << cdf(complement(bolts
, top
)) << endl
;
420 cout
<< "95% of bolts are shorter than " << quantile(bolts
, 0.95) << endl
;
422 //] [/normal_bulbs_example5 Quickbook end]
425 catch(const std::exception
& e
)
426 { // Always useful to include try & catch blocks because default policies
427 // are to throw exceptions on arguments that cause errors like underflow, overflow.
428 // Lacking try & catch blocks, the program will abort without a message below,
429 // which may give some helpful clues as to the cause of the exception.
431 "\n""Message from thrown exception was:\n " << e
.what() << std::endl
;
441 Autorun "i:\boost-06-05-03-1300\libs\math\test\Math_test\debug\normal_misc_examples.exe"
442 Example: Normal distribution, Miscellaneous Applications.Standard normal distribution, mean = 0, standard deviation = 1
443 Probability distribution function values
445 -4 0.00013383022576488537
446 -3 0.0044318484119380075
447 -2 0.053990966513188063
448 -1 0.24197072451914337
450 1 0.24197072451914337
451 2 0.053990966513188063
452 3 0.0044318484119380075
453 4 0.00013383022576488537
454 Standard normal mean = 0, standard deviation = 1
455 Integral (area under the curve) from - infinity up to z
457 -4 3.1671241833119979e-005
458 -3 0.0013498980316300959
459 -2 0.022750131948179219
460 -1 0.1586552539314571
462 1 0.84134474606854293
463 2 0.97724986805182079
465 4 0.99996832875816688
466 Area for z = 2 is 0.97725
467 95% of area has a z below 1.64485
468 95% of area has a z between 1.95996 and -1.95996
469 Significance level for z == 1 is 0.3173105078629142
470 level of significance (alpha)
471 2-sided 1 -sided z(alpha)
480 cdf(s, s.standard_deviation()) = 0.841
481 cdf(complement(s, s.standard_deviation())) = 0.159
482 Fraction 1 standard deviation within either side of mean is 0.683
483 Fraction 2 standard deviations within either side of mean is 0.954
484 Fraction 3 standard deviations within either side of mean is 0.997
485 Fraction of bulbs that will last at best (<=) 1.00e+003 is 0.159
486 Fraction of bulbs that will last at least (>) 1.00e+003 is 0.841
487 Fraction of bulbs that will last between 900. and 1.20e+003 is 0.819
488 Percentage of weeks overstocked 97.7
489 Store should stock 156 sacks to meet 95% of demands.
490 Store should stock 148 sacks to meet 8 out of 10 demands.
491 Percentage of packs > 3.10 is 0.159
492 fraction of packs <= 2.90 with a mean of 3.00 is 0.841
493 fraction of packs >= 2.90 with a mean of 3.07 is 0.952
494 Setting the packer to 3.06 will mean that fraction of packs >= 2.90 is 0.950
495 Quantile of 0.0500 = 2.84, mean = 3.00, sd = 0.100
496 Quantile of 0.0500 = 2.92, mean = 3.00, sd = 0.0500
497 Fraction of packs >= 2.90 with a mean of 3.00 and standard deviation of 0.0500 is 0.977
498 Quantile of 0.0500 = 2.90, mean = 3.00, sd = 0.0600
499 Fraction of packs >= 2.90 with a mean of 3.00 and standard deviation of 0.0600 is 0.952
500 prob = 0.242, quantile(p) 1.64
501 If we want the 0.0500 th quantile to be located at 2.90, would need a standard deviation of 0.0608
502 Fraction of packs >= 2.90 with a mean of 3.00 and standard deviation of 0.0608 is 0.950
503 Fraction long enough [ P(X <= 4.10) ] is 0.933
504 Fraction too short [ P(X <= 3.90) ] is 0.309
505 Fraction OK -between 3.90 and 4.10[ P(X <= 4.10) - P(X<= 3.90 ) ] is 0.625
506 Fraction too long [ P(X > 4.10) ] is 0.0668
507 95% of bolts are shorter than 4.11