-
Notifications
You must be signed in to change notification settings - Fork 2
/
01-01-Stat-Methods-Distributions.Rmd
182 lines (136 loc) · 5.52 KB
/
01-01-Stat-Methods-Distributions.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Statistical Methods {-}
## Fitting Distributions {-}
### Continuous Distributions {-}
* Assessing Distributions Visually
* Formal Tests for Distribution Fit
* Maximum Likelihood calculation
**Normal Distribution**
```{r a0, comment=NA, fig.width=8, fig.height=7, warning=FALSE, message=FALSE}
library(nortest)
library(MASS)
## Draw some random data
set.seed(10)
x1 = rnorm(20)
## Distribution plots
par(mfrow = c(2, 2))
qqnorm(x1)
qqline(x1)
boxplot(x1, main = "Boxplot")
hist(x1, freq = FALSE, main = "Histogram with Density Curve")
lines(density(x1))
plot(ecdf(x1), main = "Empiracle CDF")
```
QQ plot indicates the data might be normal by remaining close to the line. The Box plot, histogram, and density curve all support this assumption.
Formal tests all agree that the data are from the normal distribution. Shapiro Wilk is considered the best for test for testing normality.
```{r a1, comment=NA, warning=FALSE, message=FALSE}
## Are the data from a normal distribution?
## Shapiro-Wilk Test
shapiro.test(x1)
## Anderson Darling Test
ad.test(x1)
## Kolmogorov-Smirnoff Test
ks.test(x1, 'pnorm')
```
**Chi-squared Distribution**
```{r a2, comment=NA, warning=FALSE, message=FALSE, fig.width=8, fig.height=7}
## Draw some random data
set.seed(10)
x2 = rchisq(n = 20, 2)
## Estimate the DF parameter by maximum likelihood
fitdistr(x = x2, dchisq, start = list(df = 2))
## Input the estimate from MLE
ks.test(x = x2, y = pchisq, df = 1.8140625)
## Distribution plots
par(mfrow = c(2, 2))
qqplot(qchisq(ppoints(20), df = 1.8140625), x2, main = "QQ Plot")
qqline(x2, distribution = function(p) qchisq(p, df = 1.8140625))
boxplot(x2, main = "Boxplot")
hist(x2, freq = FALSE, main = "Histogram with Density Curve")
lines(density(x2))
plot(ecdf(x2), main = "Empiracle CDF")
```
**Calculating the MLE manually**
```{r a3, comment=NA, warning=FALSE, message=FALSE, fig.width=8, fig.height=5}
## Generate data from the exponential distribution with mean = 1/5
set.seed(1000)
X = rexp(n = 20, rate = 5)
## sample size and range of betas to test
n = 20; beta = seq(.01, .5, by = .01)
## Liklihood function
Likelihood = (1/beta)^n * exp(-1/beta * sum(X))
## Maximum Likelihood
(mle = max(Likelihood))
(mle.beta = beta[which(Likelihood == mle)])
## Statistical test for how well the specified distribution fits the data
ks.test(x = X, y = "pexp", rate = 1/mle.beta)
par(mfrow = c(1, 2))
## Plot the maximum likelihood
plot(x = beta, y = Likelihood, type = "l", main = "Maximum Likelihood", lwd = 2)
abline(h = mle, v = mle.beta, lty = 2)
## QQplot for assessing distribution fit visually
qqplot(qexp(ppoints(10), rate = 1/mle.beta), X, xlab = "QQ", main = "QQ Plot")
qqline(X, distribution = function(p) qexp(p, rate = 1/mle.beta))
```
---
### Discrete Distributions {-}
* Fitting a Binomial model
* Chi-squared goodness of fit test
* Producing a markdown table
A rare but fatal disease of genetic origin occurring chiefly in infants and children is under investigation. An experiment was conducted on a 100 couples who are both carriers of the disease and have 5 children. A researcher recorded the number of children having the disease for each couple.
```{r a4, comment=NA, fig.width=8, fig.height=6, warning=FALSE, message=FALSE}
library(knitr)
## Dataset
(dta = data.frame(
Diseased = c(0, 1, 2, 3, 4, 5),
Count = c(21, 42, 24, 8, 4, 1)
))
## Number of diseased children (d) and the total number of children (c)
(d = sum(apply(X = dta, MARGIN = 1, FUN = function(p) dta$Diseased * dta$Count)[,1]))
(c = 5 * sum(dta$Count))
## MLE
(mle = d / c)
## Calculate the expected probabilities and expected diseased children
dta$Exp.Prob = round(dbinom(x = 0:5, size = 5, prob = mle), 4)
dta$Exp.Diseased = with(dta, sum(Count) * Exp.Prob)
dta
## Chi-square test requirements:
## 1) all Exp must be > 1
## 2) at most 20% of Exp may be less than 5
##
## So we need to combine counts 4 and 5 and meet these requirements
dta[5, 2:4] = dta[5, 2:4] + dta[6, 2:4]
dta = dta[-6, ]
dta$Diseased = as.character(dta$Diseased)
dta$Diseased[5] = '4 or 5'
## Compute the Chi-Squared Statistic
dta$X.2 = round(with(dta, (Count - Exp.Diseased)^2 / Exp.Diseased), 4)
dta$Diseased = factor(dta$Diseased)
dta
## Compute the test statistic pvalue
(TS = sum(dta$X.2)); 1 - pchisq(TS, df = 4)
## Based on the table below we conclude the binomial model is a good fit for the data
##
## Assessment of Chi-squared GOF P-value
## • p-value > .25 ⇒ Excellent fit
## • .15 ≤ p−value < .25 ⇒ Good fit
## • .05 ≤ p−value < .15 ⇒ Moderately Good fit
## • .01 ≤ p−value < .05 ⇒ Poor fit
## • p-value < .01 ⇒ Unacceptable fit
##
## Plot of the data vs model and create a markdown table from the data frame
plot(x = dta$Diseased, y = NULL, xlab = "Diseased Children per Couple",
ylim = c(0, 50), ylab = "Frequency",
axes = FALSE, type = "n", main = "Binomial Model vs Data")
axis(1, labels = dta$Diseased, at = dta$Diseased)
axis(2, labels = seq(0, 50, 10), at = seq(0, 50, 10))
legend("topright", c("Data", "Model"), col = c("blue", "red"), pch = 19, bty = "n")
points(x = dta$Diseased, y = dta$Count, col = "blue", pch = 19, type = "b")
points(x = dta$Diseased, y = dta$Exp.Diseased, col = "red", pch = 19, type = "b")
kable(dta)
## Some Conclusions
## Since N is large we can use asymptotic confidence intervals
## A 95% confidence interval for whether a child will have the disease
mle + c(-1, 1) * 1.96 * sqrt(.27 * (1 - .27))/sqrt(100)
## What is the probability that a couple will have at least 1 child with the disease?
1 - pbinom(q = 0, size = 5, prob = .27)
```