# Setup: knitr chunk options and the packages used throughout.
# library() (not require()) so a missing package fails loudly here,
# rather than producing confusing "could not find function" errors later.
knitr::opts_chunk$set(echo = TRUE)
library(Sleuth3)     # case0201: Darwin finch beak-depth data
library(ggplot2)     # plots
library(tigerstats)  # bwplot()/densityplot() lattice wrappers
library(xtable)      # HTML tables
# Beak depths of 178 finches, measured in 1976 and 1978.
finches <- case0201
The data consists of measurements of the beak depths of 178 finches, some measured before the drought, and some measured after the drought.
This is a bit of an overkill, but let’s see from what years the data is:
# Histogram of measurement years (only 1976 and 1978 occur).
# qplot() is deprecated since ggplot2 3.4.0; use ggplot() directly.
ggplot(finches, aes(x = Year)) +
  geom_histogram(bins = 3)
Now, let’s see how the beak depths differ across the years
# Overlay the beak-depth histograms: 1976 in red, 1978 in blue.
finches_1976 <- subset(finches, Year == 1976)
finches_1978 <- subset(finches, Year == 1978)
ggplot(finches, aes(x = Depth)) +
  geom_histogram(data = finches_1976, fill = "red", alpha = 0.2) +
  geom_histogram(data = finches_1978, fill = "blue", alpha = 0.2)
Box plots and density plots can also be instructive
# Side-by-side box plots of beak depth, one per year (lattice via tigerstats).
bwplot(Year ~ Depth, data = finches)
# Overlaid kernel density estimates for the two years; auto.key adds a legend.
densityplot(~Depth, groups = Year, auto.key = TRUE, data = finches)
Here, we want to imagine that we already know the variance for some reason. But actually, we don’t know the variance. What would be a good guess?
# Sample variance of beak depth within each year.
with(subset(finches, Year == 1976), var(Depth))
## [1] 1.07191
with(subset(finches, Year == 1978), var(Depth))
## [1] 0.8217058
From this, it would seem that we can imagine that the known variance (for the entire population of finches in 1976, and for the entire population of finches in 1978) is 1.0. (Aside: is that plausible? See the next lecture.)
Let’s now compute plausible means for the 1976 and 1978 populations
# Sample mean of beak depth within each year.
with(subset(finches, Year == 1976), mean(Depth))
## [1] 9.469663
with(subset(finches, Year == 1978), mean(Depth))
## [1] 10.1382
So, we can imagine that the population in 1976 was distributed as \(N(9.5, 1.0)\), and the population in 1978 is distributed like \(N(10.1, 1.0)\).
Here is an example of two samples.
# One synthetic sample from each imagined population, then a histogram
# of each. (Same RNG draw order as before: x first, then y.)
sample_x <- rnorm(n = 89, mean = 9.5, sd = 1.0)
sample_y <- rnorm(n = 89, mean = 10.1, sd = 1.0)
hist(sample_x)
hist(sample_y)
Let’s now generate a couple more samples
# The original repeated the same sample-and-plot chunk four times verbatim.
# Factor it into a helper and call it in a loop instead (DRY).
plot_two_samples <- function(n = 89, mean_x = 9.5, mean_y = 10.1, sd = 1.0) {
  # Draw one sample from each population and overlay their histograms
  # (first sample in red, second in blue).
  sample_x <- data.frame(x = rnorm(n = n, mean = mean_x, sd = sd))
  sample_y <- data.frame(x = rnorm(n = n, mean = mean_y, sd = sd))
  p <- ggplot(sample_x, aes(x = x)) +
    geom_histogram(data = sample_x, fill = "red", alpha = 0.2) +
    geom_histogram(data = sample_y, fill = "blue", alpha = 0.2)
  # Inside a function/loop, ggplot objects must be print()ed to render.
  print(p)
  invisible(p)
}
for (rep in seq_len(4)) {
  plot_two_samples()
}
The histograms are different every time (since they are random). Now, let’s look at the sampling distribution of the sample mean.
# Sampling distribution of the sample mean: 1000 means of N(9.5, 1)
# samples of size 89. replicate() replaces the original loop that grew
# `means` with c() on every iteration (an O(n^2) anti-pattern).
means <- replicate(1000, mean(rnorm(n = 89, mean = 9.5, sd = 1.0)))
hist(means)
mean(means)
## [1] 9.499978  (varies slightly from run to run)
sd(means)
## [1] 0.1081852  (theory predicts 1/sqrt(89) ~= 0.106)
Note that \(1.0/\sqrt{89} \approx 0.106\), so this is what the theory predicts.
Now, let’s confirm that if \(X_i\sim N(\mu_X, \sigma^2)\) and \(Y_i\sim N(\mu_Y, \sigma^2)\), then \((\bar{X}-\bar{Y})\sim N(\mu_X-\mu_Y, \frac{2}{N}\sigma^2)\).
# Sampling distribution of the difference in sample means.
# replicate() replaces the original loop that grew `diffs` with c().
diffs <- replicate(1000, {
  sample_x <- rnorm(n = 89, mean = 9.5, sd = 1.0)
  sample_y <- rnorm(n = 89, mean = 10.1, sd = 1.0)
  mean(sample_x) - mean(sample_y)
})
hist(diffs)
mean(diffs)
## [1] -0.5995954  (theory: mu_x - mu_y = -0.6)
sd(diffs)
## [1] 0.1473324  (theory: sqrt(2)/sqrt(89) ~= 0.15)
The SD of the difference in sample means is indeed about \(\sqrt{2}\) times the SD of the sample mean.
To recap, a confidence interval is the interval within which the true value of the variable of interest will fall 95% (or 90%, or 99%…) of the time, assuming the model is correct, when we repeatedly sample.
Suppose we have the samples sample_x
and sample_y
. We’ll build the CI for the difference between the means using those samples.
Since \((\bar{X}-\bar{Y})\sim N(\mu_X-\mu_Y, \frac{2}{N}\sigma^2)\), it follows that
\[\frac{(\bar{X}-\bar{Y})-(\mu_X-\mu_Y)}{\sqrt{2} \frac{\sigma}{\sqrt{N}}}\sim N(0, 1)\]
So that \(P((\mu_X-\mu_Y)\in [(\bar{X}-\bar{Y})-z_{.975}\sqrt{2} \frac{\sigma}{\sqrt{N}},(\bar{X}-\bar{Y})+z_{.975}\sqrt{2} \frac{\sigma}{\sqrt{N}}]) = .95\).
Let’s verify that this is true. Note that we can compute \(z_{.975}\) using qnorm(.975)
.
# Verify empirically that the known-sigma 95% CI for mu_x - mu_y covers
# the true difference about 95% of the time.
# Changes from the original: removed the `diffs`/`new_diff` accumulator
# (computed but never used), preallocated the result vector instead of
# growing it with c(), hoisted the loop-invariant CI half-width, and
# used `sigma` (not a hard-coded 1.0) as the rnorm sd for consistency.
sigma <- 1.0
N <- 89
mu_x <- 9.5
mu_y <- 10.1
true_diff <- mu_x - mu_y
# CI half-width is the same in every replication.
margin <- qnorm(0.975) * sqrt(2) * sigma / sqrt(N)
real_diff_in_int <- logical(1000)
for (i in seq_len(1000)) {
  sample_x <- rnorm(n = N, mean = mu_x, sd = sigma)
  sample_y <- rnorm(n = N, mean = mu_y, sd = sigma)
  obs_diff <- mean(sample_x) - mean(sample_y)
  real_diff_in_int[i] <- true_diff >= obs_diff - margin &&
    true_diff <= obs_diff + margin
}
# Fraction of replications whose CI contained the true difference.
mean(real_diff_in_int)
## [1] 0.958  (varies; should be close to 0.95)
One of the points of doing this simulation is to emphasize what it means to say that we built a 95% CI. It’s saying that, assuming the model is true, if we repeatedly sample and obtain a CI, the CI will contain the true parameter value 95% of the time.
Student’s t-Distribution is the distribution of the t-ratio \(t = \frac{\bar{X}-\mu_X}{s/\sqrt{N}}\), where we obtained \(s\) from the data.
One way to see the effect of estimating \(s\) from the data is to see what the quantiles are for the t-Distribution when compared to the Normal distribution
# Normal critical value, for comparison with the t critical values below.
qnorm(.975)
## [1] 1.959964
# Table of t critical values for increasing degrees of freedom.
# Fixes from the original: the `critical` column was declared integer()
# even though critical values are doubles, and rbind() with a bare c()
# drops column names; bind a named one-row data.frame instead.
# (The rbind-in-a-loop pattern is kept deliberately — the text below
# shows the more idiomatic vectorized alternatives.)
qt_vals <- data.frame(df = numeric(), critical = numeric())
for (df in c(5, 10, 20, 80, 160, 320, 1000)) {
  qt_vals <- rbind(qt_vals, data.frame(df = df, critical = qt(.975, df = df)))
}
colnames(qt_vals) <- c("df", "critical value")
print(xtable(qt_vals), type = "html", include.rownames = FALSE)
df | critical value |
---|---|
5.00 | 2.57 |
10.00 | 2.23 |
20.00 | 2.09 |
80.00 | 1.99 |
160.00 | 1.97 |
320.00 | 1.97 |
1000.00 | 1.96 |
(Note that there are better–more idiomatic–ways to do this
# qt() is vectorized over df, as are many base R functions,
# so a single call handles all the values at once.
qt(.975, df=c(10,20, 100))
## [1] 2.228139 2.085963 1.983972
# sapply() is the general way to apply a function to multiple values
# (even when the function is not itself vectorized).
sapply(c(10, 20, 100), function(df) qt(.975, df))
## [1] 2.228139 2.085963 1.983972
)
As you can see, for large enough sample sizes, since the estimate for the SD is very good, we could get away with using a normal approximation (and when the math doesn’t work out as nicely as in this case, we will!)
Let’s now construct the \(100(1-\alpha)\%\) confidence intervals for the difference in means.
# 100(1-alpha)% CIs for the difference in mean beak depth, 1976 vs 1978.
alpha <- .05
X <- finches[finches$Year == 1976, ]$Depth
Y <- finches[finches$Year == 1978, ]$Depth
Nx <- length(X)
Ny <- length(Y)
# BUG FIX: the original called qt(1-alpha/2, df=df), silently picking up
# the leftover loop variable df = 1000 from the critical-value table
# chunk above. The pooled two-sample t statistic has Nx + Ny - 2
# degrees of freedom (here 176).
deg_free <- Nx + Ny - 2
# (t_crit rather than t, which shadows base::t().)
t_crit <- qt(1 - alpha / 2, df = deg_free)
# Pooled estimate of the common standard deviation.
pooled_sd <- sqrt((sum((X - mean(X))^2) + sum((Y - mean(Y))^2)) / deg_free)
# Standard error of the difference in sample means.
diff_sd <- pooled_sd * sqrt(1 / Nx + 1 / Ny)
# Two-sided 95% CI.
lower <- mean(X) - mean(Y) - t_crit * diff_sd
upper <- mean(X) - mean(Y) + t_crit * diff_sd
c(lower, upper)
## [1] -0.9547758 -0.3823028  (slightly wider now that df = 176, not 1000)
# One-sided 95% interval: upper bound only.
t_crit <- qt(1 - alpha, df = deg_free)
upper <- mean(X) - mean(Y) + t_crit * diff_sd
lower <- -Inf
c(lower, upper)
## [1] -Inf -0.4283904  (again shifts slightly with the corrected df)