library(ggplot2)
library(arm)
setwd("/media/guerzhoy/Windows/STA303/lectures/lec9/")
srrs2 <- read.table ("srrs2.dat", header=T, sep=",")
mn <- srrs2[srrs2$state=="MN",]
mn$radon <- mn$activity
mn$log.radon <- log (ifelse (mn$radon==0, .1, mn$radon))
mean(mn$log.radon)
## [1] 1.224623
sd(mn$log.radon)
## [1] 0.8533272
Let’s compute the average log-radon for all counties
fit <- lm(log.radon~county, data=mn)
s <- summary(fit)
coefs <- as.data.frame(s$coefficients)
Add a column with County nmaes
cleanup_name <- function(s){
return(gsub(" ", "", gsub("county", "", s)))
}
coefs$county <- as.character(lapply(rownames(s$coefficients), cleanup_name))
ggplot(coefs, aes(x = county, y = Estimate)) +
geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`))
subset_coefs <- coefs[c(2, 10, 22, 30, 36, 40, 50, 55, 60, 65),]
ggplot(subset_coefs, aes(x = county, y = Estimate)) +
geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`))
How many datapoints for Lac Qui Parle?
mn$county_clean <- as.character(lapply(mn$county, cleanup_name))
mn[mn$county_clean=="LACQUIPARLE", c("zip", "log.radon")]
## zip log.radon
## 5498 56256 2.424803
## 5499 56256 2.772589
Just two datapoints.
What about Anoka?
mn[mn$county_clean=="ANOKA", c("zip", "log.radon")]
## zip log.radon
## 5085 55011 1.13140211
## 5086 55014 0.91629073
## 5087 55014 0.40546511
## 5088 55014 0.00000000
## 5089 55014 -0.35667494
## 5090 55014 0.18232156
## 5091 55014 0.18232156
## 5092 55014 0.26236426
## 5093 55014 0.33647224
## 5094 55025 -0.91629073
## 5095 55025 0.09531018
## 5096 55025 1.50407740
## 5097 55025 0.26236426
## 5098 55025 0.74193734
## 5099 55038 1.77495235
## 5100 55070 1.19392247
## 5101 55303 0.58778666
## 5102 55303 1.68639895
## 5103 55303 1.84054963
## 5104 55303 0.64185389
## 5105 55303 1.88706965
## 5106 55303 1.13140211
## 5107 55303 1.91692261
## 5108 55303 1.94591015
## 5109 55303 2.04122033
## 5110 55303 1.64865863
## 5111 55303 1.50407740
## 5112 55303 1.48160454
## 5113 55303 1.02961942
## 5114 55303 2.09186406
## 5115 55303 0.47000363
## 5116 55304 1.43508453
## 5117 55304 1.68639895
## 5118 55304 1.38629436
## 5119 55304 0.83290912
## 5120 55304 1.06471074
## 5121 55304 0.33647224
## 5122 55304 1.19392247
## 5123 55304 1.06471074
## 5124 55304 0.58778666
## 5125 55304 -1.60943791
## 5126 55421 0.87546874
## 5127 55421 0.09531018
## 5128 55432 0.78845736
## 5129 55432 -0.51082562
## 5130 55432 0.53062825
## 5131 55433 1.06471074
## 5132 55433 0.78845736
## 5133 55434 0.53062825
## 5134 55434 0.33647224
## 5135 55434 0.64185389
## 5136 55434 0.58778666
Lots more. In general, larger error bars means smaller sample size. We should trust that estimate less.
Let’s generate county data
mu_a <- 1.1
sigma_a <- .3
sigma_y <- 0.8
n_counties <- 10
county_alphas <- rnorm(n_counties, mu_a, sigma_a)
county_alphas
## [1] 1.6670990 1.0413156 1.5770446 1.0328519 1.1529916 1.1735248 1.6928687
## [8] 0.9726845 0.8155599 1.4285897
Now, 10 measurements for county 2 would look like this:
rnorm(10, county_alphas[2], sigma_y)
## [1] 0.1946272 0.1856203 0.8397603 0.9574165 1.0972093 1.9969020 1.7472297
## [8] 0.9122856 2.1791477 1.6341556
Another 15 measurements for county 8 would look like this:
rnorm(15, county_alphas[8], sigma_y)
## [1] 1.083025168 0.458684319 1.966970946 1.696426632 0.320232825
## [6] 0.635152352 2.186371477 1.805396523 1.799289668 -0.004022616
## [11] 0.495692197 0.280336339 0.511478718 1.234910279 0.161252770
Let’s now fit the Partial Pooling model
library(lme4)
summary(lmer(log.radon~(1|county), data=mn))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county)
## Data: mn
##
## REML criterion at convergence: 2259.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4661 -0.5734 0.0441 0.6432 3.3516
##
## Random effects:
## Groups Name Variance Std.Dev.
## county (Intercept) 0.09581 0.3095
## Residual 0.63662 0.7979
## Number of obs: 919, groups: county, 85
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.31258 0.04891 26.84
Let’s get the individual intercepts
coef(lmer(log.radon~(1|county), data=mn))
## $county
## (Intercept)
## AITKIN 1.0674994
## ANOKA 0.8875568
## BECKER 1.2303812
## BELTRAMI 1.2245444
## BENTON 1.2899760
## BIG STONE 1.3749235
## BLUE EARTH 1.7171954
## BROWN 1.4315991
## CARLTON 1.0833131
## CARVER 1.2608819
## CASS 1.3506019
## CHIPPEWA 1.4695309
## CHISAGO 1.1826263
## CLAY 1.6312662
## CLEARWATER 1.1867346
## COOK 1.1627174
## COTTONWOOD 1.0953027
## CROW WING 1.0736856
## DAKOTA 1.2944691
## DODGE 1.4642936
## DOUGLAS 1.5089532
## FARIBAULT 0.9349171
## FILLMORE 1.2494467
## FREEBORN 1.6743898
## GOODHUE 1.6755316
## HENNEPIN 1.2867314
## HOUSTON 1.4172565
## HUBBARD 1.0964581
## ISANTI 1.2327649
## ITASCA 1.0714227
## JACKSON 1.6165831
## KANABEC 1.2839088
## KANDIYOHI 1.5941519
## KITTSON 1.2495867
## KOOCHICHING 0.8482224
## LAC QUI PARLE 1.6101391
## LAKE 0.7427261
## LAKE OF THE WOODS 1.3858053
## LE SUEUR 1.4376932
## LINCOLN 1.6218834
## LYON 1.6209590
## MAHNOMEN 1.3189074
## MARSHALL 1.2489275
## MARTIN 1.1204333
## MCLEOD 1.1545020
## MEEKER 1.2705104
## MILLE LACS 1.1300481
## MORRISON 1.1719088
## MOWER 1.4969979
## MURRAY 1.4670206
## NICOLLET 1.6329198
## NOBLES 1.5039156
## NORMAN 1.2186497
## OLMSTED 1.2351414
## OTTER TAIL 1.3318141
## PENNINGTON 1.0973710
## PINE 0.9944189
## PIPESTONE 1.4509455
## POLK 1.3309653
## POPE 1.3048974
## RAMSEY 1.1292427
## REDWOOD 1.5385774
## RENVILLE 1.3492634
## RICE 1.6054357
## ROCK 1.3094621
## ROSEAU 1.2728558
## SCOTT 1.4904852
## SHERBURNE 1.1909980
## SIBLEY 1.2862247
## STEARNS 1.3631229
## STEELE 1.4731860
## STEVENS 1.4234422
## ST LOUIS 0.7977368
## SWIFT 1.1902441
## TODD 1.3657564
## TRAVERSE 1.5063796
## WABASHA 1.5209552
## WADENA 1.1772729
## WASECA 0.9827000
## WASHINGTON 1.2589465
## WATONWAN 1.5976957
## WILKIN 1.4325911
## WINONA 1.4079158
## WRIGHT 1.4961184
## YELLOW MEDICINE 1.2834113
##
## attr(,"class")
## [1] "coef.mer"
Here, we are saying that we want to fit an individual intercept for each county, and each individual intercept for each county is modelled: that is, we are assuming that \(\alpha_j~N(\mu_\alpha, \sigma_\alpha)\).
Let’s look at Lac Qui Parle County specifically
#Partial pooling
coef(lmer(log.radon~(1|county), data=mn))$county["LAC QUI PARLE ",]
## [1] 1.610139
#No-pooling
as.data.frame(lm(log.radon~county, data=mn)$coefficients)["countyLAC QUI PARLE ",]
## [1] 1.938289
#Complete pooling
lm(log.radon~1, data=mn)
##
## Call:
## lm(formula = log.radon ~ 1, data = mn)
##
## Coefficients:
## (Intercept)
## 1.225
As you can see, partial pooling helps move the estimate towards the mean, by using information about the other counties.
Let’s add the floor predictor
lmer(log.radon~(1|county) + floor, data=mn)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county) + floor
## Data: mn
## REML criterion at convergence: 2171.305
## Random effects:
## Groups Name Std.Dev.
## county (Intercept) 0.3282
## Residual 0.7556
## Number of obs: 919, groups: county, 85
## Fixed Effects:
## (Intercept) floor
## 1.462 -0.693
Let’s fit a different intercept for each county as well.
lmer(log.radon~floor+(floor|county) , data=mn)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ floor + (floor | county)
## Data: mn
## REML criterion at convergence: 2168.325
## Random effects:
## Groups Name Std.Dev. Corr
## county (Intercept) 0.3487
## floor 0.3436 -0.34
## Residual 0.7462
## Number of obs: 919, groups: county, 85
## Fixed Effects:
## (Intercept) floor
## 1.4628 -0.6811
Let’s get the modelled/random effects
fit<-lmer(log.radon~floor+(floor|county) , data=mn)
ranef(fit)
## $county
## (Intercept) floor
## AITKIN -0.318238328 0.140489342
## ANOKA -0.529391136 -0.089719658
## BECKER 0.008920476 0.012215452
## BELTRAMI 0.072667549 -0.071460523
## BENTON -0.035734040 0.060430855
## BIG STONE 0.019886161 -0.006611891
## BLUE EARTH 0.356585563 0.206349718
## BROWN 0.228005509 -0.016413021
## CARLTON -0.361830636 0.307392752
## CARVER 0.085931226 -0.098786087
## CASS -0.032172470 0.010696930
## CHIPPEWA 0.124710589 -0.041464658
## CHISAGO -0.240501602 0.079963673
## CLAY 0.427226081 -0.171248346
## CLEARWATER -0.072860452 0.023840218
## COOK -0.242558065 0.080647420
## COTTONWOOD -0.017277629 -0.149253484
## CROW WING -0.310678541 0.234951980
## DAKOTA -0.109905278 -0.087869162
## DODGE 0.133608692 -0.044423162
## DOUGLAS 0.167576578 0.009903829
## FARIBAULT -0.478975389 0.127679010
## FILLMORE -0.037616409 0.037775691
## FREEBORN 0.423886713 -0.088762178
## GOODHUE 0.312759451 0.201019525
## HENNEPIN -0.089921905 -0.098460880
## HOUSTON 0.149131164 0.031637058
## HUBBARD -0.146932774 0.056548187
## ISANTI -0.161009296 0.053533509
## ITASCA -0.379169815 0.126069061
## JACKSON 0.291153700 -0.096804841
## KANABEC -0.105595935 0.035109283
## KANDIYOHI 0.279331391 -0.092874076
## KITTSON 0.062452509 -0.054639400
## KOOCHICHING -0.460208659 0.162370978
## LAC QUI PARLE 0.375049045 0.095636664
## LAKE -0.722397288 0.234318866
## LAKE OF THE WOODS 0.144830537 0.056559621
## LE SUEUR 0.122935103 0.049512573
## LINCOLN 0.398069005 -0.099862859
## LYON 0.292104875 0.053937085
## MAHNOMEN -0.018245514 0.006066398
## MARSHALL 0.330028373 -0.534422518
## MARTIN -0.214148416 -0.139498482
## MCLEOD -0.090775712 -0.118601459
## MEEKER -0.129531500 0.043067549
## MILLE LACS -0.092543969 -0.192771753
## MORRISON -0.232368198 0.151936802
## MOWER 0.225136232 -0.280591258
## MURRAY 0.184695195 -0.061408764
## NICOLLET 0.327431819 -0.108866846
## NOBLES 0.184027261 -0.061186685
## NORMAN -0.077478925 -0.015362973
## OLMSTED -0.095487459 -0.213251381
## OTTER TAIL 0.051319979 0.093877536
## PENNINGTON -0.066286794 -0.165155747
## PINE -0.399870333 0.080361523
## PIPESTONE 0.157981491 0.037881709
## POLK 0.154571791 -0.122001790
## POPE -0.055746755 0.018535075
## RAMSEY -0.310409126 0.354223001
## REDWOOD 0.176090156 0.278560253
## RENVILLE 0.022531452 0.144560288
## RICE 0.277860999 -0.106888686
## ROCK -0.049749162 0.016540953
## ROSEAU 0.103833868 0.054850654
## SCOTT 0.144888377 0.283320683
## SHERBURNE -0.237058364 0.078818841
## SIBLEY -0.102722518 0.034153909
## STEARNS 0.044138010 -0.132161042
## STEELE 0.080341430 -0.026712487
## STEVENS 0.100010203 -0.033252100
## ST LOUIS -0.592368730 0.118669424
## SWIFT -0.221808418 0.073748431
## TODD 0.043268986 0.140838624
## TRAVERSE 0.253858364 -0.066435678
## WABASHA 0.253201870 -0.219505673
## WADENA -0.069847843 -0.067420341
## WASECA -0.382489254 0.027236269
## WASHINGTON -0.102833753 -0.157330637
## WATONWAN 0.381344961 0.145077162
## WILKIN 0.137520830 -0.045723898
## WINONA 0.231470723 -0.470004068
## WRIGHT 0.136349562 -0.051627294
## YELLOW MEDICINE -0.083977457 0.027921419
Now, let’s do prediction. Suppose we want to make a prediction for a new measurement in Lac Qui Parle county.
fit<-lmer(log.radon~floor+(floor|county) , data=mn)
sigma.y.hat <- sigma.hat(fit)$sigma$data
coef.hat <- as.matrix(coef(fit)$county["LAC QUI PARLE ",])
x_new <- 1 #first floor
y_new <- rnorm(1000, coef.hat %*% c(1, x_new), sigma.y.hat )
qplot(y_new)
What we got is what the log-radon measurements could possibly look like in Lac Qui Parle county. What about the radon measurements?
qplot(exp(y_new))
Let’s get a 95% predictive interval
quantile(y_new, c(0.025, 0.975))
## 2.5% 97.5%
## -0.2993842 2.8582184
quantile(exp(y_new), c(0.025, 0.975))
## 2.5% 97.5%
## 0.7412748 17.4304742
In our simulation, 95% of the data fell within the above interval.
Now, let’s predict a new observation in a new county that we haven’t seen.
sigma_a <- 0.3487
sigma_b <- 0.3436
rho <- -0.34
Sigma <- matrix(c(sigma_a^2 , rho*sigma_a*sigma_b,
rho*sigma_a*sigma_b , sigma_b^2),
nrow=2,
ncol=2,
byrow = TRUE)
mu_a <- 1.4628
mu_b <- -0.6811
mu <- matrix(c(mu_a,
mu_b),
nrow=2,
byrow=TRUE)
sigma_y <- 0.7462
random_params <- mvrnorm(n = 1000, mu, Sigma)
x_new = 1 # first floor
y_new <- rnorm(1000, mean=random_params %*% c(1, x_new), sd=sigma_y)
qplot(y_new)
Let’s now construct a predictive interval again
quantile(y_new, c(0.025, 0.975))
## 2.5% 97.5%
## -0.8385728 2.5221888
As you can see, the predictive interval is wider for an unknown county (of course).