Reading in the data

library(ggplot2)
library(arm)
setwd("/media/guerzhoy/Windows/STA303/lectures/lec9/")
srrs2 <- read.table ("srrs2.dat", header=T, sep=",")
mn <- srrs2[srrs2$state=="MN",]
mn$radon <- mn$activity
mn$log.radon <- log (ifelse (mn$radon==0, .1, mn$radon))

Complete pooling

mean(mn$log.radon)
## [1] 1.224623
sd(mn$log.radon)
## [1] 0.8533272

Let’s compute the average log-radon for all counties

fit <- lm(log.radon~county, data=mn)
s <- summary(fit)
coefs <- as.data.frame(s$coefficients)

Add a column with County nmaes

cleanup_name <- function(s){
  return(gsub(" ", "", gsub("county", "", s)))
}


coefs$county <- as.character(lapply(rownames(s$coefficients), cleanup_name))
ggplot(coefs, aes(x = county, y = Estimate)) +  
  geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`))

subset_coefs <- coefs[c(2, 10, 22, 30, 36, 40, 50, 55, 60, 65),]
ggplot(subset_coefs, aes(x = county, y = Estimate)) +  
  geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`))

How many datapoints for Lac Qui Parle?

mn$county_clean <- as.character(lapply(mn$county, cleanup_name))
mn[mn$county_clean=="LACQUIPARLE", c("zip", "log.radon")]
##        zip log.radon
## 5498 56256  2.424803
## 5499 56256  2.772589

Just two datapoints.

What about Anoka?

mn[mn$county_clean=="ANOKA", c("zip", "log.radon")]
##        zip   log.radon
## 5085 55011  1.13140211
## 5086 55014  0.91629073
## 5087 55014  0.40546511
## 5088 55014  0.00000000
## 5089 55014 -0.35667494
## 5090 55014  0.18232156
## 5091 55014  0.18232156
## 5092 55014  0.26236426
## 5093 55014  0.33647224
## 5094 55025 -0.91629073
## 5095 55025  0.09531018
## 5096 55025  1.50407740
## 5097 55025  0.26236426
## 5098 55025  0.74193734
## 5099 55038  1.77495235
## 5100 55070  1.19392247
## 5101 55303  0.58778666
## 5102 55303  1.68639895
## 5103 55303  1.84054963
## 5104 55303  0.64185389
## 5105 55303  1.88706965
## 5106 55303  1.13140211
## 5107 55303  1.91692261
## 5108 55303  1.94591015
## 5109 55303  2.04122033
## 5110 55303  1.64865863
## 5111 55303  1.50407740
## 5112 55303  1.48160454
## 5113 55303  1.02961942
## 5114 55303  2.09186406
## 5115 55303  0.47000363
## 5116 55304  1.43508453
## 5117 55304  1.68639895
## 5118 55304  1.38629436
## 5119 55304  0.83290912
## 5120 55304  1.06471074
## 5121 55304  0.33647224
## 5122 55304  1.19392247
## 5123 55304  1.06471074
## 5124 55304  0.58778666
## 5125 55304 -1.60943791
## 5126 55421  0.87546874
## 5127 55421  0.09531018
## 5128 55432  0.78845736
## 5129 55432 -0.51082562
## 5130 55432  0.53062825
## 5131 55433  1.06471074
## 5132 55433  0.78845736
## 5133 55434  0.53062825
## 5134 55434  0.33647224
## 5135 55434  0.64185389
## 5136 55434  0.58778666

Lots more. In general, larger error bars means smaller sample size. We should trust that estimate less.

Multilvel Model – fake data

Let’s generate county data

mu_a <- 1.1
sigma_a <- .3
sigma_y <- 0.8

n_counties <- 10
county_alphas <- rnorm(n_counties, mu_a, sigma_a)
county_alphas
##  [1] 1.6670990 1.0413156 1.5770446 1.0328519 1.1529916 1.1735248 1.6928687
##  [8] 0.9726845 0.8155599 1.4285897

Now, 10 measurements for county 2 would look like this:

rnorm(10, county_alphas[2], sigma_y)
##  [1] 0.1946272 0.1856203 0.8397603 0.9574165 1.0972093 1.9969020 1.7472297
##  [8] 0.9122856 2.1791477 1.6341556

Another 15 measurements for county 8 would look like this:

rnorm(15, county_alphas[8], sigma_y)
##  [1]  1.083025168  0.458684319  1.966970946  1.696426632  0.320232825
##  [6]  0.635152352  2.186371477  1.805396523  1.799289668 -0.004022616
## [11]  0.495692197  0.280336339  0.511478718  1.234910279  0.161252770

Partial Pooling

Let’s now fit the Partial Pooling model

library(lme4)
summary(lmer(log.radon~(1|county), data=mn))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county)
##    Data: mn
## 
## REML criterion at convergence: 2259.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4661 -0.5734  0.0441  0.6432  3.3516 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  county   (Intercept) 0.09581  0.3095  
##  Residual             0.63662  0.7979  
## Number of obs: 919, groups:  county, 85
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  1.31258    0.04891   26.84

Let’s get the individual intercepts

coef(lmer(log.radon~(1|county), data=mn))
## $county
##                      (Intercept)
## AITKIN                 1.0674994
## ANOKA                  0.8875568
## BECKER                 1.2303812
## BELTRAMI               1.2245444
## BENTON                 1.2899760
## BIG STONE              1.3749235
## BLUE EARTH             1.7171954
## BROWN                  1.4315991
## CARLTON                1.0833131
## CARVER                 1.2608819
## CASS                   1.3506019
## CHIPPEWA               1.4695309
## CHISAGO                1.1826263
## CLAY                   1.6312662
## CLEARWATER             1.1867346
## COOK                   1.1627174
## COTTONWOOD             1.0953027
## CROW WING              1.0736856
## DAKOTA                 1.2944691
## DODGE                  1.4642936
## DOUGLAS                1.5089532
## FARIBAULT              0.9349171
## FILLMORE               1.2494467
## FREEBORN               1.6743898
## GOODHUE                1.6755316
## HENNEPIN               1.2867314
## HOUSTON                1.4172565
## HUBBARD                1.0964581
## ISANTI                 1.2327649
## ITASCA                 1.0714227
## JACKSON                1.6165831
## KANABEC                1.2839088
## KANDIYOHI              1.5941519
## KITTSON                1.2495867
## KOOCHICHING            0.8482224
## LAC QUI PARLE          1.6101391
## LAKE                   0.7427261
## LAKE OF THE WOODS      1.3858053
## LE SUEUR               1.4376932
## LINCOLN                1.6218834
## LYON                   1.6209590
## MAHNOMEN               1.3189074
## MARSHALL               1.2489275
## MARTIN                 1.1204333
## MCLEOD                 1.1545020
## MEEKER                 1.2705104
## MILLE LACS             1.1300481
## MORRISON               1.1719088
## MOWER                  1.4969979
## MURRAY                 1.4670206
## NICOLLET               1.6329198
## NOBLES                 1.5039156
## NORMAN                 1.2186497
## OLMSTED                1.2351414
## OTTER TAIL             1.3318141
## PENNINGTON             1.0973710
## PINE                   0.9944189
## PIPESTONE              1.4509455
## POLK                   1.3309653
## POPE                   1.3048974
## RAMSEY                 1.1292427
## REDWOOD                1.5385774
## RENVILLE               1.3492634
## RICE                   1.6054357
## ROCK                   1.3094621
## ROSEAU                 1.2728558
## SCOTT                  1.4904852
## SHERBURNE              1.1909980
## SIBLEY                 1.2862247
## STEARNS                1.3631229
## STEELE                 1.4731860
## STEVENS                1.4234422
## ST LOUIS               0.7977368
## SWIFT                  1.1902441
## TODD                   1.3657564
## TRAVERSE               1.5063796
## WABASHA                1.5209552
## WADENA                 1.1772729
## WASECA                 0.9827000
## WASHINGTON             1.2589465
## WATONWAN               1.5976957
## WILKIN                 1.4325911
## WINONA                 1.4079158
## WRIGHT                 1.4961184
## YELLOW MEDICINE        1.2834113
## 
## attr(,"class")
## [1] "coef.mer"

Here, we are saying that we want to fit an individual intercept for each county, and each individual intercept for each county is modelled: that is, we are assuming that \(\alpha_j~N(\mu_\alpha, \sigma_\alpha)\).

Let’s look at Lac Qui Parle County specifically

#Partial pooling
coef(lmer(log.radon~(1|county), data=mn))$county["LAC QUI PARLE       ",]
## [1] 1.610139
#No-pooling
as.data.frame(lm(log.radon~county, data=mn)$coefficients)["countyLAC QUI PARLE       ",]
## [1] 1.938289
#Complete pooling  
lm(log.radon~1, data=mn)
## 
## Call:
## lm(formula = log.radon ~ 1, data = mn)
## 
## Coefficients:
## (Intercept)  
##       1.225

As you can see, partial pooling helps move the estimate towards the mean, by using information about the other counties.

Adding predictors

Let’s add the floor predictor

lmer(log.radon~(1|county) + floor, data=mn) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county) + floor
##    Data: mn
## REML criterion at convergence: 2171.305
## Random effects:
##  Groups   Name        Std.Dev.
##  county   (Intercept) 0.3282  
##  Residual             0.7556  
## Number of obs: 919, groups:  county, 85
## Fixed Effects:
## (Intercept)        floor  
##       1.462       -0.693

Let’s fit a different intercept for each county as well.

lmer(log.radon~floor+(floor|county) , data=mn) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ floor + (floor | county)
##    Data: mn
## REML criterion at convergence: 2168.325
## Random effects:
##  Groups   Name        Std.Dev. Corr 
##  county   (Intercept) 0.3487        
##           floor       0.3436   -0.34
##  Residual             0.7462        
## Number of obs: 919, groups:  county, 85
## Fixed Effects:
## (Intercept)        floor  
##      1.4628      -0.6811

Let’s get the modelled/random effects

fit<-lmer(log.radon~floor+(floor|county) , data=mn) 
ranef(fit)
## $county
##                       (Intercept)        floor
## AITKIN               -0.318238328  0.140489342
## ANOKA                -0.529391136 -0.089719658
## BECKER                0.008920476  0.012215452
## BELTRAMI              0.072667549 -0.071460523
## BENTON               -0.035734040  0.060430855
## BIG STONE             0.019886161 -0.006611891
## BLUE EARTH            0.356585563  0.206349718
## BROWN                 0.228005509 -0.016413021
## CARLTON              -0.361830636  0.307392752
## CARVER                0.085931226 -0.098786087
## CASS                 -0.032172470  0.010696930
## CHIPPEWA              0.124710589 -0.041464658
## CHISAGO              -0.240501602  0.079963673
## CLAY                  0.427226081 -0.171248346
## CLEARWATER           -0.072860452  0.023840218
## COOK                 -0.242558065  0.080647420
## COTTONWOOD           -0.017277629 -0.149253484
## CROW WING            -0.310678541  0.234951980
## DAKOTA               -0.109905278 -0.087869162
## DODGE                 0.133608692 -0.044423162
## DOUGLAS               0.167576578  0.009903829
## FARIBAULT            -0.478975389  0.127679010
## FILLMORE             -0.037616409  0.037775691
## FREEBORN              0.423886713 -0.088762178
## GOODHUE               0.312759451  0.201019525
## HENNEPIN             -0.089921905 -0.098460880
## HOUSTON               0.149131164  0.031637058
## HUBBARD              -0.146932774  0.056548187
## ISANTI               -0.161009296  0.053533509
## ITASCA               -0.379169815  0.126069061
## JACKSON               0.291153700 -0.096804841
## KANABEC              -0.105595935  0.035109283
## KANDIYOHI             0.279331391 -0.092874076
## KITTSON               0.062452509 -0.054639400
## KOOCHICHING          -0.460208659  0.162370978
## LAC QUI PARLE         0.375049045  0.095636664
## LAKE                 -0.722397288  0.234318866
## LAKE OF THE WOODS     0.144830537  0.056559621
## LE SUEUR              0.122935103  0.049512573
## LINCOLN               0.398069005 -0.099862859
## LYON                  0.292104875  0.053937085
## MAHNOMEN             -0.018245514  0.006066398
## MARSHALL              0.330028373 -0.534422518
## MARTIN               -0.214148416 -0.139498482
## MCLEOD               -0.090775712 -0.118601459
## MEEKER               -0.129531500  0.043067549
## MILLE LACS           -0.092543969 -0.192771753
## MORRISON             -0.232368198  0.151936802
## MOWER                 0.225136232 -0.280591258
## MURRAY                0.184695195 -0.061408764
## NICOLLET              0.327431819 -0.108866846
## NOBLES                0.184027261 -0.061186685
## NORMAN               -0.077478925 -0.015362973
## OLMSTED              -0.095487459 -0.213251381
## OTTER TAIL            0.051319979  0.093877536
## PENNINGTON           -0.066286794 -0.165155747
## PINE                 -0.399870333  0.080361523
## PIPESTONE             0.157981491  0.037881709
## POLK                  0.154571791 -0.122001790
## POPE                 -0.055746755  0.018535075
## RAMSEY               -0.310409126  0.354223001
## REDWOOD               0.176090156  0.278560253
## RENVILLE              0.022531452  0.144560288
## RICE                  0.277860999 -0.106888686
## ROCK                 -0.049749162  0.016540953
## ROSEAU                0.103833868  0.054850654
## SCOTT                 0.144888377  0.283320683
## SHERBURNE            -0.237058364  0.078818841
## SIBLEY               -0.102722518  0.034153909
## STEARNS               0.044138010 -0.132161042
## STEELE                0.080341430 -0.026712487
## STEVENS               0.100010203 -0.033252100
## ST LOUIS             -0.592368730  0.118669424
## SWIFT                -0.221808418  0.073748431
## TODD                  0.043268986  0.140838624
## TRAVERSE              0.253858364 -0.066435678
## WABASHA               0.253201870 -0.219505673
## WADENA               -0.069847843 -0.067420341
## WASECA               -0.382489254  0.027236269
## WASHINGTON           -0.102833753 -0.157330637
## WATONWAN              0.381344961  0.145077162
## WILKIN                0.137520830 -0.045723898
## WINONA                0.231470723 -0.470004068
## WRIGHT                0.136349562 -0.051627294
## YELLOW MEDICINE      -0.083977457  0.027921419

Now, let’s do prediction. Suppose we want to make a prediction for a new measurement in Lac Qui Parle county.

fit<-lmer(log.radon~floor+(floor|county) , data=mn) 
sigma.y.hat <- sigma.hat(fit)$sigma$data
coef.hat <- as.matrix(coef(fit)$county["LAC QUI PARLE       ",])

x_new <- 1  #first floor
y_new <- rnorm(1000, coef.hat %*% c(1, x_new), sigma.y.hat  )
qplot(y_new)

What we got is what the log-radon measurements could possibly look like in Lac Qui Parle county. What about the radon measurements?

qplot(exp(y_new))

Let’s get a 95% predictive interval

quantile(y_new, c(0.025,  0.975))
##       2.5%      97.5% 
## -0.2993842  2.8582184
quantile(exp(y_new), c(0.025,  0.975))
##       2.5%      97.5% 
##  0.7412748 17.4304742

In our simulation, 95% of the data fell within the above interval.

Now, let’s predict a new observation in a new county that we haven’t seen.

sigma_a <- 0.3487
sigma_b <- 0.3436
rho <- -0.34
Sigma <-  matrix(c(sigma_a^2           , rho*sigma_a*sigma_b, 
                   rho*sigma_a*sigma_b , sigma_b^2),
   nrow=2,       
   ncol=2,       
   byrow = TRUE)   

mu_a <- 1.4628
mu_b <- -0.6811
mu <- matrix(c(mu_a,
               mu_b),
               nrow=2,
               byrow=TRUE)
             
sigma_y <- 0.7462

random_params <- mvrnorm(n = 1000, mu, Sigma)


x_new = 1  # first floor

y_new <- rnorm(1000, mean=random_params %*% c(1, x_new), sd=sigma_y)
qplot(y_new)

Let’s now construct a predictive interval again

quantile(y_new, c(0.025,  0.975))
##       2.5%      97.5% 
## -0.8385728  2.5221888

As you can see, the predictive interval is wider for an unknown county (of course).