#setwd("/media/guerzhoy/Windows/STA303/lectures/lec5_pre")
setwd("c:/STA303/lectures/lec5_pre")
titanic <- read.csv("titanic.csv")
titanic <- titanic[!is.na(titanic$age),]
titanic <- titanic[!is.na(titanic$survived),]
Let’s cook up a dataset with overdispersion. Suppose that we have 200 women in first class, whose probability of survival is 90%, and 200 women in second class, whose probability of survival is 10%
Suppose men’s probability of survival is 50%, and there are 50 men
Suppose we haven’t recorded the class information.
survived<-c(rbinom(200, size=1, prob=.9), rbinom(200, size=1, prob=.1), rbinom(50, size=1, prob=.5))
survived<-c(rbinom(200, size=1, prob=.4), rbinom(200, size=1, prob=.4), rbinom(50, size=1, prob=.5))
sex <- c(rep("Female", 200) , rep("Female", 200) , rep("Male", 25) , rep("Male", 25))
class<- c(rep(1, 200) , rep(2, 200) , rep(1, 25) , rep(2, 25))
df<- data.frame(survived=survived, sex=sex, class=class)
fit_quasi <- glm(survived~sex, family=quasibinomial, data=df)
summary(fit_quasi)
##
## Call:
## glm(formula = survived ~ sex, family = quasibinomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.177 -0.986 -0.986 1.382 1.382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4684 0.1030 -4.548 6.98e-06 ***
## sexMale 0.4684 0.3016 1.553 0.121
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.004464)
##
## Null deviance: 604.89 on 449 degrees of freedom
## Residual deviance: 602.48 on 448 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Here’s a more direct demonstration:
x<-c(rbinom(100, size=1, prob=.9), rbinom(100, size=1, prob=.1)) ; c(var(x),mean(x)*(1-mean(x)))
## [1] 0.2506281 0.2493750
As you can see, it’s usually not a huge effect, with just the one predictor variable.
Omitting the last name, using the first letter of the last name
titanic$first_letter <- sapply(titanic$name, function(name) substr(name, 1,1))
titanic <- titanic[!(titanic$first_letter %in% c('v', 'U', 'Y')),]
summary(glm(survived~sex+first_letter, family=quasibinomial, data=titanic))
##
## Call:
## glm(formula = survived ~ sex + first_letter, family = quasibinomial,
## data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1914 -0.7100 -0.5619 0.7174 2.8926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.76946 0.30963 2.485 0.01311 *
## sexmale -2.53532 0.16610 -15.264 < 2e-16 ***
## first_letterB 0.60746 0.39197 1.550 0.12151
## first_letterC 0.45659 0.38994 1.171 0.24190
## first_letterd 1.51270 0.92399 1.637 0.10191
## first_letterD 0.98760 0.42148 2.343 0.01931 *
## first_letterE -0.39307 0.76640 -0.513 0.60815
## first_letterF 0.28950 0.54155 0.535 0.59305
## first_letterG -0.08505 0.49111 -0.173 0.86255
## first_letterH 0.51635 0.41080 1.257 0.20906
## first_letterI -0.51629 0.92380 -0.559 0.57637
## first_letterJ 0.42560 0.53919 0.789 0.43010
## first_letterK 0.24163 0.53578 0.451 0.65210
## first_letterL 0.31374 0.46665 0.672 0.50153
## first_letterM 0.21541 0.45290 0.476 0.63445
## first_letterN 0.98600 0.50767 1.942 0.05239 .
## first_letterO 0.10027 0.64964 0.154 0.87736
## first_letterP -0.42290 0.48791 -0.867 0.38627
## first_letterQ 14.79661 866.43375 0.017 0.98638
## first_letterR 0.04067 0.48636 0.084 0.93338
## first_letterS 0.64811 0.39879 1.625 0.10444
## first_letterT 1.53671 0.55424 2.773 0.00566 **
## first_letterV -2.40247 1.14998 -2.089 0.03694 *
## first_letterW 0.72324 0.46334 1.561 0.11886
## first_letterZ -14.86798 647.61112 -0.023 0.98169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.063234)
##
## Null deviance: 1404.7 on 1037 degrees of freedom
## Residual deviance: 1047.7 on 1013 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 14
As you can see, the overdispersion parameter here is 1.06. This makes the results slightly more significant.