Titanic

knitr::opts_chunk$set(echo = TRUE)
require(ggplot2)
require(tigerstats)
require(xtable)
require(lattice)

titanic <- read.csv("titanic.csv")
titanic <- titanic[!is.na(titanic$age),]

Here is the variable descriptions:

VARIABLE DESCRIPTIONS:
survival        Survival
                (0 = No; 1 = Yes)
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

SPECIAL NOTES:
Pclass is a proxy for socio-economic status (SES)
 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

Age is in Years; Fractional if Age less than One (1)
 If the Age is Estimated, it is in the form xx.5

With respect to the family relation variables (i.e. sibsp and parch)
some relations were ignored.  The following are the definitions used
for sibsp and parch.

Sibling:  Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic
Spouse:   Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)
Parent:   Mother or Father of Passenger Aboard Titanic
Child:    Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic

Other family relatives excluded from this study include cousins,
nephews/nieces, aunts/uncles, and in-laws.  Some children travelled
only with a nanny, therefore parch=0 for them.  As well, some
travelled with very close friends or neighbors in a village, however,
the definitions do not support such relations.

Let’s build some tables to explore the data a little bit

mean(titanic$survived)

## [1] 0.4082218

with(titanic, aggregate(survived, by=list(sex), mean))

##   Group.1         x
## 1  female 0.7525773
## 2    male 0.2051672

with(titanic, aggregate(survived, by=list(pclass), mean))

##   Group.1         x
## 1       1 0.6373239
## 2       2 0.4406130
## 3       3 0.2614770

with(titanic, aggregate(survived, by=list(pclass, sex), mean))

##   Group.1 Group.2         x
## 1       1  female 0.9624060
## 2       2  female 0.8932039
## 3       3  female 0.4736842
## 4       1    male 0.3509934
## 5       2    male 0.1455696
## 6       3    male 0.1690544

What about age? Actually, arguably the best thing to do to visualize the information is to run a logistic regression model! Let’s do that. (See also the excellent visualization from the Statistical Sleuth text, in the slides.)

fit <- glm(survived~age, family=binomial(link="logit"), data=titanic)
ages <- data.frame(age=titanic$age)
predictions <- predict(fit, newdata=ages, type="response", se=TRUE)

ggdata <- data.frame(age = ages$age,
                     pred = predictions$fit,
                     LL = predictions$fit-1.96*predictions$se.fit,
                     UL = predictions$fit+1.96*predictions$se.fit
                    )

ggplot(ggdata, aes(x = age, y = pred)) + 
   geom_ribbon(aes(ymin = LL, ymax = UL), alpha = .2) +
   geom_line(colour="blue", size=1)

We kind of already did, but let’s now run the regression

fit <- glm(survived~age, family=binomial(link="logit"), data=titanic)
summary(fit)

## 
## Call:
## glm(formula = survived ~ age, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1189  -1.0361  -0.9768   1.3187   1.5162  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.136531   0.144715  -0.943   0.3455  
## age         -0.007899   0.004407  -1.792   0.0731 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.6  on 1045  degrees of freedom
## Residual deviance: 1411.4  on 1044  degrees of freedom
## AIC: 1415.4
## 
## Number of Fisher Scoring iterations: 4

Let’s first estimate the proportion of survivors in a very roundabout way:

fit <- glm(survived~1, family=binomial(link="logit"), data=titanic)
summary(fit)

## 
## Call:
## glm(formula = survived ~ 1, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.024  -1.024  -1.024   1.339   1.339  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.37132    0.06291  -5.903 3.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.6  on 1045  degrees of freedom
## Residual deviance: 1414.6  on 1045  degrees of freedom
## AIC: 1416.6
## 
## Number of Fisher Scoring iterations: 4

plogis(fit$coefficients["(Intercept)"])

## (Intercept) 
##   0.4082218

Let’s confirm that:

mean(titanic$survived)

## [1] 0.4082218

logit(mean(titanic$survived))

## [1] -0.3713213

Predicting with age:

fit <- glm(survived~age, family=binomial(link="logit"), data=titanic)
summary(fit)

## 
## Call:
## glm(formula = survived ~ age, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1189  -1.0361  -0.9768   1.3187   1.5162  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.136531   0.144715  -0.943   0.3455  
## age         -0.007899   0.004407  -1.792   0.0731 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.6  on 1045  degrees of freedom
## Residual deviance: 1411.4  on 1044  degrees of freedom
## AIC: 1415.4
## 
## Number of Fisher Scoring iterations: 4

Interetation of \(\beta_0\): the predicted probability of survival for people aged 0.

We already did this when graphing the probabilities, but here it is again, in more detail

fit <- glm(survived~age, family=binomial(link="logit"), data=titanic)
ages <- data.frame(age=titanic$age)
logodds <- predict(fit, newdata=ages, type="link", se=TRUE)
logodds$fit[1:10]

##          1          2          3          4          5          6 
## -0.3655906 -0.1437719 -0.1523284 -0.3734892 -0.3339962 -0.5156639 
##          7          8          9         10 
## -0.6341429 -0.4445765 -0.5551569 -0.6973317

logodds$se.fit[1:10]

##          1          2          3          4          5          6 
## 0.06305451 0.14108972 0.13683517 0.06303438 0.06613902 0.10280249 
##          7          8          9         10 
## 0.16023655 0.07548539 0.12096813 0.19314712

Here, we are getting the log-odds, together with their SEs. We can transform the implied interval to obtain the uncertainty for the probabilities

plogis(logodds$fit[1])

##         1 
## 0.4096069

c(plogis(logodds$fit[1]-1.96*logodds$se.fit[1]), plogis(logodds$fit[1]+1.96*logodds$se.fit[1]))

##         1         1 
## 0.3800874 0.4397926

Note that this interval is not symmetric. Why?

(This is not really a CI, since the probability is never observed.)

Let’s now visualize the difference between men and women, controlling for age

fit <- glm(survived~age+sex, family=binomial(link="logit"), data=titanic)

newdata <- data.frame(age=0:100, sex=rep(factor("male"), 101))
predictions_male <- predict(fit, newdata=newdata, type="response", se=TRUE)

newdata <- data.frame(age=0:100, sex=rep(factor("female"), 101))
predictions_female <- predict(fit, newdata=newdata, type="response", se=TRUE)


ggdata <- data.frame(age = newdata$age,
                     pred_male = predictions_male$fit,
                     mLL = predictions_male$fit-1.96*predictions_male$se.fit,
                     mUL = predictions_male$fit+1.96*predictions_male$se.fit,
                     pred_female = predictions_female$fit,
                     fLL = predictions_female$fit-1.96*predictions_female$se.fit,
                     fUL = predictions_female$fit+1.96*predictions_female$se.fit
                    )

ggplot(ggdata, aes(x = age)) + 
   geom_ribbon(aes(ymin = mLL, ymax = mUL, fill="male"), alpha = .2) + guides(fill=FALSE) +
   geom_line(aes(y = pred_male, color="male"), size=1) +
   geom_ribbon(aes(ymin = fLL, ymax = fUL, fill="female"), alpha = .2) + guides(fill=FALSE) +
   geom_line(aes(y = pred_female, color="female"), size=1) +
   labs(y = "Survival probs")

Also see a beautiful visualization of the Titanic data here.

Likelihood Ratio Tests

full <- glm(survived~age+sex, family=binomial(link="logit"), data=titanic)
reduced <- glm(survived~age, family=binomial(link="logit"), data=titanic)
anova(reduced, full, test="LRT")