# Global knitr option: show code in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Load dependencies with library() so a missing package fails loudly;
# require() only returns FALSE, which can silently mask setup problems.
library(ggplot2)
library(tigerstats)
library(xtable)
library(lattice)

#CHANGE THIS
# NOTE(review): a hard-coded setwd() makes the document non-portable;
# prefer project-relative paths. Kept here because the lecture file
# explicitly asks students to edit this line for their own machine.
#setwd("c:/STA303/lectures/lec5_pre")
setwd("/media/guerzhoy/Windows/STA303/lectures/lec5_pre")

# Titanic passenger data; drop rows with missing age so that models
# with and without age are fit to the same set of observations.
titanic <- read.csv("titanic.csv")
titanic <- titanic[!is.na(titanic$age),]

Drop in Deviance Tests

# Drop-in-deviance (likelihood-ratio) test: does adding age improve the
# fit beyond sex alone?  The two models must be nested for this test.
full <- glm(survived~sex+age, family=binomial(link="logit"), data=titanic)
reduced <- glm(survived~sex, family=binomial(link="logit"), data=titanic)
# anova() with test="LRT" compares the drop in residual deviance
# (here 0.67 on 1 df) against a chi-squared distribution.
anova(reduced, full, test="LRT")
## Analysis of Deviance Table
## 
## Model 1: survived ~ sex
## Model 2: survived ~ sex + age
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1044     1102.0                     
## 2      1043     1101.3  1   0.6694   0.4133

Let’s compute this quantity manually:

# Upper-tail chi-squared probability for a deviance drop of 0.6694 on
# 1 df.  lower.tail=FALSE is numerically safer than 1 - pchisq() (it
# avoids cancellation for very small p-values) and returns the same
# value here.
pchisq(0.6694, df=1, lower.tail=FALSE)
## [1] 0.4132609

So we cannot reject the hypothesis that the coefficient for age is 0. (Of course, this is also confirmed by the Wald Test.)

Can we reject the hypothesis that passenger class (pclass) doesn’t matter?

# Same drop-in-deviance test, now for passenger class.
# NOTE(review): pclass contributes only 1 df below, so it is being
# treated as numeric here; if class should be categorical, the model
# would need factor(pclass) — confirm intent.
full <- glm(survived~age+pclass, family=binomial(link="logit"), data=titanic)
reduced <- glm(survived~age, family=binomial(link="logit"), data=titanic)
# A deviance drop of 155.2 on 1 df gives an essentially zero p-value.
anova(reduced, full, test="LRT")
## Analysis of Deviance Table
## 
## Model 1: survived ~ age
## Model 2: survived ~ age + pclass
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1044     1411.4                          
## 2      1043     1256.2  1    155.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Apparently yes.

AIC

Let’s try AIC for comparing the full and reduced models:

# Compare the two models by AIC (deviance + 2 * number of parameters);
# lower AIC is better.  The printed glm summary reports the AIC.
full <- glm(survived~sex+age, family=binomial(link="logit"), data=titanic)
full
## 
## Call:  glm(formula = survived ~ sex + age, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Coefficients:
## (Intercept)      sexmale          age  
##    1.235414    -2.460689    -0.004254  
## 
## Degrees of Freedom: 1045 Total (i.e. Null);  1043 Residual
## Null Deviance:       1415 
## Residual Deviance: 1101  AIC: 1107
reduced <- glm(survived~sex, family=binomial(link="logit"), data=titanic)
reduced
## 
## Call:  glm(formula = survived ~ sex, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Coefficients:
## (Intercept)      sexmale  
##       1.112       -2.467  
## 
## Degrees of Freedom: 1045 Total (i.e. Null);  1044 Residual
## Null Deviance:       1415 
## Residual Deviance: 1102  AIC: 1106

AIC says use the reduced model. Intuition: the reduction in deviance is not large enough to compensate for the additional parameter. The residual deviance goes down by only about 0.7 (1102.0 to 1101.3), but the AIC penalty goes up by 2 (one extra parameter, multiplied by 2) when we go to the full model, so the full model’s AIC is higher.

Classification

Let’s load the iris dataset.

library(datasets)
data("iris")
# Keep only the two species we want to classify; %in% against a set is
# clearer than chaining == comparisons with |.
# Note: subset() keeps the unused "setosa" factor level; glm() drops
# unused levels of the response when fitting, so the later model works.
iris_small <- subset(iris, Species %in% c("versicolor", "virginica"))

Plotting the iris dataset:

# Scatterplot of the two species in petal-width / petal-length space,
# coloured by species.
ggplot(data = iris_small, mapping = aes(x = Petal.Width, y = Petal.Length)) +
  geom_point(mapping = aes(colour = factor(Species)))

Prediction:

# Logistic regression for species; with a factor response, the first
# level is "failure" and the rest "success", so this models
# P(virginica).
fit <- glm(Species ~ Petal.Width + Petal.Length, data = iris_small,
           family = "binomial")

# Predicted probability of virginica for one new flower.
# type = "response" applies the inverse logit for us — equivalent to
# plogis() on the linear predictor.
new_data <- data.frame(Petal.Width = 1.75, Petal.Length = 5)
predict(fit, newdata = new_data, type = "response")
##         1 
## 0.8559489

Predictions

# Threshold the fitted probabilities: predict "virginica" when
# P(virginica) exceeds the cutoff, "versicolor" otherwise.
p_cutoff <- 0.5
above_cutoff <- fit$fitted.values > p_cutoff
iris_small$pred <- ifelse(above_cutoff, "virginica", "versicolor")

How well did we do? We are computing how well the predictions match the correct classes.

# Accuracy: the proportion of flowers whose predicted label equals the
# true species.
with(iris_small, mean(pred == Species))
## [1] 0.94

Not bad!

Let’s try this with the Titanic data

# Same exercise on the Titanic data: predict survival from sex and age.
p_cutoff <- 0.5  # cutoff on the predicted survival probability
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"),
           data = titanic)
titanic$pred <- ifelse(fit$fitted.values > p_cutoff, 1, 0)
# Accuracy on the (training) data.
with(titanic, mean(pred == survived))
## [1] 0.7791587

What about the cut-offs? Let’s say that you want to sell insurance to someone. Maybe it’s more costly (because of the cost of risk) to sell insurance to someone who will die than to refuse to sell insurance to someone who will live.

# Asymmetric costs: selling insurance to a passenger who dies is
# assumed ten times as costly as refusing one who survives.
p_cutoff <- 0.5
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"),
           data = titanic)
titanic$pred <- ifelse(fit$fitted.values > p_cutoff, 1, 0)

cost_sell_die <- 10
cost_refuse_survive <- 1

# Sold to passengers who died (predicted survive, actually died).
n_sell_die <- sum(titanic$pred == 1 & titanic$survived == 0)
total_cost_sell_die <- cost_sell_die * n_sell_die
# Refused passengers who survived (predicted die, actually survived).
n_refuse_survive <- sum(titanic$pred == 0 & titanic$survived == 1)
total_cost_refuse_survive <- cost_refuse_survive * n_refuse_survive
total_cost_sell_die + total_cost_refuse_survive
## [1] 1095

Let’s try this with two different cutoffs:

# Cutoff 0.75: only "sell" (predict survive) when the predicted
# survival probability is above 0.75.  Fewer risky sales, so the total
# cost goes down under these asymmetric costs.
p_cutoff <- 0.75
fit <- glm(survived~sex+age, family=binomial(link="logit"), data=titanic)
# NOTE(review): pred is wrapped in factor() here but was numeric in the
# previous chunk; the == comparisons below still work because the
# factor labels "0"/"1" match, but the inconsistency is worth cleaning up.
titanic$pred <- factor(ifelse(fit$fitted.values > p_cutoff, 1, 0))

cost_sell_die <- 10
cost_refuse_survive <- 1

# Cost of selling to passengers who died.
total_cost_sell_die <- cost_sell_die*sum((titanic$pred==1) & (titanic$survived==0))
# Cost of refusing passengers who survived.
total_cost_refuse_survive <- cost_refuse_survive*sum((titanic$pred==0) & (titanic$survived==1))
total_cost_sell_die+total_cost_refuse_survive
## [1] 981
# Cutoff 0.5 again for comparison: a higher total cost under the same
# 10:1 cost structure.
p_cutoff <- 0.5
fit <- glm(survived~sex+age, family=binomial(link="logit"), data=titanic)
titanic$pred <- factor(ifelse(fit$fitted.values > p_cutoff, 1, 0))

cost_sell_die <- 10
cost_refuse_survive <- 1

total_cost_sell_die <- cost_sell_die*sum((titanic$pred==1) & (titanic$survived==0))
total_cost_refuse_survive <- cost_refuse_survive*sum((titanic$pred==0) & (titanic$survived==1))
total_cost_sell_die+total_cost_refuse_survive
## [1] 1095

Can also use the confusion matrix:

# Confusion matrix: rows are the predictions, columns the true outcome.
table(pred = titanic$pred, survived = titanic$survived)
##     survived
## pred   0   1
##    0 523 135
##    1  96 292

Overfitting: Introduction (to be continued Wednesday)

# Overfitting demo: fit a six-predictor model on only the first 20
# passengers, then compare accuracy on those 20 vs. the full dataset.
titanic_small <- titanic[1:20,]
# NOTE(review): no family= is given, so this fits a Gaussian linear
# model rather than logistic regression — confirm whether
# family=binomial was intended.  With 20 rows and this many predictors
# the fit is rank-deficient, hence the warnings below.
fit<-glm(survived~pclass+sex+age+sibsp+parch+fare, data=titanic_small)
p_cutoff <- 0.5
# Predictions for ALL passengers from the model fit to just 20 rows.
titanic$pred <- ifelse(predict(fit, titanic) > p_cutoff, 1, 0)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
titanic_small$pred <- ifelse(predict(fit, titanic_small) > p_cutoff, 1, 0)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Accuracy on the 20 training rows looks decent...
mean(titanic_small$pred==titanic_small$survived, na.rm=TRUE)
## [1] 0.65
# ...but accuracy on the full data is worse than guessing.
mean(titanic$pred==titanic$survived, na.rm=TRUE)
## [1] 0.4421053

Cross-validation: split your dataset into a “training set” and a “validation set”; fit your model on the training set, and then check its performance on the validation set.