knitr::opts_chunk$set(echo = TRUE)

# Use library() rather than require(): require() returns FALSE on a missing
# package instead of erroring, which silently masks a broken setup.
library(ggplot2)
library(tigerstats)
library(xtable)
library(lattice)

# CHANGE THIS to the directory containing titanic.csv.
# NOTE(review): setwd() inside a script hurts portability; prefer knitting
# from the project directory so relative paths work unchanged.
#setwd("c:/STA303/lectures/lec5_pre")
setwd("/media/guerzhoy/Windows/STA303/lectures/lec5_pre")

titanic <- read.csv("titanic.csv")
# Drop passengers with missing age, so that models with and without `age`
# are fit on the same rows (needed for a valid likelihood-ratio comparison).
titanic <- titanic[!is.na(titanic$age), ]
# Fit two nested logistic regressions and compare them with a
# likelihood-ratio test: does adding `age` to the sex-only model
# significantly reduce the residual deviance?
reduced <- glm(survived ~ sex,       family = binomial(link = "logit"), data = titanic)
full    <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
anova(reduced, full, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: survived ~ sex
## Model 2: survived ~ sex + age
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1044 1102.0
## 2 1043 1101.3 1 0.6694 0.4133
Let’s compute this quantity manually:
# Upper-tail chi-square probability P(X^2_1 > 0.6694) -- the same p-value
# that anova() reported for the deviance drop of 0.6694 on 1 df.
pchisq(0.6694, df = 1, lower.tail = FALSE)
## [1] 0.4132609
So we cannot reject the hypothesis that the coefficient for age is 0. (Of course, this is also confirmed by the Wald Test.)
Can we reject the hypothesis that the class doesn’t matter?
# Same LRT machinery, now testing whether passenger class matters
# after controlling for age.
reduced <- glm(survived ~ age,          family = binomial(link = "logit"), data = titanic)
full    <- glm(survived ~ age + pclass, family = binomial(link = "logit"), data = titanic)
anova(reduced, full, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: survived ~ age
## Model 2: survived ~ age + pclass
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1044 1411.4
## 2 1043 1256.2 1 155.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Apparently yes.
Let’s try using AIC for model selection:
# Refit the sex + age model and print it; the printout reports the AIC.
full <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
print(full)
##
## Call: glm(formula = survived ~ sex + age, family = binomial(link = "logit"),
## data = titanic)
##
## Coefficients:
## (Intercept) sexmale age
## 1.235414 -2.460689 -0.004254
##
## Degrees of Freedom: 1045 Total (i.e. Null); 1043 Residual
## Null Deviance: 1415
## Residual Deviance: 1101 AIC: 1107
# The sex-only model, for comparison of its AIC against the full model's.
reduced <- glm(survived ~ sex, family = binomial(link = "logit"), data = titanic)
print(reduced)
##
## Call: glm(formula = survived ~ sex, family = binomial(link = "logit"),
## data = titanic)
##
## Coefficients:
## (Intercept) sexmale
## 1.112 -2.467
##
## Degrees of Freedom: 1045 Total (i.e. Null); 1044 Residual
## Null Deviance: 1415
## Residual Deviance: 1102 AIC: 1106
AIC says use the reduced model. Intuition: the reduction in deviance is not large enough to compensate for the additional parameter. The Residual Deviance (just 2*Deviance in the slides) goes down by 1, but the number of parameters goes up by 1 (and gets multiplied by 2) when we go to the full model.
Let’s load the iris dataset.
library(datasets)
data("iris")
# Keep only the two species we will discriminate between with logistic
# regression. NOTE(review): the unused "setosa" factor level is retained
# here, exactly as in the original subset.
iris_small <- subset(iris, Species %in% c("virginica", "versicolor"))
Plotting the iris dataset:
# Scatterplot of petal length against petal width, coloured by species.
ggplot(iris_small, aes(x = Petal.Width, y = Petal.Length)) +
  geom_point(aes(colour = factor(Species)))
Prediction:
# Logistic regression of species on the two petal measurements, then the
# predicted probability of "virginica" for one new flower.
# type = "response" applies the inverse logit, equivalent to
# plogis() of the linear predictor.
fit <- glm(Species ~ Petal.Width + Petal.Length, data = iris_small, family = "binomial")
new_data <- data.frame(Petal.Width = 1.75, Petal.Length = 5)
predict(fit, newdata = new_data, type = "response")
## 1
## 0.8559489
Predictions
# Classify each flower: fitted probability above the cutoff -> "virginica",
# otherwise "versicolor".
p_cutoff <- .5
predicted_virginica <- fit$fitted.values > p_cutoff
iris_small$pred <- ifelse(predicted_virginica, "virginica", "versicolor")
How well did we do? We are computing how well the predictions match to the correct classes.
# Proportion of flowers whose predicted class matches the true species.
with(iris_small, mean(pred == Species))
## [1] 0.94
Not bad!
Let’s try this with the Titanic data
# Titanic: predict survived = 1 whenever the fitted probability exceeds
# the cutoff, then measure classification accuracy.
p_cutoff <- 0.5  # cutoff for the fitted probability pi
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
survived_hat <- ifelse(fit$fitted.values > p_cutoff, 1, 0)
titanic$pred <- survived_hat
mean(titanic$pred == titanic$survived)  # accuracy
## [1] 0.7791587
What about the cut-offs? Let’s say that you want to sell insurance to someone. Maybe it’s more costly (because of the cost of risk) to sell insurance to someone who will die than to refuse to sell insurance to someone who will live.
# Asymmetric misclassification costs: selling insurance to someone who
# dies costs 10 units; refusing someone who survives costs 1 unit.
p_cutoff <- 0.5
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
titanic$pred <- ifelse(fit$fitted.values > p_cutoff, 1, 0)
cost_sell_die <- 10
cost_refuse_survive <- 1
n_sell_die <- sum(titanic$pred == 1 & titanic$survived == 0)
n_refuse_survive <- sum(titanic$pred == 0 & titanic$survived == 1)
total_cost_sell_die <- cost_sell_die * n_sell_die
total_cost_refuse_survive <- cost_refuse_survive * n_refuse_survive
total_cost_sell_die + total_cost_refuse_survive
## [1] 1095
Let’s try this with two different cutoffs:
# Total cost with a stricter cutoff (0.75): sell insurance only when the
# predicted survival probability exceeds 0.75.
p_cutoff <- 0.75
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
# Store predictions as 0/1 numerics, consistent with the other cutoff blocks.
# (The original wrapped this in factor(), which makes `pred == 1` compare
# factor levels as strings -- it happens to work, but is fragile and
# inconsistent with the rest of the script.)
titanic$pred <- ifelse(fit$fitted.values > p_cutoff, 1, 0)
cost_sell_die <- 10
cost_refuse_survive <- 1
total_cost_sell_die <- cost_sell_die*sum((titanic$pred==1) & (titanic$survived==0))
total_cost_refuse_survive <- cost_refuse_survive*sum((titanic$pred==0) & (titanic$survived==1))
total_cost_sell_die+total_cost_refuse_survive
## [1] 981
# Re-run the cost computation at cutoff 0.5 for direct comparison with 0.75.
p_cutoff <- 0.5
fit <- glm(survived ~ sex + age, family = binomial(link = "logit"), data = titanic)
# 0/1 numeric predictions, consistent with the earlier blocks.
# (The original wrapped this in factor(), which makes `pred == 1` compare
# factor levels as strings -- it happens to work, but is fragile and
# inconsistent with the rest of the script.)
titanic$pred <- ifelse(fit$fitted.values > p_cutoff, 1, 0)
cost_sell_die <- 10
cost_refuse_survive <- 1
total_cost_sell_die <- cost_sell_die*sum((titanic$pred==1) & (titanic$survived==0))
total_cost_refuse_survive <- cost_refuse_survive*sum((titanic$pred==0) & (titanic$survived==1))
total_cost_sell_die+total_cost_refuse_survive
## [1] 1095
Can also use the confusion matrix:
# Confusion matrix: rows = predicted class, columns = actual survival.
# Naming the arguments gives the same "pred"/"survived" dimension labels
# as the with() form.
table(pred = titanic$pred, survived = titanic$survived)
## survived
## pred 0 1
## 0 523 135
## 1 96 292
# Overfitting demo: fit a model on only the first 20 passengers, then
# compare accuracy on those training rows vs. the full dataset.
titanic_small <- titanic[1:20,]
# NOTE(review): no family= is given, so this is a *gaussian* glm (a linear
# probability model), not logistic regression as used everywhere above --
# confirm whether family = binomial was intended here.
fit<-glm(survived~pclass+sex+age+sibsp+parch+fare, data=titanic_small)
p_cutoff <- 0.5
# Predict on the full dataset; the 20-row fit is rank-deficient, hence
# the warning below.
titanic$pred <- ifelse(predict(fit, titanic) > p_cutoff, 1, 0)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
titanic_small$pred <- ifelse(predict(fit, titanic_small) > p_cutoff, 1, 0)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Accuracy on the 20 training rows ...
mean(titanic_small$pred==titanic_small$survived, na.rm=TRUE)
## [1] 0.65
# ... vs. accuracy on all rows: far worse than the ~0.78 the sex+age model
# achieved, i.e. the small-sample fit does not generalize.
mean(titanic$pred==titanic$survived, na.rm=TRUE)
## [1] 0.4421053
Cross-validation: split your dataset into a “training set” and a “validation set”; fit your model on the training set, and check its performance on the “validation set”.