knitr::opts_chunk$set(echo = TRUE)
require(ggplot2)
require(tigerstats)
require(xtable)
require(lattice)

#CHANGE THIS
#setwd("c:/STA303/lectures/lec5_pre")
setwd("/media/guerzhoy/Windows/STA303/lectures/lec5_pre")

titanic <- read.csv("titanic.csv")
# Drop passengers with missing age (log(age) is used as a predictor below)
titanic <- titanic[!is.na(titanic$age),]

Overfitting: Introduction

Let’s try to fit a fairly large model (six predictors) using just 12 data points:

set.seed(0)
# Fit a six-predictor logistic regression to a random sample of just 12 passengers
titanic_small <- titanic[sample(dim(titanic)[1], 12),]
fit <- glm(survived~pclass+sex+log(age)+sibsp+parch+fare, data=titanic_small, family=binomial)
# Classify a passenger as a survivor if the predicted probability exceeds 0.5
p_cutoff <- 0.5
titanic$pred <- ifelse(predict(fit, titanic, type="response") > p_cutoff, 1, 0)
titanic_small$pred <- ifelse(predict(fit, titanic_small, type="response") > p_cutoff, 1, 0)
mean(titanic_small$pred==titanic_small$survived, na.rm=TRUE)
## [1] 0.9166667
mean(titanic$pred==titanic$survived, na.rm=TRUE)
## [1] 0.5291866

It looks like we’re doing great on the small dataset, but not so great on the larger dataset.

This is a phenomenon known as overfitting: when the dataset is small, we are in danger of finding parameters that work very well on that small dataset (because they fit its random variation) but do not generalize to the larger dataset.

Cross-Validation

Cross-validation allows us to select the best model that the data support. The idea is to estimate how well each candidate model would predict new data.

One way to think about this is to hold out a small part of the data and try to predict it using the rest. For a dataset with N data points, fit the model on N-1 points (the training set), and see how well you predict the variable of interest on the 1 point that you held out (the validation set). Repeat this for each of the N points and average the errors. This is known as leave-one-out cross-validation.
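
Here is a minimal sketch of leave-one-out cross-validation done by hand (the two-predictor model is just a stand-in for whichever model we want to evaluate):

# Leave-one-out CV by hand: refit the model N times, each time predicting the single held-out row
loo_errors <- sapply(1:nrow(titanic), function(i) {
  fit_i <- glm(survived ~ sex + pclass, data = titanic[-i, ], family = binomial)
  pred_i <- predict(fit_i, titanic[i, ], type = "response") > 0.5
  pred_i != titanic$survived[i]
})
mean(loo_errors)  # estimated misclassification rate on new data (slow: N model fits)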

We also use K-fold cross-validation: we split the data into K folds, and each time we use (K-1) folds as the training set and 1 fold as the validation set, so that every fold serves as the validation set exactly once; we then average the K validation errors.
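
A sketch of the same idea with K folds (K = 5 and the model formula are placeholders; below we will let cv.glm from the boot package do this for us):

# K-fold CV by hand: randomly assign each row to one of K folds,
# then hold out each fold in turn and average the validation errors
K <- 5
folds <- sample(rep(1:K, length.out = nrow(titanic)))
cv_errors <- sapply(1:K, function(k) {
  fit_k <- glm(survived ~ sex + pclass, data = titanic[folds != k, ], family = binomial)
  pred_k <- predict(fit_k, titanic[folds == k, ], type = "response") > 0.5
  mean(pred_k != titanic$survived[folds == k])
})
mean(cv_errors)  # average validation error across the K folds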

If the goal is to fit a model that classifies new data as well as possible, we might pick the model that predicts the held-out data best in terms of classification error. If the goal is to fit as good a model as the data support, we might pick the model with the smallest deviance (largest likelihood) on the held-out data.

library(boot)
set.seed(0)
titanic_small <- titanic[sample(dim(titanic)[1], 20),]
fit_full <- glm(survived~pclass+sex+log(age)+sibsp+parch+fare, data=titanic_small, family=binomial)
fit_reduced <- glm(survived~sex, data=titanic_small, family=binomial)

# Misclassification rate: r is the 0/1 response, pi the predicted probability,
# and |r - pi| > 0.5 means the prediction is on the wrong side of the 0.5 cutoff
cost_classification <- function(r, pi) mean(abs(r-pi) > 0.5)
# Negative log-likelihood of the observations given the predicted probabilities
cost_negloglikelihood <- function(r, pi) -sum(r*log(pi) + (1-r)*log(1-pi))

cv.glm(data=titanic_small, glmfit=fit_full, cost=cost_negloglikelihood, K=20)$delta[1]
## [1] 1.116525
cv.glm(data=titanic_small, glmfit=fit_reduced, cost=cost_negloglikelihood, K=20)$delta[1]
## [1] 0.6198363
cv.glm(data=titanic_small, glmfit=fit_full, cost=cost_classification, K=20)$delta[1]
## [1] 0.55
cv.glm(data=titanic_small, glmfit=fit_reduced, cost=cost_classification, K=20)$delta[1]
## [1] 0.2

Cross-validation is generally preferable to AIC for model selection; in fact, AIC can be viewed as an approximation to leave-one-out cross-validation.
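
For reference, \(\mathrm{AIC} = 2k - 2\log\hat{L}\), where \(k\) is the number of fitted parameters and \(\hat{L}\) is the maximized likelihood: it rewards fit to the data while penalizing model size.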

Note that in this particular example, AIC would also work:

fit_full$aic   
## [1] 30.87021
fit_reduced$aic   
## [1] 23.12143

(A smaller AIC indicates a better model.)

Let’s apply this to something fun. I figured that the first letter of the person’s last name might work as a predictor of survival.

# Compute the first letter of each passenger's name (the name field starts with
# the last name), and exclude passengers whose names start with very rare letters
titanic$first_letter <- sapply(titanic$name, function(name) substr(name, 1,1))
titanic <- titanic[!(titanic$first_letter %in% c('v', 'U', 'Y')),]

fit_noletter <- glm(survived~sex+pclass, data=titanic, family=binomial)
# Note: without type="response", predict() returns log-odds, so the 0.5 cutoff here
# corresponds to a predicted survival probability of about 0.62
titanic$pred <- ifelse(predict(fit_noletter, titanic) > .5, 1, 0)
mean(titanic$pred==titanic$survived)
## [1] 0.7870906
fit_noletter$aic
## [1] 1013.58
fit_letter <- glm(survived~sex+pclass+first_letter, data=titanic, family=binomial)
titanic$pred <- ifelse(predict(fit_letter, titanic) > .5, 1, 0)
mean(titanic$pred==titanic$survived)
## [1] 0.8015414
fit_letter$aic
## [1] 1014.199

An improvement of over 1%! But AIC says we’re not really doing better. Let’s try cross-validation.

cv.glm(data=titanic, glmfit=fit_letter, cost=cost_classification, K= 10)$delta[1]
## [1] 0.2071291
cv.glm(data=titanic, glmfit=fit_noletter, cost=cost_classification, K=10)$delta[1]
## [1] 0.22158

Using the first letter wins! Note that the cross-validated classification error is lower for the first-letter model. Why does that make sense?

Here’s a concern – our observations are not independent. Maybe all we’re picking up on is that people from the same family had a better chance of surviving.

Is that true?

# Extract the last name (everything before the comma in the name field)
titanic$lastname <- sapply(titanic$name, function(name) strsplit(toString(name),",")[[1]][1])
fit_lastname <- glm(survived~lastname, data=titanic, family=binomial)
anova(fit_lastname, test="LRT")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: survived
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       1037    1404.73              
## lastname 692   1055.5       345     349.27 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s check the classification rate on the data the model was fit to:

titanic$pred <- ifelse(predict(fit_lastname, titanic) > .5, 1, 0)
mean(titanic$pred==titanic$survived)
## [1] 0.8988439

Of course, the AIC indicates that the model is actually worse, and we are overfitting by a lot, since many of the last names are unique. (Doing K-fold cross-validation here would be tricky: why? We’ll skip it.)
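
One reason it’s tricky, sketched below (the random split is purely for illustration): with lastname as a predictor, a random validation fold will almost surely contain last names that never appear in the training fold, and predict() cannot handle unseen factor levels.

set.seed(1)
val_rows <- sample(nrow(titanic), 100)  # pretend this is one validation fold
fit_train <- glm(survived ~ lastname, data = titanic[-val_rows, ], family = binomial)
# The next line stops with an error about lastname having new levels,
# which is why cv.glm cannot be applied directly to this model:
# predict(fit_train, titanic[val_rows, ], type = "response")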

# Count how many times each entry appears in a column
get_count <- function(entry, column){
  return(sum(entry==column))
}

get_counts <- function(column){
  return(sapply(column, get_count, column=column))
}

# Keep only passengers whose last name appears more than once,
# then compare models with and without last name as a predictor
titanic_nouniquenames <- titanic[get_counts(titanic$lastname)>1,]
fit_nouniquenames <- glm(survived~lastname+sex+pclass, family=binomial, data=titanic_nouniquenames)
fit_nonames <- glm(survived~sex+pclass, family=binomial, data=titanic_nouniquenames)
anova(fit_nouniquenames, fit_nonames, test="LRT")
## Analysis of Deviance Table
## 
## Model 1: survived ~ lastname + sex + pclass
## Model 2: survived ~ sex + pclass
##   Resid. Df Resid. Dev   Df Deviance  Pr(>Chi)    
## 1       343     168.38                            
## 2       537     534.87 -194  -366.48 9.776e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s try a quick-and-dirty version of cross-validation:

library(plyr)

# Shuffle the rows, then take one member of each family as a "training" observation
# and a different member as a "test" observation
titanic_nouniquenames <- titanic_nouniquenames[sample(nrow(titanic_nouniquenames)),]
titanic_train <- ddply(titanic_nouniquenames, "lastname", function(z) tail(z,1) )
titanic_test  <- ddply(titanic_nouniquenames, "lastname", function(z) head(z,1) )
# How often do the two family members have the same outcome?
mean(titanic_test$survived==titanic_train$survived)
## [1] 0.6358974

This is better than what we’d expect! (If family members’ outcomes were independent, with a survival rate of about 40%, two members of the same family would have the same outcome only about \(.4\times.4+.6\times.6 = .52\) of the time.)
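
As a quick sanity check of that baseline, we can plug the observed survival rate into the same formula: under independence, two family members agree with probability \(p^2 + (1-p)^2\).

# Agreement rate expected if family members' outcomes were independent
p <- mean(titanic_nouniquenames$survived)
p^2 + (1 - p)^2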

Let’s select a name at random from each family:

set.seed(0)
# Return n randomly chosen rows of a data frame
randomRows <- function(df, n){
   return(df[sample(nrow(df), n),])
}

# Keep one randomly chosen member of each family, then recompute the first letter
# and drop the rare first letters
titanic_uniquelast <- ddply(titanic, "lastname", function(z) randomRows(z, 1))
titanic_uniquelast$first_letter <- sapply(titanic_uniquelast$name, function(name) substr(name, 1,1))
titanic_uniquelast <- titanic_uniquelast[!(titanic_uniquelast$first_letter %in% c('v', 'U', 'Y', 'Q')),]

fit_noletter <- glm(survived~sex+pclass, data=titanic_uniquelast, family=binomial)
titanic_uniquelast$pred <- ifelse(predict(fit_noletter, titanic_uniquelast) > .5, 1, 0)
mean(titanic_uniquelast$pred==titanic_uniquelast$survived)
## [1] 0.7875723
fit_noletter$aic
## [1] 671.3337
fit_letter <- glm(survived~sex+pclass+first_letter, data=titanic_uniquelast, family=binomial)
titanic_uniquelast$pred <- ifelse(predict(fit_letter, titanic_uniquelast) > .5, 1, 0)
mean(titanic_uniquelast$pred==titanic_uniquelast$survived)
## [1] 0.800578
fit_letter$aic
## [1] 690.6101

Even more of an improvement (but maybe we got lucky…)

cv.glm(data=titanic_uniquelast, glmfit=fit_letter, cost=cost_classification, K= 10)$delta[1]
## [1] 0.2153179
cv.glm(data=titanic_uniquelast, glmfit=fit_noletter, cost=cost_classification, K=10)$delta[1]
## [1] 0.2124277

The effect went away! The first letter was just picking up on family membership (i.e., last names) after all.