--- title: "Classification" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r message=FALSE} lines <- "AGE,SEX,STATUS 23,MALE,DIED 40,FEMALE,SURVIVED 40,MALE,SURVIVED 30,MALE,DIED 28,MALE,DIED 40,MALE,DIED 45,FEMALE,DIED 62,MALE,DIED 65,MALE,DIED 45,FEMALE,DIED 25,FEMALE,DIED 28,MALE,SURVIVED 28,MALE,DIED 23,MALE,DIED 22,FEMALE,SURVIVED 23,FEMALE,SURVIVED 28,MALE,SURVIVED 15,FEMALE,SURVIVED 47,FEMALE,DIED 57,MALE,DIED 20,FEMALE,SURVIVED 18,MALE,SURVIVED 25,MALE,DIED 60,MALE,DIED 25,MALE,SURVIVED 20,MALE,SURVIVED 32,MALE,SURVIVED 32,FEMALE,SURVIVED 24,FEMALE,SURVIVED 30,MALE,SURVIVED 15,MALE,DIED 50,FEMALE,DIED 21,FEMALE,SURVIVED 25,MALE,DIED 46,MALE,SURVIVED 32,FEMALE,SURVIVED 30,MALE,DIED 25,MALE,DIED 25,MALE,DIED 25,MALE,DIED 30,MALE,DIED 35,MALE,DIED 23,MALE,SURVIVED 24,MALE,DIED 25,FEMALE,SURVIVED" con <- textConnection(lines) donner <- read.csv(con, sep=",") require(Sleuth3) krunnit <- case2101 ``` Make sure you understand what this function does and why. It's not as important to understand the `sapply` stuff below. ```{r} ctable_line <- function(cutoff, probs, y){ correct <- mean( (probs >= cutoff) == y) false_pos <- sum((y==0) * (probs > cutoff))/sum(y==0) false_neg <- sum((y==1) * (probs < cutoff))/sum(y==1) return(data.frame(cutoff=cutoff, correct=correct, false_pos=false_pos, false_neg=false_neg)) } #sample call: #donner$STATUS<- relevel(donner$STATUS, "DIED") #fit <- glm(STATUS~SEX+AGE, family=binomial, data=donner) #ctable_line(cutoff=.6, probs=fit$fitted.values, y=donner$STATUS=="SURVIVED") ``` Let's now use this function to obtain the correction classification rate, false negative rate, and false positive rate for the Donner data: ```{r} donner$STATUS<- relevel(donner$STATUS, "DIED") fit <- glm(STATUS~SEX+AGE, family=binomial, data=donner) cutoffs <- seq(0, 1, .02) t(sapply(cutoffs, ctable_line, probs= fit$fitted.values, y=donner$STATUS=="SURVIVED")) ``` ```{r} ctable_line_binomial <- function(cutoff, probs, successes, fails){ correct <- sum(successes*(probs > cutoff) + fails*(probs<=cutoff))/(sum(successes)+sum(fails)) false_pos <- sum(fails * (probs > cutoff))/sum(fails) false_neg <- sum(successes * (probs <= cutoff))/sum(successes) return(data.frame(cutoff=cutoff, correct=correct, false_pos=false_pos, false_neg=false_neg)) } #sample call: fit <- glm(cbind(Extinct, AtRisk-Extinct)~Area, family=binomial, data=krunnit) ctable_line_binomial(cutoff=.6, probs=fit$fitted.values, successes=krunnit$Extinct, fails=(krunnit$AtRisk-krunnit$Extinct)) fit <- glm(cbind(Extinct, AtRisk-Extinct)~Area, family=binomial, data=krunnit) cutoffs <- seq(0, 1, .02) t(sapply(cutoffs, ctable_line_binomial, probs= fit$fitted.values, successes=krunnit$Extinct, fails=(krunnit$AtRisk-krunnit$Extinct))) ```