# Classification trees for the spam data with rpart.
#
# Inputs (read from the working directory):
#   spam.data      - 4601 x 58 table; column V58 is the class label
#   spam.traintest - 0/1 indicator per row; rows with test == 0 are used to
#                    fit the trees, rows with test == 1 to evaluate them
#
# Recorded console output is kept as comments next to the call that made it.

library(MASS)
library(rpart)

# Load the data ----

spam.data <- read.table("spam.data")
dim(spam.data)
# [1] 4601   58

test <- scan("spam.traintest")
sum(test)
# [1] 1536

names(spam.data)
#  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11" "V12"
# [13] "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24"
# [25] "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36"
# [37] "V37" "V38" "V39" "V40" "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48"
# [49] "V49" "V50" "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58"

# Descriptive column names are only assigned at the very end of this script:
# the models below refer to V58, and the raw names from the repository have to
# be cleaned before a formula can use them (see the last section).

# A factor response makes rpart fit a classification tree.
spam.data$V58 <- factor(spam.data$V58)

# Large tree and its complexity table ----

spam.tree <- rpart(V58 ~ ., data = spam.data[test == 0, ], cp = 0.001)
printcp(spam.tree)
# output omitted, it's long (59 splits)

# Listing the attributes is a handy way to avoid printing the whole tree.
attributes(spam.tree)
# $names
#  [1] "frame"     "where"     "call"      "terms"     "cptable"   "splits"
#  [7] "method"    "parms"     "control"   "functions" "y"         "ordered"
# $class
# [1] "rpart"

attributes(spam.tree$cptable)
# $dim
# [1] 43  5
# $dimnames
# $dimnames[[1]]
#  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
# [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
# [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
# [43] "43"
# $dimnames[[2]]
# [1] "CP"        "nsplit"    "rel error" "xerror"    "xstd"

# Smallest cross-validated error (column 4 of the cp table).
min(spam.tree$cptable[, "xerror"])
# [1] 0.2998988

0.299 + 0.016  # adding on 1 standard error
# [1] 0.315
# this would lead to a tree with 28 splits and cp = 0.0027259

# Smaller trees and their test-set performance ----

# The book claimed 17 terminal nodes was best, and printcp(spam.tree)
# seems to indicate that cp = 0.0043 will do that ...
spam.tree2 <- rpart(V58 ~ ., data = spam.data[test == 0, ], cp = 0.0043)

# ... but this has 10 splits and 11 terminal nodes
spam.tree2$cptable[, "nsplit"]
#  1  2  3  4  5  6  7  8  9
#  0  1  2  4  5  6  7  8 10

# and slightly worse predictions than in Table 9.3.
# Rows are predicted classes, columns are true classes; entries are
# proportions of the test rows.
table(
  predict(spam.tree2, spam.data[test == 1, ], type = "class"),
  spam.data$V58[test == 1]
) / sum(test)
#              0          1
#   0 0.57942708 0.06184896
#   1 0.03320312 0.32552083

# but here's one that's a little better (yay!)
spam.tree3 <- rpart(V58 ~ ., data = spam.data[test == 0, ], cp = 0.0025)
table(
  predict(spam.tree3, spam.data[test == 1, ], type = "class"),
  spam.data$V58[test == 1]
) / sum(test)
#              0          1
#   0 0.58072917 0.05468750
#   1 0.03190104 0.33268229

# and finally this gives a pretty nice looking tree
plot(spam.tree3, uniform = TRUE)
text(spam.tree3, use.n = TRUE, cex = 0.8)

# Descriptive variable names ----

# spam.names is not created in this file; it is assumed to be a character
# vector holding the 57 predictor names read in from the repository.
#
# Assigning those names directly and calling the response "email" produced
#   Error in `[.data.frame`(frame, predictors) : undefined columns selected
# The raw names, as recorded from that attempt, were:
#  [1] "make"       "address"    "all"        "3d"         "our"
#  [6] "over"       "remove"     "internet"   "order"      "mail"
# [11] "receive"    "will"       "people"     "report"     "addresses"
# [16] "free"       "business"   "email"      "you"        "credit"
# [21] "your"       "font"       "000"        "money"      "hp"
# [26] "hpl"        "george"     "650"        "lab"        "labs"
# [31] "telnet"     "857"        "data"       "415"        "85"
# [36] "technology" "1999"       "parts"      "pm"         "direct"
# [41] "cs"         "meeting"    "original"   "project"    "re"
# [46] "edu"        "table"      "conference" ";"          "("
# [51] "["          "!"          "$"          "#"          "average"
# [56] "longest"    "total"      NA
#
# Several of these are not syntactically valid R names ("3d", "000", ";",
# "(", "$", ...), and naming the response "email" duplicated predictor 18.
# NOTE(review): that is the most likely cause of the error above. Formula
# interfaces need valid, unique column names, so the names are cleaned first.

# Readable stand-ins for the single-character punctuation predictors, which
# make.names() would otherwise all turn into "X.", "X..1", ...
punct.labels <- c(
  ";" = "char_semicolon",
  "(" = "char_paren",
  "[" = "char_bracket",
  "!" = "char_exclaim",
  "$" = "char_dollar",
  "#" = "char_hash"
)
clean.names <- as.character(spam.names)
is.punct <- clean.names %in% names(punct.labels)
clean.names[is.punct] <- unname(punct.labels[clean.names[is.punct]])

# Make whatever is left syntactic and unique (e.g. "3d" becomes "X3d").
clean.names <- make.names(clean.names, unique = TRUE)

# One name per predictor, and none may clash with the response name.
stopifnot(
  length(clean.names) == ncol(spam.data) - 1L,
  !("spam" %in% clean.names)
)

# Renaming happens last because the trees fitted above refer to V1..V58.
names(spam.data) <- c(clean.names, "spam")

# As in the original attempt this uses every row of spam.data; pass
# spam.data[test == 0, ] instead to refit on the training rows only.
rpart(spam ~ ., data = spam.data, method = "class", cp = 0.0025)