# RPART and TREE can be used for both classification and regression trees
library(MASS)

# Implementation in RPART (dataset: cpus)
library(rpart)
data(cpus)
dim(cpus)
# [1] 209 9
cpus[1:4,]
# *****************************************************
#             name syct mmin  mmax cach chmin chmax perf estperf
# 1  ADVISOR 32/60  125  256  6000  256    16   128  198     199
# 2  AMDAHL 470V/7   29 8000 32000   32     8    32  269     253
# 3  AMDAHL 470/7A   29 8000 32000   32     8    32  220     253
# 4 AMDAHL 470V/7B   29 8000 32000   32     8    32  172     253
# *****************************************************
names(cpus[,2:8])
# *****************************************************
# [1] "syct"  "mmin"  "mmax"  "cach"  "chmin" "chmax" "perf"
# *****************************************************

# RPART differs from the TREE function mainly in its handling of surrogate variables.
# In most other details it follows Breiman et al. quite closely.
cpus.rp <- rpart(log10(perf) ~ ., cpus[ ,2:8], cp=1e-3)
post(cpus.rp, title="Plot of rpart object cpus.rp",
     filename="C:\\AnaMaria\\CpusTree.eps", horizontal=F, pointsize=8)
print(cpus.rp, cp=0.001)
# *****************************************************
# n= 209
#
# node), split, n, deviance, yval
#       * denotes terminal node
#
#  1) root 209 43.1155400 1.753333
#    2) cach< 27 143 11.7908500 1.524647
#      4) mmax< 6100 78 3.8937440 1.374824
#        8) mmax< 1750 12 0.7842516 1.088732 *
# ------------------------------------
# *****************************************************

# The tree has NOT been pruned yet. To prune, first use PRINTCP
# to print the information stored in the RPART object.
printcp(cpus.rp)
# *****************************************************
# Regression tree:
# rpart(formula = log10(perf) ~ ., data = cpus[, 2:8], cp=0.001)
#
# Variables actually used in tree construction:
# [1] cach  chmax chmin mmax  syct
#
# Root node error: 43.116/209 = 0.20629
#
# n= 209
#
#           CP nsplit rel error  xerror     xstd
# 1  0.5492697      0   1.00000 1.02128 0.098176
# 2  0.0893390      1   0.45073 0.47818 0.048282
# 3  0.0328159      3   0.27376 0.32234 0.031711
# 4  0.0269220      4   0.24094 0.31461 0.030093
# 5  0.0185561      5   0.21402 0.28854 0.027477
# 6  0.0094604      9   0.14708 0.25858 0.026232
# 7  0.0054766     10   0.13762 0.22744 0.024714
# 8  0.0043985     12   0.12692 0.20458 0.022625
# 9  0.0022883     13   0.12252 0.20642 0.022525
# 10 0.0014131     15   0.11796 0.20760 0.022741
# 11 0.0010000     16   0.11655 0.20499 0.022640
# *****************************************************

# Graphically, we can examine the output of PLOTCP.
# Note that RPART uses 10-fold cross-validation to determine the
# cost-complexity parameter "cp".
plotcp(cpus.rp)
title("Pruning: choosing parameter cp")
# From the plot, estimate cp=0.06 as the best value for pruning.
cpus.rp.pr <- prune(cpus.rp, cp=0.06)
cpus.rp <- rpart(log10(perf) ~ ., cpus[ ,2:8])
# cp:   the cost-complexity parameter
# best: the number of nodes to which to prune a grown tree
# PLOTCP is used to plot the complexity parameter table for an RPART fit.
plotcp(cpus.rp)
# *****************************************************
# Plot not handed out
# *****************************************************
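# As an aside, a minimal sketch of how cp could also be chosen programmatically
# from the cp table stored in the rpart object (the same table that PRINTCP
# displays), rather than read off the plotcp() plot by eye.
# The object names cp.min and cpus.rp.min below are illustrative, not from the handout.
cpt    <- cpus.rp$cptable                          # columns: CP, nsplit, rel error, xerror, xstd
cp.min <- cpt[which.min(cpt[, "xerror"]), "CP"]    # cp with the smallest cross-validated error
cpus.rp.min <- prune(cpus.rp, cp=cp.min)           # prune back to that value of cp (illustrative name)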
cpus.rp.pr <- prune(cpus.rp, cp=0.006)
post(cpus.rp.pr, title="Plot of rpart object cpus.rp.pr",
     filename="C:\\AnaMaria\\CpusTree.eps", horizontal=F, pointsize=8)
summary(cpus.rp.pr)
# *****************************************************
# Call:
# rpart(formula = log10(perf) ~ ., data = cpus[, 2:8], cp = 0.001)
#   n= 209
#
#             CP nsplit rel error    xerror       xstd
# 1  0.549269710      0 1.0000000 1.0212769 0.09817555
# 2  0.089339015      1 0.4507303 0.4781812 0.04828199
# ------------------------------------------
# 9  0.009460414      9 0.1470831 0.2585843 0.02623156
# 10 0.006000000     10 0.1376227 0.2274426 0.02471429
#
# Node number 1: 209 observations,  complexity param=0.5492697
#   mean=1.753333, MSE=0.2062945
#   left son=2 (143 obs) right son=3 (66 obs)
#   Primary splits:
#       cach  < 27    to the left,  improve=0.5492697, (0 missing)
#       mmax  < 14000 to the left,  improve=0.4942141, (0 missing)
#       chmin < 7.5   to the left,  improve=0.4822048, (0 missing)
#       mmin  < 3550  to the left,  improve=0.4415438, (0 missing)
#       syct  < 49    to the right, improve=0.4267197, (0 missing)
#   Surrogate splits:
#       chmin < 7.5   to the left,  agree=0.856, adj=0.545, (0 split)
#       mmin  < 3550  to the left,  agree=0.842, adj=0.500, (0 split)
#       mmax  < 18485 to the left,  agree=0.823, adj=0.439, (0 split)
#       syct  < 49    to the right, agree=0.809, adj=0.394, (0 split)
#       chmax < 22    to the left,  agree=0.780, adj=0.303, (0 split)
#
# Node number 2: 143 observations,  complexity param=0.08933901
#   mean=1.524647, MSE=0.08245348
#   left son=4 (78 obs) right son=5 (65 obs)
#   Primary splits:
#       mmax  < 6100  to the left,  improve=0.3266856, (0 missing)
#       mmin  < 1750  to the left,  improve=0.2445926, (0 missing)
#       chmax < 4.5   to the left,  improve=0.2353382, (0 missing)
#       syct  < 325   to the right, improve=0.2248801, (0 missing)
#       cach  < 0.5   to the left,  improve=0.1877734, (0 missing)
#   Surrogate splits:
#       mmin  < 1250  to the left,  agree=0.720, adj=0.385, (0 split)
#       syct  < 102.5 to the right, agree=0.713, adj=0.369, (0 split)
# -------------------------------------------------
# *****************************************************

# NOTE. For classification trees, the default node impurity measure used by RPART
# when growing the tree is the Gini index. If cross-entropy is desired instead,
# specify parms=list(split="information") in the RPART call
# (see the sketch after the fgl.tr output below).

# Implementation in TREE (dataset: forensic glass, coded as fgl)
library(tree)
data(fgl)
dim(fgl)
# [1] 214 10
fgl[1:4,]
# *****************************************************
#      RI    Na   Mg   Al    Si    K   Ca Ba Fe type
# 1  3.01 13.64 4.49 1.10 71.78 0.06 8.75  0  0 WinF
# 2 -0.39 13.89 3.60 1.36 72.73 0.48 7.83  0  0 WinF
# 3 -1.82 13.53 3.55 1.54 72.99 0.39 7.78  0  0 WinF
# 4 -0.34 13.21 3.69 1.29 72.61 0.57 8.22  0  0 WinF
# *****************************************************
levels(fgl$type)
# *****************************************************
# [1] "WinF"  "WinNF" "Veh"   "Con"   "Tabl"  "Head"
# *****************************************************
fgl.tr <- tree(type ~ ., fgl)
summary(fgl.tr)
fgl.tr
# *****************************************************
# node), split, n, deviance, yval, (yprob)
#       * denotes terminal node
#
#  1) root 214 645.700 WinNF ( 0.327103 ...)
#    2) Mg < 2.695 61 159.200 Head ( 0.000000 ...)
#      4) Na < 13.785 24 40.160 Con ( 0.000000 ...)
#        8) Al < 1.38 8 6.028 WinNF ( 0.000000 ...) *
#        9) Al > 1.38 16 17.990 Con ( 0.000000 ...)
#         18) Fe < 0.085 10 0.000 Con ( 0.000000 ...) *
#         19) Fe > 0.085 6 7.638 WinNF ( 0.000000 ...) *
#    5) Na > 13.785 37 63.940 Head ( 0.000000 ...)
# ----------------------------------------------------
# *****************************************************
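# As noted above, the same classification tree could also be grown with RPART.
# A minimal sketch (the object names fgl.rp.gini and fgl.rp.info are illustrative,
# not from the handout): the first call uses the default Gini index, the second
# switches the impurity measure to cross-entropy via parms=list(split="information").
fgl.rp.gini <- rpart(type ~ ., data=fgl)                                   # Gini index (default)
fgl.rp.info <- rpart(type ~ ., data=fgl, parms=list(split="information"))  # cross-entropy
# The two impurity measures typically produce similar, though not identical, trees.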
plot(fgl.tr, type="uniform")
text(fgl.tr, all=T)
# Plot NOT handed out

fgl.tr1 <- snip.tree(fgl.tr, nodes = 9)
# The nodes could also be snipped off interactively, by clicking with the mouse
# on the terminal node. Note the slightly different command:
# fgl.tr1 <- snip.tree(fgl.tr)
fgl.tr1
# *****************************************************
# node), split, n, deviance, yval, (yprob)
#       * denotes terminal node
#
#  1) root 214 645.700 WinNF ( 0.327103 ...)
#    2) Mg < 2.695 61 159.200 Head ( 0.000000 ...)
#      4) Na < 13.785 24 40.160 Con ( 0.000000 ...)
#        8) Al < 1.38 8 6.028 WinNF ( 0.000000 ...) *
#        9) Al > 1.38 16 17.990 Con ( 0.000000 ...) *
#    5) Na > 13.785 37 63.940 Head ( 0.000000 ...)
# ----------------------------------------------------
# *****************************************************
# summary(fgl.tr1)

# For the classification tree, we plot the deviance against the size.
plot(prune.tree(fgl.tr))
# Suppose we decide that the "best" deviance is achieved at k=10,
# where k is the cost-complexity pruning parameter.
summary(prune.tree(fgl.tr, k=10))

# Instead, one could examine cross-validation plots:
# a K-fold cross-validation experiment to find the deviance/misclassifications
# as a function of the cost-complexity parameter k.
fgl.cv <- cv.tree(fgl.tr,, FUN=prune.tree, K=10)
# cv.tree randomly divides the training set, so we average the results of 5 runs.
for(i in 2:5) {fgl.cv$dev <- fgl.cv$dev + cv.tree(fgl.tr,, prune.tree)$dev}
fgl.cv$dev <- fgl.cv$dev/5
plot(fgl.cv)
title("Cross-validation plot for pruning")

# Examine misclassification for trees (either the overall misclassification
# or the misclassification at each node).
misclass.tree(fgl.tr)
# [1] 33
misclass.tree(prune.tree(fgl.tr, best=5))
# [1] 65

# One may also choose the cost-complexity parameter based on misclassification.
fgl.cv <- cv.tree(fgl.tr,, FUN=prune.misclass)
for(i in 2:5) {fgl.cv$dev <- fgl.cv$dev + cv.tree(fgl.tr,, prune.misclass)$dev}
fgl.cv$dev <- fgl.cv$dev/5
fgl.cv
# *****************************************************
# $size
# [1]  20  16  12  11   9   6   5   4   3   1
#
# $dev
# [1]  74.6  74.8  72.6  71.8  72.0  79.0  89.4  90.8  94.2 144.6
# *****************************************************

# As an alternative to prune.tree, one could use prune.misclass.
prune.misclass(fgl.tr)
# *****************************************************
# $size
# [1]  20  16  12  11   9   6   5   4   3   1
#
# $dev
# [1]  33  33  37  39  44  58  65  73  84 138
# *****************************************************

# ONE-s.e. RULE: take the smallest pruned tree whose error rate is within
# one standard error of the minimum (i.e. close to the minimum).
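# The one-s.e. rule is easiest to apply with RPART, since its cp table carries both
# the cross-validated error (xerror) and its standard error (xstd). A minimal sketch
# using the regression tree fitted earlier (the object names i.min, thresh, cp.1se
# and cpus.rp.1se are illustrative, not from the handout):
cpt    <- cpus.rp$cptable
i.min  <- which.min(cpt[, "xerror"])                        # row with the minimum cross-validated error
thresh <- cpt[i.min, "xerror"] + cpt[i.min, "xstd"]         # minimum error plus one standard error
cp.1se <- cpt[min(which(cpt[, "xerror"] <= thresh)), "CP"]  # simplest tree within one s.e. of the minimum
cpus.rp.1se <- prune(cpus.rp, cp=cp.1se)
# --------------------------------------------------------------
# ----------------------------END----------------------------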