# RPART and TREE can be used for both classification and regression trees.
#
# This script fits, inspects and prunes
#   1. a regression tree for log10(perf) on the `cpus` data, and
#   2. a classification tree for glass `type` on the `fgl` data,
# using rpart. Blocks delimited by rows of asterisks are transcripts of the
# console output; rows of dashes inside a transcript mark elided output.
# Side effects: draws to the active graphics device and writes the files
# "Cpus.tree.ps" and "Cpus.treeprune.ps" to the working directory.

library(MASS)   # supplies the cpus and fgl data sets
library(rpart)

# Regression tree ----
# Implementation in RPART (dataset: cpus)

data(cpus)
dim(cpus)
# [1] 209 9
cpus[1:4, ]
# *****************************************************
#             name syct mmin  mmax cach chmin chmax perf estperf
# 1  ADVISOR 32/60  125  256  6000  256    16   128  198     199
# 2  AMDAHL 470V/7   29 8000 32000   32     8    32  269     253
# 3  AMDAHL 470/7A   29 8000 32000   32     8    32  220     253
# 4 AMDAHL 470V/7B   29 8000 32000   32     8    32  172     253
# *****************************************************
names(cpus[, 2:8])
# *****************************************************
# [1] "syct" "mmin" "mmax" "cach" "chmin" "chmax" "perf"
# *****************************************************

# RPART differs from the TREE function mainly in its handling of surrogate
# variables. In most details it follows Breiman et al. quite closely.
# Columns 2:8 drop `name` and `estperf`, so `.` expands to the six hardware
# predictors only. cp = 1e-3 grows a deliberately large tree to prune later.
cpus.rp <- rpart(log10(perf) ~ ., data = cpus[, 2:8], cp = 1e-3)
post(cpus.rp,
     title. = "Plot of rpart object cpus.rp",
     filename = "Cpus.tree.ps",
     horizontal = FALSE,
     pointsize = 8)
print(cpus.rp, cp = 0.001)
# *****************************************************
# n= 209
#
# node), split, n, deviance, yval
#       * denotes terminal node
#
#  1) root 209 43.1155400 1.753333
#    2) cach< 27 143 11.7908500 1.524647
#      4) mmax< 6100 78 3.8937440 1.374824
#        8) mmax< 1750 12 0.7842516 1.088732 *
# ------------------------------------
# *****************************************************

# The tree has NOT been pruned yet. To prune, use PRINTCP
# to print out the information stored in the RPART object.
printcp(cpus.rp)
# *****************************************************
#Regression tree:
#rpart(formula = log10(perf) ~ ., data = cpus[, 2:8], cp = 0.001)
#
#Variables actually used in tree construction:
#[1] cach  chmax chmin mmax  syct
#
#Root node error: 43.116/209 = 0.20629
#
#n= 209
#
#          CP nsplit rel error  xerror     xstd
#1  0.5492697      0   1.00000 1.00368 0.096587
#2  0.0893390      1   0.45073 0.47000 0.048065
#3  0.0876332      2   0.36139 0.44063 0.046430
#4  0.0328159      3   0.27376 0.30827 0.031482
#5  0.0269220      4   0.24094 0.31203 0.031598
#6  0.0185561      5   0.21402 0.28991 0.029923
#7  0.0167992      6   0.19546 0.26525 0.027540
#8  0.0157908      7   0.17866 0.26148 0.027114
#9  0.0094604      9   0.14708 0.25504 0.027174
#10 0.0054766     10   0.13762 0.22835 0.026250
#11 0.0052307     11   0.13215 0.22584 0.026189
#12 0.0043985     12   0.12692 0.22276 0.025918
#13 0.0022883     13   0.12252 0.21797 0.025834
#14 0.0022704     14   0.12023 0.21557 0.025784
#15 0.0014131     15   0.11796 0.21654 0.026096
#16 0.0010000     16   0.11655 0.21844 0.026433
# *****************************************************

# Graphically we can examine the output of PLOTCP, which plots the
# complexity parameter table of an RPART fit.
# Note that RPART uses a 10-fold cross-validation to determine the
# cost-complexity parameter "cp".
plotcp(cpus.rp)
title("Pruning: choosing parameter cp")
# *****************************************************
# Plot not handed out
# *****************************************************

# From the plot, estimate cp = 0.006 as the best value for pruning
# (cp is the cost-complexity parameter).
cpus.rp.pr <- prune(cpus.rp, cp = 0.006)
# No `main =` here: post() forwards extra arguments to postscript(), which
# does not accept `main`; the caption is set through `title.`.
post(cpus.rp.pr,
     title. = "Plot of rpart object cpus.rp.pr",
     filename = "Cpus.treeprune.ps",
     horizontal = FALSE,
     pointsize = 8)
summary(cpus.rp.pr)
# *****************************************************
# Call:
# rpart(formula = log10(perf) ~ ., data = cpus[, 2:8], cp = 0.001)
#   n= 209
#
#             CP nsplit rel error    xerror       xstd
# 1  0.549269710      0 1.0000000 1.0212769 0.09817555
# 2  0.089339015      1 0.4507303 0.4781812 0.04828199
# ------------------------------------------
# 9  0.009460414      9 0.1470831 0.2585843 0.02623156
# 10 0.006000000     10 0.1376227 0.2274426 0.02471429
#
# Node number 1: 209 observations, complexity param=0.5492697
#   mean=1.753333, MSE=0.2062945
#   left son=2 (143 obs) right son=3 (66 obs)
#   Primary splits:
#       cach  < 27    to the left,  improve=0.5492697, (0 missing)
#       mmax  < 14000 to the left,  improve=0.4942141, (0 missing)
#       chmin < 7.5   to the left,  improve=0.4822048, (0 missing)
#       mmin  < 3550  to the left,  improve=0.4415438, (0 missing)
#       syct  < 49    to the right, improve=0.4267197, (0 missing)
#   Surrogate splits:
#       chmin < 7.5   to the left,  agree=0.856, adj=0.545, (0 split)
#       mmin  < 3550  to the left,  agree=0.842, adj=0.500, (0 split)
#       mmax  < 18485 to the left,  agree=0.823, adj=0.439, (0 split)
#       syct  < 49    to the right, agree=0.809, adj=0.394, (0 split)
#       chmax < 22    to the left,  agree=0.780, adj=0.303, (0 split)
#
# Node number 2: 143 observations, complexity param=0.08933901
#   mean=1.524647, MSE=0.08245348
#   left son=4 (78 obs) right son=5 (65 obs)
#   Primary splits:
#       mmax  < 6100  to the left,  improve=0.3266856, (0 missing)
#       mmin  < 1750  to the left,  improve=0.2445926, (0 missing)
#       chmax < 4.5   to the left,  improve=0.2353382, (0 missing)
#       syct  < 325   to the right, improve=0.2248801, (0 missing)
#       cach  < 0.5   to the left,  improve=0.1877734, (0 missing)
#   Surrogate splits:
#       mmin  < 1250  to the left,  agree=0.720, adj=0.385, (0 split)
#       syct  < 102.5 to the right, agree=0.713, adj=0.369, (0 split)
# -------------------------------------------------
# *****************************************************
#
#
# *****************************************************
#

# Classification tree ----

data(fgl)
dim(fgl)
#[1] 214 10
fgl[1:4, ]
# *****************************************************
#      RI    Na   Mg   Al    Si    K   Ca Ba Fe type
# 1  3.01 13.64 4.49 1.10 71.78 0.06 8.75  0  0 WinF
# 2 -0.39 13.89 3.60 1.36 72.73 0.48 7.83  0  0 WinF
# 3 -1.82 13.53 3.55 1.54 72.99 0.39 7.78  0  0 WinF
# 4 -0.34 13.21 3.69 1.29 72.61 0.57 8.22  0  0 WinF
# *****************************************************
levels(fgl$type)
# *****************************************************
# [1] "WinF" "WinNF" "Veh" "Con" "Tabl" "Head"
# *****************************************************

# set.seed(123)
# Since xerror is computed from randomly chosen cross-validation groups,
# results will differ with different seeds; uncomment the line above to make
# the xerror/xstd columns repeatable from run to run.
fgl.rp <- rpart(type ~ ., data = fgl, cp = 0.001)
plotcp(fgl.rp)
printcp(fgl.rp)
#Classification tree:
#rpart(formula = type ~ ., data = fgl, cp = 0.001)
#
#Variables actually used in tree construction:
#[1] Al Ba Ca Fe Mg Na RI
#
#Root node error: 138/214 = 0.64486
#
#n= 214
#
#        CP nsplit rel error  xerror     xstd
#1 0.206522      0   1.00000 1.00000 0.050729
#2 0.072464      2   0.58696 0.60145 0.051652
#3 0.057971      3   0.51449 0.59420 0.051536
#4 0.036232      4   0.45652 0.53623 0.050419
#5 0.032609      5   0.42029 0.53623 0.050419
#6 0.010870      7   0.35507 0.50725 0.049733
#7 0.001000      9   0.33333 0.50725 0.049733

## Prune at cp = 0.02. This keeps the 7-split tree (row 6 of the table
## above), i.e. the 8 terminal nodes shown in the printout below.
fgl.rp2 <- prune(fgl.rp, cp = 0.02)
plot(fgl.rp2, uniform = TRUE)
text(fgl.rp2, use.n = TRUE, cex = 0.8)
fgl.rp2
#n= 214
#
#node), split, n, loss, yval, (yprob)
#      * denotes terminal node
#
# 1) root 214 138 WinNF (0.33 0.36 0.079 0.061 0.042 0.14)
#   2) Ba< 0.335 185 110 WinNF (0.37 0.41 0.092 0.065 0.049 0.016)
#     4) Al< 1.42 113 50 WinF (0.56 0.27 0.12 0.0088 0.027 0.018)
#       8) Ca< 10.48 101 38 WinF (0.62 0.21 0.13 0 0.02 0.02)
#        16) RI>=-0.93 85 25 WinF (0.71 0.2 0.071 0 0.012 0.012)
#          32) Mg< 3.865 77 18 WinF (0.77 0.14 0.065 0 0.013 0.013) *
#          33) Mg>=3.865 8 2 WinNF (0.12 0.75 0.12 0 0 0) *
#        17) RI< -0.93 16 9 Veh (0.19 0.25 0.44 0 0.062 0.062) *
#       9) Ca>=10.48 12 2 WinNF (0 0.83 0 0.083 0.083 0) *
#     5) Al>=1.42 72 28 WinNF (0.083 0.61 0.056 0.15 0.083 0.014)
#      10) Mg>=2.26 52 11 WinNF (0.12 0.79 0.077 0 0.019 0) *
#      11) Mg< 2.26 20 9 Con (0 0.15 0 0.55 0.25 0.05)
#        22) Na< 13.495 12 1 Con (0 0.083 0 0.92 0 0) *
#        23) Na>=13.495 8 3 Tabl (0 0.25 0 0 0.62 0.12) *
#   3) Ba>=0.335 29 3 Head (0.034 0.034 0 0.034 0 0.9) *