R version 2.8.1 (2008-12-22) Copyright (C) 2008 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. ## earlier in the session (details deleted) I loaded the MASS library ("library(MASS)") and ## installed the leaps package using a menu item ("Package installer"), and then "library(leaps)" > ls() [1] "X" "lm1" "lm2" "pr" "pr.lars" [6] "pr.leaps" "pr.lm" "pr.std" "prtrain" "std" [11] "z1" "z2" "z3" "z4" "z5" [16] "z6" "z7" "z8" ## X is the model matrix for pr.lm: X = model.matrix(pr.lm) > attributes(pr.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "contrasts" "xlevels" "call" "terms" [13] "model" $class [1] "lm" ## note that the list of attributes doesn't specify "model matrix", but ## I found a reference to it in the help file for lm > ?lm > pr.lm$coef (Intercept) lcavol lweight age lbph 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 svi lcp gleason pgg45 0.30520060 -0.28849277 -0.02130504 0.26695576 ## here's another one that is often handy: > vcov(pr.lm) (Intercept) lcavol lweight age (Intercept) 0.0079771654 0.0002577108 2.497635e-04 -1.366576e-03 lcavol 0.0002577108 0.0160349106 -2.093091e-03 -1.389624e-03 lweight 0.0002497635 -0.0020930909 9.144755e-03 -1.972497e-03 age -0.0013665755 -0.0013896243 -1.972497e-03 1.027029e-02 lbph 0.0003216418 -0.0007789790 -3.972523e-03 -1.684915e-03 svi -0.0003078325 -0.0038717186 -1.338898e-03 8.899717e-05 lcp 0.0009861364 -0.0085324704 -6.315722e-05 1.151570e-03 gleason 0.0019156887 -0.0030493983 2.201221e-03 -3.530295e-03 pgg45 -0.0021032732 0.0021050692 -1.945055e-04 -2.114305e-04 lbph svi lcp gleason (Intercept) 3.216418e-04 -3.078325e-04 9.861364e-04 0.0019156887 lcavol -7.789790e-04 -3.871719e-03 -8.532470e-03 -0.0030493983 lweight -3.972523e-03 -1.338898e-03 -6.315722e-05 0.0022012206 age -1.684915e-03 8.899717e-05 1.151570e-03 -0.0035302952 lbph 1.044873e-02 2.318138e-03 1.142597e-03 -0.0006056925 svi 2.318138e-03 1.527703e-02 -6.360236e-03 0.0018745317 lcp 1.142597e-03 -6.360236e-03 2.387932e-02 0.0012561481 gleason -6.056925e-04 1.874532e-03 1.256148e-03 0.0210967576 pgg45 7.034888e-05 -2.409082e-03 -9.250059e-03 -0.0145273621 pgg45 (Intercept) -2.103273e-03 lcavol 2.105069e-03 lweight -1.945055e-04 age -2.114305e-04 lbph 7.034888e-05 svi -2.409082e-03 lcp -9.250059e-03 gleason -1.452736e-02 pgg45 2.359713e-02 > ?step ## according to the documentation, step is a simplified version of stepAIC > step(pr.lm, trace=2) Start: AIC=-37.13 lpsa ~ (lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45 + train) - train Df Sum of Sq RSS AIC - gleason 1 0.011 29.437 -39.103 29.426 -37.128 - age 1 0.989 30.415 -36.914 - pgg45 1 1.532 30.959 -35.727 - lcp 1 1.768 31.195 -35.218 - lbph 1 2.144 31.571 -34.415 - svi 1 3.093 32.520 -32.430 - lweight 1 3.839 33.265 -30.912 - lcavol 1 14.610 44.037 -12.118 Step: AIC=-39.1 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45 Df Sum of Sq RSS AIC 29.437 -39.103 - age 1 1.102 30.540 -38.639 - lcp 1 1.758 31.196 -37.216 - lbph 1 2.135 31.573 -36.411 - pgg45 1 2.376 31.813 -35.903 - svi 1 3.166 32.604 -34.258 - lweight 1 4.005 33.442 -32.557 - lcavol 1 14.887 44.325 -13.681 Call: lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45, data = pr.std, subset = train) Coefficients: (Intercept) lcavol lweight age lbph 2.4669 0.6764 0.2653 -0.1450 0.2095 svi lcp pgg45 0.3071 -0.2872 0.2523 > ?stepAIC > library(MASS) > stepAIC(pr.lm, trace=2) Start: AIC=-37.13 lpsa ~ (lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45 + train) - train Df Sum of Sq RSS AIC - gleason 1 0.011 29.437 -39.103 29.426 -37.128 - age 1 0.989 30.415 -36.914 - pgg45 1 1.532 30.959 -35.727 - lcp 1 1.768 31.195 -35.218 - lbph 1 2.144 31.571 -34.415 - svi 1 3.093 32.520 -32.430 - lweight 1 3.839 33.265 -30.912 - lcavol 1 14.610 44.037 -12.118 Step: AIC=-39.1 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45 Df Sum of Sq RSS AIC 29.437 -39.103 - age 1 1.102 30.540 -38.639 - lcp 1 1.758 31.196 -37.216 - lbph 1 2.135 31.573 -36.411 - pgg45 1 2.376 31.813 -35.903 - svi 1 3.166 32.604 -34.258 - lweight 1 4.005 33.442 -32.557 - lcavol 1 14.887 44.325 -13.681 Call: lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45, data = pr.std, subset = train) Coefficients: (Intercept) lcavol lweight age lbph 2.4669 0.6764 0.2653 -0.1450 0.2095 svi lcp pgg45 0.3071 -0.2872 0.2523 ## there is probably a way to force stepAIC to fit all models, but I haven't found it > ?leaps > library(leaps) # this is after I downloaded the package > ls() [1] "X" "lm1" "lm2" "pr" "pr.lars" [6] "pr.leaps" "pr.lm" "pr.std" "prtrain" "std" [11] "z1" "z2" "z3" "z4" "z5" [16] "z6" "z7" "z8" > dim(prtrain) # I made a training data frame prtrain = pr.std[pr.std$train,] [1] 67 9 > prtrain[1,] lcavol lweight age lbph svi lcp 1 -1.637356 -2.006212 -1.862426 -1.024706 -0.5229409 -0.8631712 gleason pgg45 lpsa 1 -1.042157 -0.8644665 -0.4307829 ## the documentation for leaps says regsubsets is better ## but the info presented seems harder to interpret > pr.regsub = regsubsets(lpsa ~ ., data=prtrain) > pr.regsub Subset selection object Call: regsubsets.formula(lpsa ~ ., data = prtrain) 8 Variables (and intercept) Forced in Forced out lcavol FALSE FALSE lweight FALSE FALSE age FALSE FALSE lbph FALSE FALSE svi FALSE FALSE lcp FALSE FALSE gleason FALSE FALSE pgg45 FALSE FALSE 1 subsets of each size up to 8 Selection Algorithm: exhaustive > summary(pr.regsub) Subset selection object Call: regsubsets.formula(lpsa ~ ., data = prtrain) 8 Variables (and intercept) Forced in Forced out lcavol FALSE FALSE lweight FALSE FALSE age FALSE FALSE lbph FALSE FALSE svi FALSE FALSE lcp FALSE FALSE gleason FALSE FALSE pgg45 FALSE FALSE 1 subsets of each size up to 8 Selection Algorithm: exhaustive lcavol lweight age lbph svi lcp gleason pgg45 1 ( 1 ) "*" " " " " " " " " " " " " " " 2 ( 1 ) "*" "*" " " " " " " " " " " " " 3 ( 1 ) "*" "*" " " " " "*" " " " " " " 4 ( 1 ) "*" "*" " " "*" "*" " " " " " " 5 ( 1 ) "*" "*" " " "*" "*" " " " " "*" 6 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" 7 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" ## It appears that the best subset of size 4, for example, has lcavol, lweight, lbph and svi ## does this agree with HTF? ## I like the output from leaps better, but leaps requires a matrix x and response y as arguments > leaps(X, y=prtrain$lpsa) $which 1 2 3 4 5 6 7 8 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 1 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE 1 FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE 1 FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE 1 FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE 1 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 1 FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE 2 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE 2 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE 2 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE 2 TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE 2 TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE 2 TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE 2 TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE 2 FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE 2 FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE 2 FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE 3 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE 3 TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE 3 TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE 3 TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE 3 TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE 3 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE 3 TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE 3 TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE 3 TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE 3 TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE 4 TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE 4 TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE 4 TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE 4 TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE 4 TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE 4 TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE 4 TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE 4 TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE 4 TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE 4 TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE 5 TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE 5 TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE 5 TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE 5 TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE 5 TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE 5 TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE 5 TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE 5 TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE 5 TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE 5 TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE 6 TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE 6 TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE 6 TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE 6 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE 6 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE 6 TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE 6 TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE 6 TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE 6 TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE 6 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE 7 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE 7 TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE 7 TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE 7 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE 7 TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE 7 TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE 7 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE $label [1] "(Intercept)" "1" "2" "3" [5] "4" "5" "6" "7" [9] "8" $size [1] 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 [33] 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 [65] 8 8 9 $Cp [1] 24.766739 67.919902 81.356324 82.093779 88.676389 [6] 104.520541 113.652518 116.938459 12.108779 17.825642 [11] 22.398961 24.587817 26.560758 26.646478 26.701812 [16] 40.929828 47.492867 50.890946 9.803880 10.841197 [21] 11.896574 11.986624 13.254631 13.567505 14.104723 [26] 16.973886 19.451158 19.637461 7.679020 10.160844 [31] 10.520823 10.554147 11.210485 11.418311 11.486420 [36] 11.622995 12.633296 12.689889 8.209530 8.501411 [41] 8.633280 8.708966 9.327889 10.415197 11.313823 [46] 11.324420 11.690506 11.850092 7.194521 8.487149 [51] 9.230453 9.688924 9.703734 9.903660 10.078118 [56] 10.383958 11.262653 12.214720 7.021515 8.948562 [61] 10.020087 10.485363 11.226501 13.097221 14.566842 [66] 35.797073 9.000000 ## I used this output to plot $Cp$ vs. size: plot(pr.leaps$size, pr.leaps$Cp) ## Now ridge regression: > ?lm.ridge > pr.ridge = lm.ridge(lpsa ~ ., data =prtrain, lambda=seq(0,100,by=10)) > pr.ridge > lm.ridge(lpsa ~ ., data=prtrain, lambda=seq(0,100,by=10)) lcavol lweight age lbph svi 0 2.464933 0.6795281 0.2630531 -0.1414648335 0.2101466 0.3052006 10 2.466706 0.5144359 0.2495231 -0.0863071221 0.1902987 0.2635555 20 2.464658 0.4320618 0.2331187 -0.0538607739 0.1730534 0.2388469 30 2.462559 0.3797003 0.2179352 -0.0327213254 0.1586723 0.2210173 40 2.460798 0.3423099 0.2044014 -0.0180862659 0.1465965 0.2069084 50 2.459368 0.3137004 0.1924063 -0.0075435686 0.1363227 0.1951732 60 2.458208 0.2907866 0.1817493 0.0002652705 0.1274691 0.1851107 70 2.457260 0.2718319 0.1722357 0.0061673481 0.1197518 0.1763045 80 2.456479 0.2557718 0.1636970 0.0106954362 0.1129584 0.1684837 90 2.455828 0.2419110 0.1559923 0.0142077585 0.1069270 0.1614600 100 2.455282 0.2297723 0.1490049 0.0169534697 0.1015319 0.1550965 lcp gleason pgg45 0 -0.28849277 -0.02130504 0.26695576 10 -0.08918274 0.02760564 0.16610251 20 -0.01172818 0.04070146 0.13566399 30 0.02730599 0.04677755 0.12160655 40 0.04951199 0.05032788 0.11339837 50 0.06300241 0.05259221 0.10779161 60 0.07148712 0.05406167 0.10351866 70 0.07688841 0.05498651 0.10000670 80 0.08029311 0.05551720 0.09696915 90 0.08235659 0.05575390 0.09425118 100 0.08349405 0.05576752 0.09176375 > plot(pr.ridge) # this plots the coefficients against lambda ### with a little more work you can reproduce the plot in HTF > svd(model.matrix(pr.lm)) # this will give the eigenvalues d_j $d [1] 15.376287 10.859892 8.646383 8.130266 6.387959 5.821079 [7] 5.127015 4.317813 3.439101 $u [,1] [,2] [,3] [,4] [1,] 0.194557998 -0.1948803764 -0.049052010 0.112423999 [2,] 0.171312035 -0.0837424280 -0.067094962 0.125493141 [3,] 0.090622720 -0.1210955435 -0.294993682 -0.016288654 ...stuff omitted ## and finally you can do lasso using lars, and you can easily standardize ## the inputs with scale > ?scale > ?lars >