R version 2.9.2 (2009-08-24) Copyright (C) 2009 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. [R.app GUI 1.29 (5464) i386-apple-darwin8.11.1] > load("myfile") ## fixed item 32 from data set in ElemStatLearn > ls() [1] "prostate" > prostate[32,] lcavol lweight age lbph svi lcp gleason pgg45 lpsa 32 0.1823216 3.804438 65 1.704748 0 -1.386294 6 0 2.008214 train 32 FALSE > attach(prostate) # > pr.std = data.frame(apply(prostate[,1:8],2,scale),lpsa,train) > detach(prostate) # > ls() [1] "pr.std" "prostate" # > attach(pr.std) > train = subset(pr.std,train==TRUE)[,1:9] > train[1,] lcavol lweight age lbph svi lcp gleason 1 -1.637356 -2.006212 -1.862426 -1.024706 -0.5229409 -0.8631712 -1.042157 pgg45 lpsa 1 -0.8644665 -0.4307829 > test = subset(pr.std,train==FALSE)[,1:9] > detach(pr.std) > lm(lpsa ~ ., data = train ) # agrees with textbook coefficient estimates Call: lm(formula = lpsa ~ ., data = train) Coefficients: (Intercept) lcavol lweight age lbph 2.46493 0.67953 0.26305 -0.14146 0.21015 svi lcp gleason pgg45 0.30520 -0.28849 -0.02131 0.26696 > pr.lm = lm(lpsa ~ ., data = train) > ## pr.lm has lots of components > pr.lm$coef (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 -0.02130504 0.26695576 > coef(pr.lm) (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 
-0.02130504 0.26695576 > plot(pr.lm$residuals, pr.lm$fitted.values) > plot(pr.lm$fitted.values, pr.lm$residuals) > attributes(pr.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "contrasts" "xlevels" "call" "terms" [13] "model" $class [1] "lm" > ## lm objects also have lots of methods (functions), such as "predict" > ## and "vcov" (covariance matrix of betahat) > vcov(pr.lm) (Intercept) lcavol lweight age (Intercept) 0.0079771654 0.0002577108 2.497635e-04 -1.366576e-03 lcavol 0.0002577108 0.0160349106 -2.093091e-03 -1.389624e-03 lweight 0.0002497635 -0.0020930909 9.144755e-03 -1.972497e-03 age -0.0013665755 -0.0013896243 -1.972497e-03 1.027029e-02 lbph 0.0003216418 -0.0007789790 -3.972523e-03 -1.684915e-03 svi -0.0003078325 -0.0038717186 -1.338898e-03 8.899717e-05 lcp 0.0009861364 -0.0085324704 -6.315722e-05 1.151570e-03 gleason 0.0019156887 -0.0030493983 2.201221e-03 -3.530295e-03 pgg45 -0.0021032732 0.0021050692 -1.945055e-04 -2.114305e-04 lbph svi lcp gleason (Intercept) 3.216418e-04 -3.078325e-04 9.861364e-04 0.0019156887 lcavol -7.789790e-04 -3.871719e-03 -8.532470e-03 -0.0030493983 lweight -3.972523e-03 -1.338898e-03 -6.315722e-05 0.0022012206 age -1.684915e-03 8.899717e-05 1.151570e-03 -0.0035302952 lbph 1.044873e-02 2.318138e-03 1.142597e-03 -0.0006056925 svi 2.318138e-03 1.527703e-02 -6.360236e-03 0.0018745317 lcp 1.142597e-03 -6.360236e-03 2.387932e-02 0.0012561481 gleason -6.056925e-04 1.874532e-03 1.256148e-03 0.0210967576 pgg45 7.034888e-05 -2.409082e-03 -9.250059e-03 -0.0145273621 pgg45 (Intercept) -2.103273e-03 lcavol 2.105069e-03 lweight -1.945055e-04 age -2.114305e-04 lbph 7.034888e-05 svi -2.409082e-03 lcp -9.250059e-03 gleason -1.452736e-02 pgg45 2.359713e-02 > diag(.Last.value) (Intercept) lcavol lweight age lbph svi 0.007977165 0.016034911 0.009144755 0.010270292 0.010448731 0.015277026 lcp gleason pgg45 0.023879316 0.021096758 0.023597129 > sqrt(.Last.value) (Intercept) lcavol 
lweight age lbph svi 0.08931498 0.12662903 0.09562821 0.10134245 0.10221904 0.12360027 lcp gleason pgg45 0.15452934 0.14524723 0.15361357 > ## these are the standard errors of the beta-hats; see summary(pr.lm) above > ## Now I will fit a succession of simple models > ## > lm1 = lm (lcavol ~ 1, data=pr.std[train,1:9]) > z1 = lm1$residuals > > ## when fitting a linear model, the intercept ("1") term is assumed to be > ## present, but here where I have no RHS except the intercept I needed to > ## put it in > lm2 = lm(lweight~z1, data=pr.std[train,]) > summary(lm2) Call: lm(formula = lweight ~ z1, data = pr.std[train, ]) Residuals: Min 1Q Median 3Q Max -2.69045 -0.63401 -0.07676 0.59203 2.72646 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.006617 0.130635 -0.051 0.9598 z1 0.316810 0.124845 2.538 0.0136 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.069 on 65 degrees of freedom Multiple R-squared: 0.09014, Adjusted R-squared: 0.07614 F-statistic: 6.44 on 1 and 65 DF, p-value: 0.01357 > z2 = lm2$residuals > z3 = lm(age ~ z1 + z2 , data=pr.std[train,])$residuals > z4 = lm(lbph ~ z1 + z2 + z3 , data=pr.std[train,])$residuals > z5 = lm(svi ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 + z5 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 + z5 + z6 , data=pr.std[train,])$residuals > z8 = lm(pgg45 ~ z1 + z2 + z3 + z4 + z5 + z6 +z7 , data=pr.std[train,])$residuals > summary(lm(prtrain$lpsa ~ z8)) Call: lm(formula = prtrain$lpsa ~ z8) Residuals: Min 1Q Median 3Q Max -2.8735 -0.8759 0.1777 0.9103 2.7308 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.4523 0.1475 16.626 <2e-16 *** z8 0.2670 0.2604 1.025 0.309 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.207 on 65 degrees of freedom Multiple R-squared: 0.01591, Adjusted R-squared: 0.0007745 F-statistic: 1.051 on 1 and 65 DF, p-value: 0.3090 > summary(lm(prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8)) Call: lm(formula = prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8) Residuals: Min 1Q Median 3Q Max -1.64870 -0.34147 -0.05424 0.44941 1.48675 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.45235 0.08702 28.182 < 2e-16 *** z1 0.83993 0.08316 10.100 2.15e-14 *** z2 0.31633 0.08262 3.829 0.000318 *** z3 -0.06905 0.09385 -0.736 0.464870 z4 0.16143 0.09885 1.633 0.107881 z5 0.27726 0.11075 2.503 0.015134 * z6 -0.13160 0.13650 -0.964 0.338984 z7 0.14304 0.11024 1.298 0.199578 z8 0.26696 0.15361 1.738 0.087546 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.7123 on 58 degrees of freedom Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522 F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12 > ## note that the coefficient for z8 is beta-hat8, in either regression > ## but only the 2nd version also gets the standard error correct > ####### Subset Selection ######### > ## see ElemStatLearn package documentation for regsubsets > ## > pr.step = step(pr.lm) Start: AIC=-37.13 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45 Df Sum of Sq RSS AIC - gleason 1 0.011 29.437 -39.103 29.426 -37.128 - age 1 0.989 30.415 -36.914 - pgg45 1 1.532 30.959 -35.727 - lcp 1 1.768 31.195 -35.218 - lbph 1 2.144 31.571 -34.415 - svi 1 3.093 32.520 -32.430 - lweight 1 3.839 33.265 -30.912 - lcavol 1 14.610 44.037 -12.118 Step: AIC=-39.1 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45 Df Sum of Sq RSS AIC 29.437 -39.103 - age 1 1.102 30.540 -38.639 - lcp 1 1.758 31.196 -37.216 - lbph 1 2.135 31.573 -36.411 - pgg45 1 2.376 31.813 -35.903 - svi 1 3.166 32.604 -34.258 - lweight 1 4.005 33.442 -32.557 - lcavol 1 14.887 
44.325 -13.681 ## this method chooses 7 covariates ## AIC is known to choose models that are too complicated ## to get the Cp plot, using some code from ElemStatLearn > pr.leaps = regsubsets(lpsa ~ . , data=train, nbest=70, #all! + really.big=TRUE) > pr.leaps.sum = summary(pr.leaps) > attributes(pr.leaps.sum) $names [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj" $class [1] "summary.regsubsets" > dim(pr.leaps.sum$cp) NULL > length(pr.leaps.sum$cp) # it's a vector [1] 255 > pr.models = pr.leaps.sum$which > pr.models.size = as.numeric(attr(pr.models,"dimnames")[[1]]) > length(.Last.value) [1] 255 > plot(pr.models.size,pr.leaps.sum$cp) > abline(0,1) # plots Cp=p line for comparison