R version 2.9.2 (2009-08-24) Copyright (C) 2009 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. [R.app GUI 1.29 (5464) i386-apple-darwin8.11.1] > load("myfile") ## fixed item 32 from data set in ElemStatLearn > ls() [1] "prostate" > prostate[32,] lcavol lweight age lbph svi lcp gleason pgg45 lpsa 32 0.1823216 3.804438 65 1.704748 0 -1.386294 6 0 2.008214 train 32 FALSE > attach(prostate) # > pr.std = data.frame(apply(prostate[,1:8],2,scale),lpsa,train) > detach(prostate) # > ls() [1] "pr.std" "prostate" # > attach(pr.std) > train = subset(pr.std,train==TRUE)[,1:9] > train[1,] lcavol lweight age lbph svi lcp gleason 1 -1.637356 -2.006212 -1.862426 -1.024706 -0.5229409 -0.8631712 -1.042157 pgg45 lpsa 1 -0.8644665 -0.4307829 > test = subset(pr.std,train==FALSE)[,1:9] > detach(pr.std) > lm(lpsa ~ ., data = train ) # agrees with textbook coefficient estimates Call: lm(formula = lpsa ~ ., data = train) Coefficients: (Intercept) lcavol lweight age lbph 2.46493 0.67953 0.26305 -0.14146 0.21015 svi lcp gleason pgg45 0.30520 -0.28849 -0.02131 0.26696 > pr.lm = lm(lpsa ~ ., data = train) > ## pr.lm has lots of components > pr.lm$coef (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 -0.02130504 0.26695576 > coef(pr.lm) (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 
-0.02130504 0.26695576 > plot(pr.lm$residuals, pr.lm$fitted.values) > plot(pr.lm$fitted.values, pr.lm$residuals) > attributes(pr.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "contrasts" "xlevels" "call" "terms" [13] "model" $class [1] "lm" > ## lm objects also have lots of methods (functions), such as "predict" > ## and "vcov" (covariance matrix of betahat) > vcov(pr.lm) (Intercept) lcavol lweight age (Intercept) 0.0079771654 0.0002577108 2.497635e-04 -1.366576e-03 lcavol 0.0002577108 0.0160349106 -2.093091e-03 -1.389624e-03 lweight 0.0002497635 -0.0020930909 9.144755e-03 -1.972497e-03 age -0.0013665755 -0.0013896243 -1.972497e-03 1.027029e-02 lbph 0.0003216418 -0.0007789790 -3.972523e-03 -1.684915e-03 svi -0.0003078325 -0.0038717186 -1.338898e-03 8.899717e-05 lcp 0.0009861364 -0.0085324704 -6.315722e-05 1.151570e-03 gleason 0.0019156887 -0.0030493983 2.201221e-03 -3.530295e-03 pgg45 -0.0021032732 0.0021050692 -1.945055e-04 -2.114305e-04 lbph svi lcp gleason (Intercept) 3.216418e-04 -3.078325e-04 9.861364e-04 0.0019156887 lcavol -7.789790e-04 -3.871719e-03 -8.532470e-03 -0.0030493983 lweight -3.972523e-03 -1.338898e-03 -6.315722e-05 0.0022012206 age -1.684915e-03 8.899717e-05 1.151570e-03 -0.0035302952 lbph 1.044873e-02 2.318138e-03 1.142597e-03 -0.0006056925 svi 2.318138e-03 1.527703e-02 -6.360236e-03 0.0018745317 lcp 1.142597e-03 -6.360236e-03 2.387932e-02 0.0012561481 gleason -6.056925e-04 1.874532e-03 1.256148e-03 0.0210967576 pgg45 7.034888e-05 -2.409082e-03 -9.250059e-03 -0.0145273621 pgg45 (Intercept) -2.103273e-03 lcavol 2.105069e-03 lweight -1.945055e-04 age -2.114305e-04 lbph 7.034888e-05 svi -2.409082e-03 lcp -9.250059e-03 gleason -1.452736e-02 pgg45 2.359713e-02 > diag(.Last.value) (Intercept) lcavol lweight age lbph svi 0.007977165 0.016034911 0.009144755 0.010270292 0.010448731 0.015277026 lcp gleason pgg45 0.023879316 0.021096758 0.023597129 > sqrt(.Last.value) (Intercept) lcavol 
lweight age lbph svi 0.08931498 0.12662903 0.09562821 0.10134245 0.10221904 0.12360027 lcp gleason pgg45 0.15452934 0.14524723 0.15361357 > ## these are the standard errors of the beta-hats; see summary(pr.lm) above > ## Now I will fit a succession of simple models > ## > lm1 = lm (lcavol ~ 1, data=pr.std[train,1:9]) > z1 = lm1$residuals > > ## when fitting a linear model, the intercept ("1") term is assumed to be > ## present, but here where I have no RHS except the intercept I needed to > ## put it in > lm2 = lm(lweight~z1, data=pr.std[train,]) > summary(lm2) Call: lm(formula = lweight ~ z1, data = pr.std[train, ]) Residuals: Min 1Q Median 3Q Max -2.69045 -0.63401 -0.07676 0.59203 2.72646 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.006617 0.130635 -0.051 0.9598 z1 0.316810 0.124845 2.538 0.0136 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.069 on 65 degrees of freedom Multiple R-squared: 0.09014, Adjusted R-squared: 0.07614 F-statistic: 6.44 on 1 and 65 DF, p-value: 0.01357 > z2 = lm2$residuals > z3 = lm(age ~ z1 + z2 , data=pr.std[train,])$residuals > z4 = lm(lbph ~ z1 + z2 + z3 , data=pr.std[train,])$residuals > z5 = lm(svi ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 + z5 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 + z5 + z6 , data=pr.std[train,])$residuals > z8 = lm(pgg45 ~ z1 + z2 + z3 + z4 + z5 + z6 +z7 , data=pr.std[train,])$residuals > summary(lm(prtrain$lpsa ~ z8)) Call: lm(formula = prtrain$lpsa ~ z8) Residuals: Min 1Q Median 3Q Max -2.8735 -0.8759 0.1777 0.9103 2.7308 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.4523 0.1475 16.626 <2e-16 *** z8 0.2670 0.2604 1.025 0.309 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.207 on 65 degrees of freedom Multiple R-squared: 0.01591, Adjusted R-squared: 0.0007745 F-statistic: 1.051 on 1 and 65 DF, p-value: 0.3090 > summary(lm(prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8)) Call: lm(formula = prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8) Residuals: Min 1Q Median 3Q Max -1.64870 -0.34147 -0.05424 0.44941 1.48675 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.45235 0.08702 28.182 < 2e-16 *** z1 0.83993 0.08316 10.100 2.15e-14 *** z2 0.31633 0.08262 3.829 0.000318 *** z3 -0.06905 0.09385 -0.736 0.464870 z4 0.16143 0.09885 1.633 0.107881 z5 0.27726 0.11075 2.503 0.015134 * z6 -0.13160 0.13650 -0.964 0.338984 z7 0.14304 0.11024 1.298 0.199578 z8 0.26696 0.15361 1.738 0.087546 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.7123 on 58 degrees of freedom Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522 F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12 > ## note that the coefficient for z8 is beta-hat8, in either regression > ## but only the 2nd version also gets the standard error correct > ####### Subset Selection ######### > ## see ElemStatLearn package documentation for regsubsets > ## > pr.step = step(pr.lm) Start: AIC=-37.13 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45 Df Sum of Sq RSS AIC - gleason 1 0.011 29.437 -39.103 29.426 -37.128 - age 1 0.989 30.415 -36.914 - pgg45 1 1.532 30.959 -35.727 - lcp 1 1.768 31.195 -35.218 - lbph 1 2.144 31.571 -34.415 - svi 1 3.093 32.520 -32.430 - lweight 1 3.839 33.265 -30.912 - lcavol 1 14.610 44.037 -12.118 Step: AIC=-39.1 lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45 Df Sum of Sq RSS AIC 29.437 -39.103 - age 1 1.102 30.540 -38.639 - lcp 1 1.758 31.196 -37.216 - lbph 1 2.135 31.573 -36.411 - pgg45 1 2.376 31.813 -35.903 - svi 1 3.166 32.604 -34.258 - lweight 1 4.005 33.442 -32.557 - lcavol 1 14.887 
44.325 -13.681 ## this method chooses 7 covariates ## AIC is known to choose models that are too complicated ## to get the Cp plot, using some code from ElemStatLearn > pr.leaps = regsubsets(lpsa ~ . , data=train, nbest=70, #all! + really.big=TRUE) > pr.leaps.sum = summary(pr.leaps) > attributes(pr.leaps.sum) $names [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj" $class [1] "summary.regsubsets" > dim(pr.leaps.sum$cp) NULL > length(pr.leaps.sum$cp) # it's a vector [1] 255 > pr.models = pr.leaps.sum$which > pr.models.size = as.numeric(attr(pr.models,"dimnames")[[1]]) > length(.Last.value) [1] 255 > plot(pr.models.size,pr.leaps.sum$cp) > abline(0,1) # plots Cp=p line for comparison