R version 2.8.1 (2008-12-22) Copyright (C) 2008 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > pr = read.table("http://www.utstat.utoronto.ca/reid/sta414/prostate.data") > is.data.frame(pr) [1] TRUE > names(pr) [1] "lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason" [8] "pgg45" "lpsa" "train" > ## This is taken from lecture notes of Jan 13; in class I read the data from > ## a file on my computer, because there is no internet connection > > > ls() [1] "pr" > ## There is a way to refer to the individual variables, > ## by attaching the data frame > lpsa Error: object "lpsa" not found > attach(pr) > lpsa [1] -0.4307829 -0.1625189 -0.1625189 -0.1625189 0.3715636 0.7654678 [7] 0.7654678 0.8544153 1.0473190 1.0473190 1.2669476 1.2669476 [13] 1.2669476 1.3480731 1.3987169 1.4469190 1.4701758 1.4929041 ... (rest of lpsa omitted) > detach(pr) ## this gets things back to the usual, in case you want to use a different data frame > ## Now I want to standardize the x-variables (because the book does) > ## Here's what I did in class on Jan 15 (Thursday): > std = function(x){((x-mean(x))/sqrt(var(x)))} > > > pr.std = cbind(apply(pr[,1:8],2,std),pr$lpsa,pr$train) > pr.std[1:3,] lcavol lweight age lbph svi lcp gleason 1 -1.637356 -2.0062118 -1.8624260 -1.024706 -0.5229409 -0.8631712 -1.0421573 2 -1.988980 -0.7220088 -0.7878962 -1.024706 -0.5229409 -0.8631712 -1.0421573 3 -1.578819 -2.1887840 1.3611634 -1.024706 -0.5229409 -0.8631712 0.3426271 pgg45 1 -0.8644665 -0.4307829 1 2 -0.8644665 -0.1625189 1 3 -0.1553481 -0.1625189 1 > is.data.frame(pr.std) [1] FALSE > names(pr.std) NULL > ## I needed to use pr$lpsa and pr$train to refer to the individual variables (because of 'detach') > ## the result is not a data frame > ## let's make it one > > pr.std = data.frame(pr.std) > is.data.frame(pr.std) [1] TRUE > names(pr.std) [1] "lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason" [8] "pgg45" "V9" "V10" > ## Note that the names are not quite what I want, and the train variable is > ## numeric, whereas in pr it was logical > > ## Above is what I did on the Jan 13 handout > > attach(pr) > pr.std1 = data.frame(cbind(apply(pr[,1:8],2,std)),lpsa,train) > is.data.frame(pr.std1) [1] TRUE > names(pr.std1) [1] "lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason" [8] "pgg45" "lpsa" "train" > ## there are two differences in this version: one is hiding in the brackets. > ## In the command to create pr.std, I used "cbind" on the 8 columns of pr > ## + two new columns (pr$lpsa and pr$train). Just above I only used cbind > ## on the 8 columns. The variables lpsa and train got "data.frame" 'd > ?data.frame > > ## cbind is short for "bind in columns" > ## I think data.frame binds things in columns anyway > ## Let's try a 3rd version > data.frame(apply(pr[,1:8],2,std), lpsa, train)[1:3,] lcavol lweight age lbph svi lcp gleason 1 -1.637356 -2.0062118 -1.8624260 -1.024706 -0.5229409 -0.8631712 -1.0421573 2 -1.988980 -0.7220088 -0.7878962 -1.024706 -0.5229409 -0.8631712 -1.0421573 3 -1.578819 -2.1887840 1.3611634 -1.024706 -0.5229409 -0.8631712 0.3426271 pgg45 lpsa train 1 -0.8644665 -0.4307829 TRUE 2 -0.8644665 -0.1625189 TRUE 3 -0.1553481 -0.1625189 TRUE > ## This is the best one, we've got everything in a convenient form > ## ASIDE: Useful things with R: c(1,2,3) makes a vector from some numbers > ## cbind(v1,v2,v3) makes a matrix from some column vectors > ## data.frame makes a data.frame from some vectors (and keeps their names) end ASIDE > rm(pr.std1) > rm(pr.std) > ## Let's do it right > pr.std = data.frame(apply(pr[,1:8],2,std),lpsa,train) > ## the command "apply" is very useful for matrices or data frames > ## the notation pr[,1:8] refers to all rows, columns 1 - 8 > ## similarly pr[1:3,] is the first 3 rows (all columns) > ## and pr[-1,] is everything but the first row. > ## finally now some linear modelling > pr.lm = lm(lpsa ~ . - train, subset=train, data=pr.std) > summary(pr.lm) Call: lm(formula = lpsa ~ . - train, data = pr.std, subset = train) Residuals: Min 1Q Median 3Q Max -1.64870 -0.34147 -0.05424 0.44941 1.48675 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.46493 0.08931 27.598 < 2e-16 *** lcavol 0.67953 0.12663 5.366 1.47e-06 *** lweight 0.26305 0.09563 2.751 0.00792 ** age -0.14146 0.10134 -1.396 0.16806 lbph 0.21015 0.10222 2.056 0.04431 * svi 0.30520 0.12360 2.469 0.01651 * lcp -0.28849 0.15453 -1.867 0.06697 . gleason -0.02131 0.14525 -0.147 0.88389 pgg45 0.26696 0.15361 1.738 0.08755 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.7123 on 58 degrees of freedom Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522 F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12 > ## Agrees with values in the text book > ## pr.lm has lots of components > pr.lm$coef (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 -0.02130504 0.26695576 > coef(pr.lm) (Intercept) lcavol lweight age lbph svi 2.46493292 0.67952814 0.26305307 -0.14146483 0.21014656 0.30520060 lcp gleason pgg45 -0.28849277 -0.02130504 0.26695576 > plot(pr.lm$residuals, pr.lm$fitted.values) > plot(pr.lm$fitted.values, pr.lm$residuals) > attributes(pr.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "contrasts" "xlevels" "call" "terms" [13] "model" $class [1] "lm" > ## lm objects also have lots of methods (functions), such as "predict" > ## and "vcov" (covariance matrix of betahat) > vcov(pr.lm) (Intercept) lcavol lweight age (Intercept) 0.0079771654 0.0002577108 2.497635e-04 -1.366576e-03 lcavol 0.0002577108 0.0160349106 -2.093091e-03 -1.389624e-03 lweight 0.0002497635 -0.0020930909 9.144755e-03 -1.972497e-03 age -0.0013665755 -0.0013896243 -1.972497e-03 1.027029e-02 lbph 0.0003216418 -0.0007789790 -3.972523e-03 -1.684915e-03 svi -0.0003078325 -0.0038717186 -1.338898e-03 8.899717e-05 lcp 0.0009861364 -0.0085324704 -6.315722e-05 1.151570e-03 gleason 0.0019156887 -0.0030493983 2.201221e-03 -3.530295e-03 pgg45 -0.0021032732 0.0021050692 -1.945055e-04 -2.114305e-04 lbph svi lcp gleason (Intercept) 3.216418e-04 -3.078325e-04 9.861364e-04 0.0019156887 lcavol -7.789790e-04 -3.871719e-03 -8.532470e-03 -0.0030493983 lweight -3.972523e-03 -1.338898e-03 -6.315722e-05 0.0022012206 age -1.684915e-03 8.899717e-05 1.151570e-03 -0.0035302952 lbph 1.044873e-02 2.318138e-03 1.142597e-03 -0.0006056925 svi 2.318138e-03 1.527703e-02 -6.360236e-03 0.0018745317 lcp 1.142597e-03 -6.360236e-03 2.387932e-02 0.0012561481 gleason -6.056925e-04 1.874532e-03 1.256148e-03 0.0210967576 pgg45 7.034888e-05 -2.409082e-03 -9.250059e-03 -0.0145273621 pgg45 (Intercept) -2.103273e-03 lcavol 2.105069e-03 lweight -1.945055e-04 age -2.114305e-04 lbph 7.034888e-05 svi -2.409082e-03 lcp -9.250059e-03 gleason -1.452736e-02 pgg45 2.359713e-02 > diag(.Last.value) (Intercept) lcavol lweight age lbph svi 0.007977165 0.016034911 0.009144755 0.010270292 0.010448731 0.015277026 lcp gleason pgg45 0.023879316 0.021096758 0.023597129 > sqrt(.Last.value) (Intercept) lcavol lweight age lbph svi 0.08931498 0.12662903 0.09562821 0.10134245 0.10221904 0.12360027 lcp gleason pgg45 0.15452934 0.14524723 0.15361357 > ## there are the standard errors of the beta-hats; see summary(pr.lm) above > ## Now I will fit a succession of simple models > ## > lm1 = lm (lcavol ~ 1, data=pr.std[train,1:9]) > z1 = lm1$residuals > > ## when fitting a linear model, the intercept ("1") term is assumed to be > ## present, but here where I have no RHS except the intercept I needed to > ## put it in > lm2 = lm(lweight~z1, data=pr.std[train,]) > summary(lm2) Call: lm(formula = lweight ~ z1, data = pr.std[train, ]) Residuals: Min 1Q Median 3Q Max -2.69045 -0.63401 -0.07676 0.59203 2.72646 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.006617 0.130635 -0.051 0.9598 z1 0.316810 0.124845 2.538 0.0136 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.069 on 65 degrees of freedom Multiple R-squared: 0.09014, Adjusted R-squared: 0.07614 F-statistic: 6.44 on 1 and 65 DF, p-value: 0.01357 > z2 = lm2$residuals > z3 = lm(age ~ z1 + z2 , data=pr.std[train,])$residuals > z4 = lm(lbph ~ z1 + z2 + z3 , data=pr.std[train,])$residuals > z5 = lm(svi ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 , data=pr.std[train,])$residuals > z6 = lm(lcp ~ z1 + z2 + z3 + z4 + z5 , data=pr.std[train,])$residuals > z7 = lm(gleason ~ z1 + z2 + z3 + z4 + z5 + z6 , data=pr.std[train,])$residuals > z8 = lm(pgg45 ~ z1 + z2 + z3 + z4 + z5 + z6 +z7 , data=pr.std[train,])$residuals > summary(lm(prtrain$lpsa ~ z8)) Call: lm(formula = prtrain$lpsa ~ z8) Residuals: Min 1Q Median 3Q Max -2.8735 -0.8759 0.1777 0.9103 2.7308 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.4523 0.1475 16.626 <2e-16 *** z8 0.2670 0.2604 1.025 0.309 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.207 on 65 degrees of freedom Multiple R-squared: 0.01591, Adjusted R-squared: 0.0007745 F-statistic: 1.051 on 1 and 65 DF, p-value: 0.3090 > summary(lm(prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8)) Call: lm(formula = prtrain$lpsa ~ z1 + z2 + z3 + z4 + z5 + z6 + z7 + z8) Residuals: Min 1Q Median 3Q Max -1.64870 -0.34147 -0.05424 0.44941 1.48675 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.45235 0.08702 28.182 < 2e-16 *** z1 0.83993 0.08316 10.100 2.15e-14 *** z2 0.31633 0.08262 3.829 0.000318 *** z3 -0.06905 0.09385 -0.736 0.464870 z4 0.16143 0.09885 1.633 0.107881 z5 0.27726 0.11075 2.503 0.015134 * z6 -0.13160 0.13650 -0.964 0.338984 z7 0.14304 0.11024 1.298 0.199578 z8 0.26696 0.15361 1.738 0.087546 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.7123 on 58 degrees of freedom Multiple R-squared: 0.6944, Adjusted R-squared: 0.6522 F-statistic: 16.47 on 8 and 58 DF, p-value: 2.042e-12 > ## note that the coefficient for z8 is beta-hat8, in either regression > ## but only the 2nd version also gets the standard error correct