R : Copyright 2004, The R Foundation for Statistical Computing Version 1.9.0 (2004-04-12), ISBN 3-900051-00-3 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for a HTML browser interface to help. Type 'q()' to quit R. [Previously saved workspace restored] > rm(list=ls()) > exg <- read.table("exg.data") #on CQUEST use "/u/reid/410/exg.data" > dim(exg) [1] 32 11 > exg[32,] V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 32 270.71 67.83 7 80 886 1 0 0 1 11 1 > is.data.frame(exg) [1] TRUE > names(exg) <- c("cost","date","t1", "t2", "S", "PR", "NE", "CT", "BW", "N", "PT") > exg[1:4,] cost date t1 t2 S PR NE CT BW N PT 1 460.05 68.58 14 46 687 0 1 0 0 14 0 2 452.99 67.33 10 73 1065 0 0 1 0 1 0 3 443.22 67.33 10 85 1065 1 0 1 0 1 0 4 652.32 68.00 11 67 1065 0 1 1 0 12 0 # fit the full linear model as in Cox and Snell > exg.lm <- lm(log(cost)~PT + CT + log(N) + log(S) + date + + NE + log(t1) + log(t2) + PR + BW, data=exg) > summary(exg.lm) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE + log(t1) + log(t2) + PR + BW, data = exg) Residuals: Min 1Q Median 3Q Max -0.28574 -0.10408 0.02784 0.09512 0.25031 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -14.24198 4.22880 -3.368 0.00291 ** PT -0.22429 0.12246 -1.832 0.08125 . CT 0.12040 0.06632 1.815 0.08376 . log(N) -0.08020 0.04596 -1.745 0.09562 . log(S) 0.69373 0.13605 5.099 4.75e-05 *** date 0.20922 0.06526 3.206 0.00425 ** NE 0.25807 0.07693 3.355 0.00300 ** log(t1) 0.09187 0.24396 0.377 0.71025 log(t2) 0.28553 0.27289 1.046 0.30731 PR -0.09237 0.07730 -1.195 0.24542 BW 0.03303 0.10112 0.327 0.74715 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 0.1645 on 21 degrees of freedom Multiple R-Squared: 0.8717, Adjusted R-squared: 0.8106 F-statistic: 14.27 on 10 and 21 DF, p-value: 3.081e-07 > attributes(exg.lm) $names [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" $class [1] "lm" # the names are various pieces that can be pulled out separately # there are other pieces that can be extracted, see p.141 of MASS > .1645^2 [1] 0.02706025 # residual mean square as on p.86 of Cox and Snell > exg.lm$df.residual [1] 21 > ?update # Now we'll fit a series of reduced models to duplicate Table G.3 > update(exg.lm, .~.-BW) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE + log(t1) + log(t2) + PR, data = exg) Coefficients: (Intercept) PT CT log(N) log(S) -14.70537 -0.21530 0.11655 -0.07684 0.68804 date NE log(t1) log(t2) PR 0.21545 0.25833 0.06313 0.32211 -0.09820 > exg.lm9 <- update(.Last.value) > deviance(exg.lm9) # gives the residual sum of squares [1] 0.5709137 > deviance(update(exg.lm,.~.-BW)) [1] 0.5709137 > deviance(update(exg.lm,.~.-BW-log(t1))) [1] 0.5729959 > deviance(update(exg.lm,.~.-BW-log(t1)-log(t2))) [1] 0.616538 > deviance(update(exg.lm,.~.-BW-log(t1)-log(t2)-pr)) [1] 0.616538 > deviance(update(exg.lm,.~.-BW-log(t1)-log(t2)-PR)) [1] 0.6337415 # We have now duplicated Table G.3 'by hand' # By changing the order of the variables to correspond to the order # in Table G.3 we could get this automatically using anova > anova(exg.lm) Analysis of Variance Table Response: log(cost) Df Sum Sq Mean Sq F value Pr(>F) PT 1 2.01272 2.01272 74.4105 2.410e-08 *** CT 1 0.04550 0.04550 1.6823 0.2086954 log(N) 1 0.26551 0.26551 9.8159 0.0050252 ** log(S) 1 0.75496 0.75496 27.9110 3.081e-05 *** date 1 0.42961 0.42961 15.8828 0.0006729 *** NE 1 0.28601 0.28601 10.5738 0.0038159 ** log(t1) 1 0.00101 0.00101 0.0372 0.8489384 log(t2) 1 0.01572 0.01572 0.5812 0.4543297 PR 1 0.04610 0.04610 1.7044 0.2058380 BW 1 0.00289 0.00289 0.1067 0.7471545 Residuals 21 0.56803 0.02705 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 ## the Sum Sq due to BW is 0.0029 = 0.5709-0.5680; see Table G.3 ## the Sum Sq due to PR is a bit different because of the order ## the reduced model in Table G.4 is fit (and saved) using update > exg.lmred <- update(exg.lm,.~.-BW-PR-log(t1)-log(t2)) > summary(exg.lmred) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, data = exg) Residuals: Min 1Q Median 3Q Max -0.32721 -0.07620 0.02920 0.08115 0.28946 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -13.26031 3.13950 -4.224 0.000278 *** PT -0.22610 0.11355 -1.991 0.057490 . CT 0.14039 0.06042 2.323 0.028582 * log(N) -0.08758 0.04147 -2.112 0.044891 * log(S) 0.72341 0.11882 6.088 2.31e-06 *** date 0.21241 0.04326 4.910 4.70e-05 *** NE 0.24902 0.07414 3.359 0.002510 ** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 0.1592 on 25 degrees of freedom Multiple R-Squared: 0.8569, Adjusted R-squared: 0.8225 F-statistic: 24.95 on 6 and 25 DF, p-value: 2.058e-09 > # example of an interaction > exg.lmint <- lm(log(cost)~PT+CT+log(N)+log(S)+date+NE+PT*CT,data=exg) > summary(exg.lmint) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE + PT*CT, data = exg) Residuals: Min 1Q Median 3Q Max -0.33157 -0.06591 0.02536 0.07838 0.28676 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -13.23435 3.19296 -4.145 0.000365 *** PT -0.24289 0.12210 -1.989 0.058181 . CT 0.13123 0.06515 2.014 0.055310 . log(N) -0.08680 0.04221 -2.056 0.050782 . log(S) 0.72291 0.12083 5.983 3.55e-06 *** date 0.21213 0.04399 4.822 6.53e-05 *** NE 0.24899 0.07539 3.303 0.002991 ** PT:CT 0.07976 0.18867 0.423 0.676234 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 0.1619 on 24 degrees of freedom Multiple R-Squared: 0.8579, Adjusted R-squared: 0.8165 F-statistic: 20.71 on 7 and 24 DF, p-value: 1.000e-08 ## duplicating plots in Figures G.1-G.4 (nothing unusual) > plot(exg$date,exg.lmred$residuals) > plot(exg.lmred$fitted,exg.lmred$residuals) > qqnorm(exg.lmred$residuals) ## Illustration of command 'model.matrix' > model.matrix(exg.lmred) (Intercept) PT CT log(N) log(S) date NE 1 1 0 0 2.6390573 6.532334 68.58 1 2 1 0 1 0.0000000 6.970730 67.33 0 3 1 0 1 0.0000000 6.970730 67.33 0 4 1 0 1 2.4849066 6.970730 68.00 1 5 1 0 1 2.4849066 6.970730 68.00 1 6 1 0 1 1.0986123 6.242223 67.92 1 7 1 0 0 1.6094379 6.711740 68.17 0 8 1 0 0 0.0000000 6.124683 68.42 0 9 1 0 0 1.6094379 6.711740 68.42 0 10 1 0 1 0.6931472 6.674561 68.33 1 11 1 0 0 1.0986123 6.327937 68.58 0 12 1 0 0 1.7917595 6.672033 68.75 1 13 1 0 1 0.6931472 6.272877 68.42 0 14 1 0 0 1.9459101 6.956545 68.92 0 15 1 0 0 2.7725887 6.745236 68.92 0 16 1 0 0 1.0986123 6.656727 68.42 0 17 1 0 0 2.8332133 6.739337 69.50 1 18 1 0 1 0.6931472 6.272877 68.42 0 19 1 0 0 0.0000000 6.993933 69.17 0 20 1 0 0 2.0794415 6.956545 68.92 0 21 1 0 1 2.7080502 6.816736 68.75 0 22 1 0 0 2.9957323 6.719013 70.92 1 23 1 0 1 2.8903718 6.666957 69.67 0 24 1 0 0 1.0986123 6.710523 70.08 0 25 1 0 1 2.9444390 6.287859 70.42 0 26 1 0 1 3.0445224 7.029973 71.08 0 27 1 1 0 2.0794415 6.613384 67.25 0 28 1 1 1 1.9459101 6.710523 67.17 0 29 1 1 0 2.3978953 6.786717 67.83 0 30 1 1 0 2.3978953 6.786717 67.83 0 31 1 1 0 2.0794415 6.613384 67.25 0 32 1 1 0 2.3978953 6.786717 67.83 0 attr(,"assign") [1] 0 1 2 3 4 5 6 ## there is a built in plot methods for lm that also gives some ## plots to check for influential values > plot(exg.lmred) Hit to see next plot: Hit to see next plot: Hit to see next plot: Hit to see next plot: ## These indicate that observations 26 and 19 may be influential ## so I'll try refitting without them ## identifying influential observations will be discussed more in 410 > exg[19,] cost date t1 t2 S PR NE CT BW N PT 19 881.24 69.17 15 67 1090 0 0 0 0 1 0 > exg[26,] cost date t1 t2 S PR NE CT BW N PT 26 697.14 71.08 20 57 1130 0 0 1 0 21 0 > > exg.lminf <- lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,data=exg[-c(19,26),]) > summary(exg.lminf)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) -13.5100856 3.53716909 -3.819463 8.798663e-04 PT -0.2320113 0.10446933 -2.220856 3.648949e-02 CT 0.1969191 0.05617147 3.505678 1.901559e-03 log(N) -0.0442815 0.04217402 -1.049971 3.046393e-01 log(S) 0.6890273 0.11852241 5.813478 6.354631e-06 date 0.2180599 0.04660851 4.678542 1.037163e-04 NE 0.2203132 0.06457740 3.411614 2.390272e-03 > df.residual(exg.lminf) [1] 23 > df.residual(exg.lmred) [1] 25 ## the coefficients are not much changed ## I'll try just leaving out 26 > > summary(lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,data=exg[-26,])) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, data = exg[-26, ]) Residuals: Min 1Q Median 3Q Max -0.297578 -0.057638 0.009755 0.108084 0.239421 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -17.08769 3.39852 -5.028 3.87e-05 *** PT -0.17030 0.10867 -1.567 0.13017 CT 0.18961 0.06050 3.134 0.00451 ** log(N) -0.09361 0.03870 -2.419 0.02351 * log(S) 0.80020 0.11594 6.902 3.88e-07 *** date 0.26071 0.04583 5.688 7.37e-06 *** NE 0.23082 0.06949 3.322 0.00286 ** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 0.1482 on 24 degrees of freedom Multiple R-Squared: 0.8742, Adjusted R-squared: 0.8428 F-statistic: 27.81 on 6 and 24 DF, p-value: 1.112e-09 ## Here is a use of 'subset' to leave out some observations ## this was used in 450 to fit the model only to the training data > summary(lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,subset=row.names(exg)!="26",data=exg)) Call: lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, data = exg, subset = row.names(exg) != "26") Residuals: Min 1Q Median 3Q Max -0.297578 -0.057638 0.009755 0.108084 0.239421 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -17.08769 3.39852 -5.028 3.87e-05 *** PT -0.17030 0.10867 -1.567 0.13017 CT 0.18961 0.06050 3.134 0.00451 ** log(N) -0.09361 0.03870 -2.419 0.02351 * log(S) 0.80020 0.11594 6.902 3.88e-07 *** date 0.26071 0.04583 5.688 7.37e-06 *** NE 0.23082 0.06949 3.322 0.00286 ** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 0.1482 on 24 degrees of freedom Multiple R-Squared: 0.8742, Adjusted R-squared: 0.8428 F-statistic: 27.81 on 6 and 24 DF, p-value: 1.112e-09 ## Prediction is often needed in 450 applications; here is an example ## where I predict the values for the omitted observations; we can ## see that 19 is underestimated and 26 is overestimated but I don't ## think it's very important for this example > ?predict.lm > predict(exg.lminf,exg[c(19,26),],se.fit=TRUE) $fit 19 26 6.392128 6.895557 $se.fit 19 26 0.1086234 0.1202985 $df [1] 23 $residual.scale [1] 0.1373332 > exp(6.392) [1] 597.0495 > exp(6.899) [1] 991.283