R : Copyright 2004, The R Foundation for Statistical Computing
Version 1.9.0  (2004-04-12), ISBN 3-900051-00-3

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for a HTML browser interface to help.
Type 'q()' to quit R.

[Previously saved workspace restored]

> rm(list=ls())
> exg <- read.table("exg.data") #on CQUEST use "/u/reid/410/exg.data"
> dim(exg)
[1] 32 11
> exg[32,]
       V1    V2 V3 V4  V5 V6 V7 V8 V9 V10 V11
32 270.71 67.83  7 80 886  1  0  0  1  11   1
> is.data.frame(exg)
[1] TRUE
> names(exg) <- c("cost","date","t1", "t2", "S", "PR", "NE", "CT", "BW", "N", "PT")
> exg[1:4,]
    cost  date t1 t2    S PR NE CT BW  N PT
1 460.05 68.58 14 46  687  0  1  0  0 14  0
2 452.99 67.33 10 73 1065  0  0  1  0  1  0
3 443.22 67.33 10 85 1065  1  0  1  0  1  0
4 652.32 68.00 11 67 1065  0  1  1  0 12  0

# fit the full linear model as in Cox and Snell

> exg.lm <- lm(log(cost)~PT + CT + log(N) + log(S) + date +
+ NE + log(t1) + log(t2) + PR + BW, data=exg)
> summary(exg.lm)

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE + 
    log(t1) + log(t2) + PR + BW, data = exg)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.28574 -0.10408  0.02784  0.09512  0.25031 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -14.24198    4.22880  -3.368  0.00291 ** 
PT           -0.22429    0.12246  -1.832  0.08125 .  
CT            0.12040    0.06632   1.815  0.08376 .  
log(N)       -0.08020    0.04596  -1.745  0.09562 .  
log(S)        0.69373    0.13605   5.099 4.75e-05 ***
date          0.20922    0.06526   3.206  0.00425 ** 
NE            0.25807    0.07693   3.355  0.00300 ** 
log(t1)       0.09187    0.24396   0.377  0.71025    
log(t2)       0.28553    0.27289   1.046  0.30731    
PR           -0.09237    0.07730  -1.195  0.24542    
BW            0.03303    0.10112   0.327  0.74715    
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

Residual standard error: 0.1645 on 21 degrees of freedom
Multiple R-Squared: 0.8717,	Adjusted R-squared: 0.8106 
F-statistic: 14.27 on 10 and 21 DF,  p-value: 3.081e-07 

> attributes(exg.lm)
$names
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        

$class
[1] "lm"

# the names are various pieces that can be pulled out separately
# there are other pieces that can be extracted, see p.141 of MASS

> .1645^2
[1] 0.02706025  # residual mean square as on p.86 of Cox and Snell

> exg.lm$df.residual
[1] 21
> ?update  # Now we'll fit a series of reduced models to duplicate Table G.3
> update(exg.lm, .~.-BW)

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE +     log(t1) + log(t2) + PR, data = exg)

Coefficients:
(Intercept)           PT           CT       log(N)       log(S)  
  -14.70537     -0.21530      0.11655     -0.07684      0.68804  
       date           NE      log(t1)      log(t2)           PR  
    0.21545      0.25833      0.06313      0.32211     -0.09820  

> exg.lm9 <- update(.Last.value)
> deviance(exg.lm9) # gives the residual sum of squares
[1] 0.5709137
> deviance(update(exg.lm,.~.-BW))
[1] 0.5709137
> deviance(update(exg.lm,.~.-BW-log(t1)))
[1] 0.5729959
> deviance(update(exg.lm,.~.-BW-log(t1)-log(t2)))
[1] 0.616538
> deviance(update(exg.lm,.~.-BW-log(t1)-log(t2)-pr))
[1] 0.616538
> deviance(update(exg.lm,.~.-BW-log(t1)-log(t2)-PR))
[1] 0.6337415

# We have now duplicated Table G.3 'by hand'
# By changing the order of the variables to correspond to the order
# in Table G.3 we could get this automatically using anova

> anova(exg.lm)
Analysis of Variance Table

Response: log(cost)
          Df  Sum Sq Mean Sq F value    Pr(>F)    
PT         1 2.01272 2.01272 74.4105 2.410e-08 ***
CT         1 0.04550 0.04550  1.6823 0.2086954    
log(N)     1 0.26551 0.26551  9.8159 0.0050252 ** 
log(S)     1 0.75496 0.75496 27.9110 3.081e-05 ***
date       1 0.42961 0.42961 15.8828 0.0006729 ***
NE         1 0.28601 0.28601 10.5738 0.0038159 ** 
log(t1)    1 0.00101 0.00101  0.0372 0.8489384    
log(t2)    1 0.01572 0.01572  0.5812 0.4543297    
PR         1 0.04610 0.04610  1.7044 0.2058380    
BW         1 0.00289 0.00289  0.1067 0.7471545    
Residuals 21 0.56803 0.02705                      
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

## the Sum Sq due to BW is 0.0029 = 0.5709-0.5680; see Table G.3
## the Sum Sq due to PR is a bit different because of the order


## the reduced model in Table G.4 is fit (and saved) using update

> exg.lmred <- update(exg.lm,.~.-BW-PR-log(t1)-log(t2))
> summary(exg.lmred)

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, 
    data = exg)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.32721 -0.07620  0.02920  0.08115  0.28946 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -13.26031    3.13950  -4.224 0.000278 ***
PT           -0.22610    0.11355  -1.991 0.057490 .  
CT            0.14039    0.06042   2.323 0.028582 *  
log(N)       -0.08758    0.04147  -2.112 0.044891 *  
log(S)        0.72341    0.11882   6.088 2.31e-06 ***
date          0.21241    0.04326   4.910 4.70e-05 ***
NE            0.24902    0.07414   3.359 0.002510 ** 
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

Residual standard error: 0.1592 on 25 degrees of freedom
Multiple R-Squared: 0.8569,	Adjusted R-squared: 0.8225 
F-statistic: 24.95 on 6 and 25 DF,  p-value: 2.058e-09 

> # example of an interaction
> exg.lmint <- lm(log(cost)~PT+CT+log(N)+log(S)+date+NE+PT*CT,data=exg)
> summary(exg.lmint)

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE + 
    PT*CT, data = exg)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.33157 -0.06591  0.02536  0.07838  0.28676 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -13.23435    3.19296  -4.145 0.000365 ***
PT           -0.24289    0.12210  -1.989 0.058181 .  
CT            0.13123    0.06515   2.014 0.055310 .  
log(N)       -0.08680    0.04221  -2.056 0.050782 .  
log(S)        0.72291    0.12083   5.983 3.55e-06 ***
date          0.21213    0.04399   4.822 6.53e-05 ***
NE            0.24899    0.07539   3.303 0.002991 ** 
PT:CT         0.07976    0.18867   0.423 0.676234    
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

Residual standard error: 0.1619 on 24 degrees of freedom
Multiple R-Squared: 0.8579,	Adjusted R-squared: 0.8165 
F-statistic: 20.71 on 7 and 24 DF,  p-value: 1.000e-08 

## duplicating plots in Figures G.1-G.4 (nothing unusual)

> plot(exg$date,exg.lmred$residuals)
> plot(exg.lmred$fitted,exg.lmred$residuals)
> qqnorm(exg.lmred$residuals)


## Illustration of command 'model.matrix'

> model.matrix(exg.lmred)
   (Intercept) PT CT    log(N)   log(S)  date NE
1            1  0  0 2.6390573 6.532334 68.58  1
2            1  0  1 0.0000000 6.970730 67.33  0
3            1  0  1 0.0000000 6.970730 67.33  0
4            1  0  1 2.4849066 6.970730 68.00  1
5            1  0  1 2.4849066 6.970730 68.00  1
6            1  0  1 1.0986123 6.242223 67.92  1
7            1  0  0 1.6094379 6.711740 68.17  0
8            1  0  0 0.0000000 6.124683 68.42  0
9            1  0  0 1.6094379 6.711740 68.42  0
10           1  0  1 0.6931472 6.674561 68.33  1
11           1  0  0 1.0986123 6.327937 68.58  0
12           1  0  0 1.7917595 6.672033 68.75  1
13           1  0  1 0.6931472 6.272877 68.42  0
14           1  0  0 1.9459101 6.956545 68.92  0
15           1  0  0 2.7725887 6.745236 68.92  0
16           1  0  0 1.0986123 6.656727 68.42  0
17           1  0  0 2.8332133 6.739337 69.50  1
18           1  0  1 0.6931472 6.272877 68.42  0
19           1  0  0 0.0000000 6.993933 69.17  0
20           1  0  0 2.0794415 6.956545 68.92  0
21           1  0  1 2.7080502 6.816736 68.75  0
22           1  0  0 2.9957323 6.719013 70.92  1
23           1  0  1 2.8903718 6.666957 69.67  0
24           1  0  0 1.0986123 6.710523 70.08  0
25           1  0  1 2.9444390 6.287859 70.42  0
26           1  0  1 3.0445224 7.029973 71.08  0
27           1  1  0 2.0794415 6.613384 67.25  0
28           1  1  1 1.9459101 6.710523 67.17  0
29           1  1  0 2.3978953 6.786717 67.83  0
30           1  1  0 2.3978953 6.786717 67.83  0
31           1  1  0 2.0794415 6.613384 67.25  0
32           1  1  0 2.3978953 6.786717 67.83  0
attr(,"assign")
[1] 0 1 2 3 4 5 6

## there is a built in plot methods for lm that also gives some
## plots to check for influential values

> plot(exg.lmred)
Hit <Return> to see next plot: 
Hit <Return> to see next plot: 
Hit <Return> to see next plot: 
Hit <Return> to see next plot: 

## These indicate that observations 26 and 19 may be influential
## so I'll try refitting without them
## identifying influential observations will be discussed more in 410

> exg[19,]
     cost  date t1 t2    S PR NE CT BW N PT
19 881.24 69.17 15 67 1090  0  0  0  0 1  0
> exg[26,]
     cost  date t1 t2    S PR NE CT BW  N PT
26 697.14 71.08 20 57 1130  0  0  1  0 21  0
> 
> exg.lminf <- lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,data=exg[-c(19,26),])
> summary(exg.lminf)$coef
               Estimate Std. Error   t value     Pr(>|t|)
(Intercept) -13.5100856 3.53716909 -3.819463 8.798663e-04
PT           -0.2320113 0.10446933 -2.220856 3.648949e-02
CT            0.1969191 0.05617147  3.505678 1.901559e-03
log(N)       -0.0442815 0.04217402 -1.049971 3.046393e-01
log(S)        0.6890273 0.11852241  5.813478 6.354631e-06
date          0.2180599 0.04660851  4.678542 1.037163e-04
NE            0.2203132 0.06457740  3.411614 2.390272e-03
> df.residual(exg.lminf)
[1] 23
> df.residual(exg.lmred)
[1] 25

## the coefficients are not much changed

## I'll try just leaving out 26
> 
> summary(lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,data=exg[-26,]))

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, 
    data = exg[-26, ])

Residuals:
      Min        1Q    Median        3Q       Max 
-0.297578 -0.057638  0.009755  0.108084  0.239421 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -17.08769    3.39852  -5.028 3.87e-05 ***
PT           -0.17030    0.10867  -1.567  0.13017    
CT            0.18961    0.06050   3.134  0.00451 ** 
log(N)       -0.09361    0.03870  -2.419  0.02351 *  
log(S)        0.80020    0.11594   6.902 3.88e-07 ***
date          0.26071    0.04583   5.688 7.37e-06 ***
NE            0.23082    0.06949   3.322  0.00286 ** 
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

Residual standard error: 0.1482 on 24 degrees of freedom
Multiple R-Squared: 0.8742,	Adjusted R-squared: 0.8428 
F-statistic: 27.81 on 6 and 24 DF,  p-value: 1.112e-09 

## Here is a use of 'subset' to leave out some observations
## this was used in 450 to fit the model only to the training data

> summary(lm(log(cost)~PT+CT+log(N)+log(S)+date+NE,subset=row.names(exg)!="26",data=exg))

Call:
lm(formula = log(cost) ~ PT + CT + log(N) + log(S) + date + NE, 
    data = exg, subset = row.names(exg) != "26")

Residuals:
      Min        1Q    Median        3Q       Max 
-0.297578 -0.057638  0.009755  0.108084  0.239421 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -17.08769    3.39852  -5.028 3.87e-05 ***
PT           -0.17030    0.10867  -1.567  0.13017    
CT            0.18961    0.06050   3.134  0.00451 ** 
log(N)       -0.09361    0.03870  -2.419  0.02351 *  
log(S)        0.80020    0.11594   6.902 3.88e-07 ***
date          0.26071    0.04583   5.688 7.37e-06 ***
NE            0.23082    0.06949   3.322  0.00286 ** 
---
Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

Residual standard error: 0.1482 on 24 degrees of freedom
Multiple R-Squared: 0.8742,	Adjusted R-squared: 0.8428 
F-statistic: 27.81 on 6 and 24 DF,  p-value: 1.112e-09 


## Prediction is often needed in 450 applications; here is an example
## where I predict the values for the omitted observations; we can
## see that 19 is underestimated and 26 is overestimated but I don't
## think it's very important for this example

> ?predict.lm
> predict(exg.lminf,exg[c(19,26),],se.fit=TRUE)
$fit
      19       26 
6.392128 6.895557 

$se.fit
       19        26 
0.1086234 0.1202985 

$df
[1] 23

$residual.scale
[1] 0.1373332

> exp(6.392)
[1] 597.0495
> exp(6.899)
[1] 991.283