## Fitting a cubic polynomial to the toxoplasmosis data. ## I have set up a data frame called tox, which has the centered ## values for 'rain' (i.e. mean subtracted out) ## and has 2 additional columns: rain-mean(rain) squared and cubed ## (Refer to the book excerpt handed out in class > glm(cbind(r,m-r)~rain+rain2+rain3,data=tox,family=binomial) Call: glm(formula = cbind(r, m - r) ~ rain + rain2 + rain3, family = binomial, data = tox) Coefficients: (Intercept) rain rain2 rain3 9.939e-02 -2.552e-03 -6.064e-06 3.932e-08 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3 > tox.glm <- .Last.value > summary(tox.glm) Call: glm(formula = cbind(r, m - r) ~ rain + rain2 + rain3, family = binomial, data = tox) Deviance Residuals: Min 1Q Median 3Q Max -2.7620 -1.2166 -0.5079 0.3538 2.6204 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 9.939e-02 1.020e-01 0.975 0.329678 rain -2.552e-03 8.828e-04 -2.891 0.003843 ** rain2 -6.064e-06 2.963e-06 -2.046 0.040743 * rain3 3.932e-08 1.174e-08 3.351 0.000806 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 74.212 on 33 degrees of freedom Residual deviance: 62.635 on 30 degrees of freedom AIC: 161.33 Number of Fisher Scoring iterations: 3 ## Here is a 3-degree polynomial fit using the 'poly' function ## ## Note that the coefficient estimates are quite different ## but the residual deviance is the same > glm(cbind(r,m-r)~poly(rain,3),data=tox,family=binomial) Call: glm(formula = cbind(r, m - r) ~ poly(rain, 3), family = binomial, data = tox) Coefficients: (Intercept) poly(rain, 3)1 poly(rain, 3)2 poly(rain, 3)3 0.02427 -0.08606 -0.19269 1.37875 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3 > summary(tox.glm) Call: glm(formula = cbind(r, m - r) ~ poly(rain, 3), family = binomial, data = tox, x = TRUE) Deviance Residuals: Min 1Q Median 3Q Max -2.7620 -1.2166 -0.5079 0.3538 2.6204 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.02427 0.07693 0.315 0.752401 poly(rain, 3)1 -0.08606 0.45870 -0.188 0.851172 poly(rain, 3)2 -0.19269 0.46739 -0.412 0.680141 poly(rain, 3)3 1.37875 0.41150 3.351 0.000806 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 74.212 on 33 degrees of freedom Residual deviance: 62.635 on 30 degrees of freedom AIC: 161.33 Number of Fisher Scoring iterations: 3 ## This call will return the model matrix ## > glm(cbind(r,m-r)~poly(rain,3),data=tox,family=binomial,x=TRUE) Call: glm(formula = cbind(r, m - r) ~ poly(rain, 3), family = binomial, data = tox, x = TRUE) Coefficients: (Intercept) poly(rain, 3)1 poly(rain, 3)2 poly(rain, 3)3 0.02427 -0.08606 -0.19269 1.37875 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3 > .Last.value$x (Intercept) poly(rain, 3)1 poly(rain, 3)2 poly(rain, 3)3 1 1 -0.171220074 0.078435079 0.09599121 2 1 0.027881166 -0.160142637 -0.02109800 3 1 0.091276585 -0.150508795 -0.14642941 4 1 0.064531643 -0.159615129 -0.09514990 5 1 -0.156361772 0.046549352 0.12163071 6 1 -0.106834101 -0.043333319 0.15444420 7 1 -0.156361772 0.046549352 0.12163071 8 1 0.167549199 -0.084121670 -0.24768486 9 1 0.012032311 -0.156090804 0.01034325 ## stuff removed ## Now I'll get the model matrix for the original fit > tox.glm$x # didn't work because I didn't have x=TRUE in the call list() ## Try again > glm(cbind(r,m-r)~rain+rain2+rain3,data=tox,family=binomial,x=TRUE) Call: glm(formula = cbind(r, m - r) ~ rain + rain2 + rain3, family = binomial, data = tox, x = TRUE) Coefficients: (Intercept) rain rain2 rain3 9.939e-02 -2.552e-03 -6.064e-06 3.932e-08 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3 > .Last.value$x (Intercept) rain rain2 rain3 1 1 -172.852941 29878.13927 -5.164524e+06 2 1 28.147059 792.25692 2.229970e+04 3 1 92.147059 8491.08045 7.824281e+05 4 1 65.147059 4244.13927 2.764932e+05 5 1 -157.852941 24917.55104 -3.933309e+06 6 1 -107.852941 11632.25692 -1.254573e+06 7 1 -157.852941 24917.55104 -3.933309e+06 8 1 169.147059 28610.72751 4.839420e+06 ## Stuff removed ## Note it is very different than the one using poly(rain,3) ## This will plotted observed and fitted values > plot(tox$rain,tox$r/tox$m) > points(tox$rain,tox.glm$fitted,col="red",pch="+") NULL ## This built in plot function shows the residuals > plot(tox.glm) Hit to see next plot: Hit to see next plot: Hit to see next plot: Hit to see next plot: ## This is the QR decomposition for the weighted least squares ## algorithm (final version) > tox.glm$qr $qr (Intercept) rain rain2 rain3 1 -13.08785064 -1.178077e+02 -3.796170e+05 -4.777286e+07 2 0.12080121 -2.225869e+03 -2.608070e+05 -1.748240e+08 3 0.08516470 4.808175e-02 4.273570e+05 6.621716e+07 4 0.12070898 4.898573e-02 1.159836e-01 8.520751e+07 5 0.05376810 -4.868149e-02 -1.723678e-02 -2.312736e-02 6 0.08473704 -5.180849e-02 1.962907e-02 -6.179547e-02 7 0.10753619 -9.736298e-02 -3.447356e-02 -4.625472e-02 8 0.16447563 1.673251e-01 8.502933e-02 2.096971e-01 ## many more rows omitted $rank [1] 4 $qraux [1] 1.076174 1.022742 1.077439 1.049667 $pivot [1] 1 2 3 4 $tol [1] 1e-11 attr(,"class") [1] "qr" ## This gives an 'analysis of deviance' table which is like ## an analysis of variance table in regression ## This is for the original (centered only) version > anova(tox.glm) Analysis of Deviance Table Model: binomial, link: logit Response: cbind(r, m - r) Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 33 74.212 rain 1 0.124 32 74.087 rain2 1 1.460e-05 31 74.087 rain3 1 11.453 30 62.635 ## The update function allows you to remove variables one at a time > update(tox.glm,.~.-rain3) Call: glm(formula = cbind(r, m - r) ~ rain + rain2, family = binomial, data = tox) Coefficients: (Intercept) rain rain2 4.472e-02 -1.551e-04 -8.778e-09 Degrees of Freedom: 33 Total (i.e. Null); 31 Residual Null Deviance: 74.21 Residual Deviance: 74.09 AIC: 170.8 > update(.Last.value,~.~-rain2) Error in model.frame.default(formula = ~cbind(r, m - r) ~ 1, data = tox, : object is not a matrix > update(tox.glm,.~.-rain3-rain2) Call: glm(formula = cbind(r, m - r) ~ rain, family = binomial, data = tox) Coefficients: (Intercept) rain 0.0444753 -0.0001562 Degrees of Freedom: 33 Total (i.e. Null); 32 Residual Null Deviance: 74.21 Residual Deviance: 74.09 AIC: 168.8 > update(tox.glm,.~.-rain3-rain2-rain) Call: glm(formula = cbind(r, m - r) ~ 1, family = binomial, data = tox) Coefficients: (Intercept) 0.04305 Degrees of Freedom: 33 Total (i.e. Null); 33 Residual Null Deviance: 74.21 Residual Deviance: 74.21 AIC: 166.9 ## There is a prediction method for glms as well: > predict.glm(tox.glm) 1 2 3 4 5 6 0.15623823 0.02363833 -0.15647468 -0.08171660 0.19645405 0.25475284 7 8 9 10 11 12 0.19645405 -0.31543720 0.06757119 0.25475284 -0.27292793 0.24275625 13 14 15 16 17 18 -0.31991333 -0.18316918 -0.15647468 0.23293314 0.06757119 0.23293314 19 20 21 22 23 24 0.02373403 -0.60636808 0.20938545 -0.31991333 0.09142942 0.25393642 25 26 27 28 29 30 0.14279518 0.18323370 -0.29562994 -0.33585600 0.07291591 0.23894396 31 32 33 34 0.24435696 0.11903989 -0.09022474 0.45340621 ## This version uses the **uncentered** rain values ## (I deleted some code that changed the data frame tox) ## For the version with poly it makes no difference > tox.glm <- glm(cbind(r,m-r)~poly(rain,3),data=tox,family=binomial) > summary(tox.glm) Call: glm(formula = cbind(r, m - r) ~ poly(rain, 3), family = binomial, data = tox) Deviance Residuals: Min 1Q Median 3Q Max -2.7620 -1.2166 -0.5079 0.3538 2.6204 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.02427 0.07693 0.315 0.752401 poly(rain, 3)1 -0.08606 0.45870 -0.188 0.851172 poly(rain, 3)2 -0.19269 0.46739 -0.412 0.680141 poly(rain, 3)3 1.37875 0.41150 3.351 0.000806 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 74.212 on 33 degrees of freedom Residual deviance: 62.635 on 30 degrees of freedom AIC: 161.33 Number of Fisher Scoring iterations: 3 ## Here's another way to do it by hand ## using I(). Rain here is NOT centered > tox.glm <- glm(cbind(r,m-r)~ rain+I(rain^2)+I(rain^3),data=tox,family=binomial,x=TRUE) > summary(tox.glm) Call: glm(formula = cbind(r, m - r) ~ rain + I(rain^2) + I(rain^3), family = binomial, data = tox, x = TRUE) Deviance Residuals: Min 1Q Median 3Q Max -2.7620 -1.2166 -0.5079 0.3538 2.6204 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -2.902e+02 8.722e+01 -3.327 0.000878 *** rain 4.500e-01 1.347e-01 3.341 0.000836 *** I(rain^2) -2.311e-04 6.903e-05 -3.348 0.000813 *** I(rain^3) 3.932e-08 1.174e-08 3.351 0.000806 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 74.212 on 33 degrees of freedom Residual deviance: 62.635 on 30 degrees of freedom AIC: 161.33 Number of Fisher Scoring iterations: 3 > library(MASS) ## This is similar as well to the same function for lm objects ## Note that the behaviour for poly(rain,3) is much different than ## for the polynomial specified by hand > stepAIC(tox.glm) Start: AIC= 161.33 cbind(r, m - r) ~ poly(rain, 3) Df Deviance AIC 62.635 161.327 - poly(rain, 3) 3 74.212 166.904 Call: glm(formula = cbind(r, m - r) ~ poly(rain, 3), family = binomial, data = tox, x = TRUE) Coefficients: (Intercept) poly(rain, 3)1 poly(rain, 3)2 poly(rain, 3)3 0.02427 -0.08606 -0.19269 1.37875 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3 ## In the above only removing the whole polynomial was contemplated ## IN the below the possibility of removing a single term is contemplated > stepAIC(tox.glm2) Start: AIC= 161.33 cbind(r, m - r) ~ I(rain - mean(rain)) + I((rain - mean(rain))^2) + I((rain - mean(rain))^3) Df Deviance AIC 62.635 161.327 - I((rain - mean(rain))^2) 1 66.910 163.602 - I(rain - mean(rain)) 1 71.075 167.767 - I((rain - mean(rain))^3) 1 74.087 170.780 Call: glm(formula = cbind(r, m - r) ~ I(rain - mean(rain)) + I((rain - mean(rain))^2) + I((rain - mean(rain))^3), family = binomial, data = tox, x = TRUE) Coefficients: (Intercept) I(rain - mean(rain)) 9.939e-02 -2.552e-03 I((rain - mean(rain))^2) I((rain - mean(rain))^3) -6.064e-06 3.932e-08 Degrees of Freedom: 33 Total (i.e. Null); 30 Residual Null Deviance: 74.21 Residual Deviance: 62.63 AIC: 161.3