unemployment <- read.table ("unemployment.dat", header=TRUE)
ggplot(unemployment, aes(year, unemployed.pct)) + geom_line() + ylab("unemployed (%)")
Let’s try to fit a linear model:
unemployment$year1975 <- unemployment$year-1975
fit <- lm(unemployed.pct~year1975, data=unemployment)
summary(fit)
##
## Call:
## lm(formula = unemployed.pct ~ year1975, data = unemployment)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3325 -0.8997 -0.0546 0.7667 3.9057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.58505 0.18727 29.823 < 2e-16 ***
## year1975 0.02990 0.01118 2.674 0.00981 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.426 on 56 degrees of freedom
## Multiple R-squared: 0.1132, Adjusted R-squared: 0.09739
## F-statistic: 7.15 on 1 and 56 DF, p-value: 0.009806
There is a pretty small upward trend. Let’s plot the residuals.
qplot(unemployment$year, fit$residuals)
It’s pretty clear that the residuals are not independent – large residuals tend to occur together.
Let’s fit the AR(1) model:
y <- unemployment$unemployed.pct
n <- length(y)
y.lag <- c(NA, y[1:(n-1)])
lm.lag <- lm(y~y.lag)
Do the residuals look normal?
qplot(unemployment$year[2:length(y)], lm.lag$residuals)
Kind of, and it seems like we got rid of most of the “parabola” pattern in the previous plot.
One way to see if the model is appropriate is “fake-data simulation”: simulate several datasets from the model using the fitted parameters.
b.hat <- coef (lm.lag) # vector of 2 regression coefs
s.hat <- sigma.hat (lm.lag) # residual sd
simulate_series <- function(){
n_years <- 58
y1 <- 3.9
y.rep <- rep(0, n_years)
y.rep[1] <- y1
for (t in 2:n_years){
prediction <- c (1, y.rep[t-1]) %*% b.hat
y.rep[t] <- rnorm (1, prediction, s.hat)
}
ggdata <- data.frame(year=seq(1947, 2004), unemployment.pct=y.rep)
return(y.rep)
}
n.sims <- 5
y.rep <- replicate(n.sims, simulate_series())
Let’s create a bunch of plots of simulations:
ggdata <- data.frame(year=seq(1947, 2004), unemployed.pct=y.rep[,1])
ggplot(ggdata, aes(year, unemployed.pct)) + geom_line() + ylab("unemployed (%)")
ggdata <- data.frame(year=seq(1947, 2004), unemployed.pct=y.rep[,2])
ggplot(ggdata, aes(year, unemployed.pct)) + geom_line() + ylab("unemployed (%)")
ggdata <- data.frame(year=seq(1947, 2004), unemployed.pct=y.rep[,3])
ggplot(ggdata, aes(year, unemployed.pct)) + geom_line() + ylab("unemployed (%)")
ggdata <- data.frame(year=seq(1947, 2004), unemployed.pct=y.rep[,4])
ggplot(ggdata, aes(year, unemployed.pct)) + geom_line() + ylab("unemployed (%)")