## simulate spurious regression on nonstationary variables ## make two independent stationary variates x <- rnorm(100) y <- rnorm(100) ## model y on x: expect no significance summary(lm(y~x)) ## do it 1000 times nrep <- 1000 pvals <- rep(NA, nrep) for(i in 1:nrep) { x <- rnorm(100) y <- rnorm(100) pvals[i] <- coef(summary(lm(y~x)))[2, 4] } ## histogram of p-values' frequency hist(pvals) ## empirical frequency of "significant" p-values ## (expected: 5%) sum(pvals < 0.05)/length(pvals) ## now on nonstationary variates: ## make y and x from independent Random Walk processes x <- rep(NA, 100) x[1] <- rnorm(1) for(i in 2:100) x[i] <- x[i-1] + rnorm(1) y <- rep(NA, 100) y[1] <- rnorm(1) for(i in 2:100) y[i] <- y[i-1] + rnorm(1) ## model y on x: one would expect insignificance, given ## that they were generated independently of each other! mod <- lm(y~x) summary(mod) ## ...instead you get: significant x, high R2 ## look at the residuals: plot(resid(mod), type="l", col="blue") ## (BG test on the residuals:) library(lmtest) bgtest(mod) ## are the residuals stationary? library(tseries) adf.test(residuals(mod)) ## do it 1000 times nrep <- 1000 pvals <- rep(NA, nrep) r2 <- rep(NA, nrep) tstats <- rep(NA, nrep) for(j in 1:nrep) { x <- rep(NA, 100) x[1] <- rnorm(1) for(i in 2:100) x[i] <- x[i-1] + rnorm(1) y <- rep(NA, 100) y[1] <- rnorm(1) for(i in 2:100) y[i] <- y[i-1] + rnorm(1) modsumm <- summary(lm(y~x)) pvals[j] <- coef(modsumm)[2, 4] r2[j] <- modsumm$r.squared tstats[j] <- coef(modsumm)[2, 3] } ## most R2s should be near zero, given that y and x have ## been generated independently ## replicate R2 slide from Brooks hist(r2, breaks=50) ## idem tstats slide hist(tstats, breaks=50) ## (mark the critical values:) abline(v=-1.96, col="red") abline(v=1.96, col="red") ## perhaps more interestingly, look at p-values for the t-test hist(pvals, breaks=50) ## empirical rejection rate: sum(pvals < 0.05)/length(pvals) ## now a cointegrating regression (variables are nonstationary ## but actually related) u <- rnorm(100) x <- rep(NA, 100) x[1] <- rnorm(1) for(i in 2:100) x[i] <- x[i-1] + rnorm(1) y <- 4*x + u mod <- lm(y~x) summary(mod) plot(resid(mod), type="l", col="blue") ## what about the residual serial correlation? bgtest(mod) ## are the residuals stationary? library(tseries) adf.test(residuals(mod))