## simulate spurious regression on nonstationary variables

## make two independent stationary variates
x <- rnorm(100)
y <- rnorm(100)

## model y on x: expect no significance
summary(lm(y~x))

## do it 1000 times
nrep <- 1000
pvals <- rep(NA, nrep)
for(i in 1:nrep) {
	x <- rnorm(100)
	y <- rnorm(100)
	pvals[i] <- coef(summary(lm(y~x)))[2, 4]
}

## histogram of p-values' frequency
hist(pvals)

## empirical frequency of "significant" p-values
## (expected: 5%)
sum(pvals < 0.05)/length(pvals)

## now on nonstationary variates:
## make y and x from independent Random Walk processes
x <- rep(NA, 100)
x[1] <- rnorm(1)
for(i in 2:100) x[i] <- x[i-1] + rnorm(1)

y <- rep(NA, 100)
y[1] <- rnorm(1)
for(i in 2:100) y[i] <- y[i-1] + rnorm(1)

## model y on x: one would expect insignificance, given
## that they were generated independently of each other!
mod <- lm(y~x)
summary(mod)
## ...instead you get: significant x, high R2

## look at the residuals:
plot(resid(mod), type="l", col="blue")

## (BG test on the residuals:)
library(lmtest)
bgtest(mod)

## are the residuals stationary?
library(tseries)
adf.test(residuals(mod))

## do it 1000 times
nrep <- 1000
pvals <- rep(NA, nrep)
r2 <- rep(NA, nrep)
tstats <- rep(NA, nrep)
for(j in 1:nrep) {
	x <- rep(NA, 100)
	x[1] <- rnorm(1)
	for(i in 2:100) x[i] <- x[i-1] + rnorm(1)
	y <- rep(NA, 100)
	y[1] <- rnorm(1)
	for(i in 2:100) y[i] <- y[i-1] + rnorm(1)
	modsumm <- summary(lm(y~x))
      pvals[j] <- coef(modsumm)[2, 4]
      r2[j] <- modsumm$r.squared
      tstats[j] <- coef(modsumm)[2, 3]
}

## most R2s should be near zero, given that y and x have
## been generated independently

## replicate R2 slide from Brooks
hist(r2, breaks=50)

## idem tstats slide
hist(tstats, breaks=50)
## (mark the critical values:)
abline(v=-1.96, col="red")
abline(v=1.96, col="red")

## perhaps more interestingly, look at p-values for the t-test
hist(pvals, breaks=50)
## empirical rejection rate:
sum(pvals < 0.05)/length(pvals)



## now a cointegrating regression (variables are nonstationary
## but actually related)
u <- rnorm(100)

x <- rep(NA, 100)
x[1] <- rnorm(1)
for(i in 2:100) x[i] <- x[i-1] + rnorm(1)

y <- 4*x + u

mod <- lm(y~x)
summary(mod)
plot(resid(mod), type="l", col="blue")

## what about the residual serial correlation?
bgtest(mod)

## are the residuals stationary?
library(tseries)
adf.test(residuals(mod))