## mtcars dataset

## load data
data(mtcars)

## Testing for functional form: ("Hypothesis 0")

## estimate model of car range given weight and power
fm1 <- mpg ~ hp + wt

mod1 <- lm(formula=fm1, data=mtcars)

## inspect results? Wait! They are meaningful only if residuals are "well behaved"
#summary(mod1)

## (always!) visualize residuals

## in original order:
plot(resid(mod1), type="l", col="blue")
abline(h=0, col="orange")

## against y:
plot(mtcars$mpg, resid(mod1), col="red")
abline(h=0, col="orange")

## signs of nonlinear behaviour!

## RamsEy's RESET test:
library(lmtest)
reset(mod1)

## by hand, it would look like:
auxmod <- lm(resid(mod1) ~ I(fitted(mod1)^2) + I(fitted(mod1)^3))
summary(auxmod)
## Chi2 verson, as in Brooks:
rst <- length(resid(mod1))*summary(auxmod)$r.squared
1-pchisq(rst, df=2)

## what do you conclude?

## Transform the specification into consumption (instead of range) and see
## if linearity becomes acceptable:

## estimate model of car consumption (=1/range) given weight and power
fm2 <- I(1/mpg*100) ~ hp + wt

mod2 <- lm(fm2, mtcars)

plot(mtcars$mpg, resid(mod2), col="red")
abline(h=0, col="orange")

reset(mod2)

## Fine. Take heed it is not always this easy...

## Testing model 2 for heteroskedasticity (Hypothesis 2):

## in original order:
plot(resid(mod2))
gqtest(mod2)

## but try to order observations by mileage:

plot(resid(mod2)[order(mtcars$mpg)])

## ...or by weight:

plot(resid(mod2)[order(mtcars$wt)])

## heavier cars clearly have more variance!

## estimate reordered model:
mod2.1 <- lm(fm2, mtcars[order(mtcars$wt), ])
gqtest(mod2.1)

## White test is "more robust" to ordering:

auxmod.w <- lm(I(resid(mod2)^2) ~ I(hp^2) + I(wt^2) + I(hp*wt), data=mtcars)
summary(auxmod.w)
wtest <- length(resid(mod2))*summary(auxmod.w)$r.squared
1-pchisq(wtest, df=3)

## Heteroskedasticity is there. What now?

## e.g., try respecifying as power efficiency vs. power to weight ratio
fm3 <- I(hp/mpg) ~ I(wt/hp)
mod3 <- lm(fm3, mtcars)

## simpler solution: White's robust standard errors

library(sandwich)
coeftest(mod2, vcov=vcovHC)

## compare SEs:
mytable <- cbind(coeftest(mod2)[, c(1,3:4)], coeftest(mod2, vcov=vcovHC)[, 3:4])
dimnames(mytable)[[2]][2:5] <- c("t (OLS S.E.)", "p-value (OLS)",
                                 "t (HC S.E.)", "p-value (HC)")
round(mytable,4)

## conclusions are qualitatively unchanged

#######################

## Multicollinearity:

fm4 <- I(1/mpg*100) ~ hp + wt + disp + qsec
mod4 <- lm(fm4, mtcars)
summary(mod4)

## three vars look not significant. Are they (jointly!) so?

## F test for the joint restriction
rmod <- lm(I(1/mpg*100) ~ wt, mtcars)

rrss <- sum(resid(rmod)^2)
urss <- sum(resid(mod4)^2)
df <- length(resid(mod4)) - length(coef(mod4))
m <- length(coef(mod4))-length(coef(rmod))
Fstat <- (rrss-urss)/urss * df/m
1-pf(Fstat, df1=m, df2=df)

## automagic:
waldtest(mod4, c(1,3:4))

## the reason why they do not "look" significant if taken individually:

cor(mtcars[, c("hp", "disp", "qsec")])

## from here, solutions must be found ad hoc. E.g., hp is most correlated with
# the other two
summary(update(mod4, . ~ . -hp))

## (discussion: what is the model needed for)


#####################