## mtcars dataset ## load data data(mtcars) ## Testing for functional form: ("Hypothesis 0") ## estimate model of car range given weight and power fm1 <- mpg ~ hp + wt mod1 <- lm(formula=fm1, data=mtcars) ## inspect results? Wait! They are meaningful only if residuals are "well behaved" #summary(mod1) ## (always!) visualize residuals ## in original order: plot(resid(mod1), type="l", col="blue") abline(h=0, col="orange") ## against y: plot(mtcars$mpg, resid(mod1), col="red") abline(h=0, col="orange") ## signs of nonlinear behaviour! ## RamsEy's RESET test: library(lmtest) reset(mod1) ## by hand, it would look like: auxmod <- lm(resid(mod1) ~ I(fitted(mod1)^2) + I(fitted(mod1)^3)) summary(auxmod) ## Chi2 verson, as in Brooks: rst <- length(resid(mod1))*summary(auxmod)$r.squared 1-pchisq(rst, df=2) ## what do you conclude? ## Transform the specification into consumption (instead of range) and see ## if linearity becomes acceptable: ## estimate model of car consumption (=1/range) given weight and power fm2 <- I(1/mpg*100) ~ hp + wt mod2 <- lm(fm2, mtcars) plot(mtcars$mpg, resid(mod2), col="red") abline(h=0, col="orange") reset(mod2) ## Fine. Take heed it is not always this easy... ## Testing model 2 for heteroskedasticity (Hypothesis 2): ## in original order: plot(resid(mod2)) gqtest(mod2) ## but try to order observations by mileage: plot(resid(mod2)[order(mtcars$mpg)]) ## ...or by weight: plot(resid(mod2)[order(mtcars$wt)]) ## heavier cars clearly have more variance! ## estimate reordered model: mod2.1 <- lm(fm2, mtcars[order(mtcars$wt), ]) gqtest(mod2.1) ## White test is "more robust" to ordering: auxmod.w <- lm(I(resid(mod2)^2) ~ I(hp^2) + I(wt^2) + I(hp*wt), data=mtcars) summary(auxmod.w) wtest <- length(resid(mod2))*summary(auxmod.w)$r.squared 1-pchisq(wtest, df=3) ## Heteroskedasticity is there. What now? ## e.g., try respecifying as power efficiency vs. power to weight ratio fm3 <- I(hp/mpg) ~ I(wt/hp) mod3 <- lm(fm3, mtcars) ## simpler solution: White's robust standard errors library(sandwich) coeftest(mod2, vcov=vcovHC) ## compare SEs: mytable <- cbind(coeftest(mod2)[, c(1,3:4)], coeftest(mod2, vcov=vcovHC)[, 3:4]) dimnames(mytable)[[2]][2:5] <- c("t (OLS S.E.)", "p-value (OLS)", "t (HC S.E.)", "p-value (HC)") round(mytable,4) ## conclusions are qualitatively unchanged ####################### ## Multicollinearity: fm4 <- I(1/mpg*100) ~ hp + wt + disp + qsec mod4 <- lm(fm4, mtcars) summary(mod4) ## three vars look not significant. Are they (jointly!) so? ## F test for the joint restriction rmod <- lm(I(1/mpg*100) ~ wt, mtcars) rrss <- sum(resid(rmod)^2) urss <- sum(resid(mod4)^2) df <- length(resid(mod4)) - length(coef(mod4)) m <- length(coef(mod4))-length(coef(rmod)) Fstat <- (rrss-urss)/urss * df/m 1-pf(Fstat, df1=m, df2=df) ## automagic: waldtest(mod4, c(1,3:4)) ## the reason why they do not "look" significant if taken individually: cor(mtcars[, c("hp", "disp", "qsec")]) ## from here, solutions must be found ad hoc. E.g., hp is most correlated with # the other two summary(update(mod4, . ~ . -hp)) ## (discussion: what is the model needed for) #####################