################################################# # Laboratory 4 - 24/11/2021: Linear Regression ~# ################################################# # load the library DAAG with the dataset nihills library(DAAG) nihills n <- nrow(nihills) str(nihills) ############################ # Simple Linear Regression # ############################ # scatterplot: dist (x) against time (y) plot(nihills$dist, nihills$time, pch=19, xlab="distance", ylab="time") # Alternatively with(nihills, plot(dist, time, pch=19)) # Fit the linear model 1: time = beta_0+beta_1*dist + epsilon lm1 <- lm(time ~ dist, data=nihills) summary(lm1) #Call: #lm(formula = time ~ dist, data = nihills) # #Residuals: # Min 1Q Median 3Q Max #-0.42812 -0.12193 -0.00250 0.09232 0.43028 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) -0.32530 0.07629 -4.264 0.000345 *** #dist 0.20094 0.01120 17.934 3.28e-14 *** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 0.1935 on 21 degrees of freedom #Multiple R-squared: 0.9387, Adjusted R-squared: 0.9358 #F-statistic: 321.6 on 1 and 21 DF, p-value: 3.278e-14 # plot of the residuals par(mfrow=c(2,2)) plot(lm1) # Plot of the fitted values par(mfrow=c(1,1)) with(nihills, plot(dist, time, pch=19)) abline(coef(lm1), col="red", lty="solid") # or #curve(predict(lm1, data.frame(dist=x)), col="red", lty="solid", lwd=2, add=TRUE) text(13,3,expression(time==hat(beta)[0]+hat(beta)[1]*dist), col="red") points(nihills$dist, predict(lm1), col="red", pch=19, cex=0.8) # or #points(nihills$dist, fitted(lm1), col="red", pch=19, cex=0.8) nihills[17,] # dist climb time timef #Flagstaff to Carling 11 3000 1.456944 2.034444 segments(nihills[17,]$dist,nihills[17,]$time, nihills[17,]$dist,fitted(lm1)[17], lty="dashed") # or #segments(nihills[17,"dist"],nihills[17,"time"], # nihills[17,"dist"],fitted(lm1)[17], lty="dashed") # or #segments(nihills[17,1],nihills[17,3], # nihills[17,1],fitted(lm1)[17], lty="dashed") # R_sq Tot_SS <- with(nihills, sum((time-mean(time))^2)) Res_SS <- with(nihills, sum((predict(lm1)-time)^2)) Mod_SS <- with(nihills, sum((predict(lm1)-mean(time))^2)) R_sq <- 1-Res_SS/Tot_SS #[1] 0.9387112 Mod_SS/Tot_SS #[1] 0.9387112 # In the simple linear regression (one predictor) the R-square is # equal to the square of the correlation coefficient with(nihills, cor(time, dist))^2 #[1] 0.9387112 ############################################################# ############################## # Multiple Linear Regression # ############################## # Scatterplot matrix library(PerformanceAnalytics) chart.Correlation(nihills[,c("dist", "climb", "time")]) # Model 2: time= beta0+beta1*dist+beta2*climb lm2 <- lm(time ~ dist + climb, data = nihills) summary(lm2) #Call: #lm(formula = time ~ dist + climb, data = nihills) # #Residuals: # Min 1Q Median 3Q Max #-0.19857 -0.04824 0.01701 0.05539 0.21083 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) -2.286e-01 4.025e-02 -5.679 1.47e-05 *** #dist 1.008e-01 1.382e-02 7.293 4.72e-07 *** #climb 2.298e-04 2.893e-05 7.941 1.31e-07 *** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 0.0973 on 20 degrees of freedom #Multiple R-squared: 0.9852, Adjusted R-squared: 0.9838 #F-statistic: 667.6 on 2 and 20 DF, p-value: < 2.2e-16 par(mfrow=c(2,2)) plot(lm2) # Extract the coefficients summary(lm2)$coef # Estimate Std. Error t value Pr(>|t|) #(Intercept) -0.2285705228 4.024640e-02 -5.679279 1.473367e-05 #dist 0.1007603621 1.381667e-02 7.292669 4.724521e-07 #climb 0.0002297608 2.893385e-05 7.940899 1.306900e-07 #or coef(lm2) # R-squared summary(lm2)$adj.r.squared #[1] 0.9837661 # Plot the residuals against the dist variable par(mfrow=c(1,1)) plot(nihills$dist, lm2$residuals) abline(h=0, lty="dashed") # Model 3: time= beta0+beta1*dist+beta2*dist^2 lm3 <- lm(time ~ dist + I(dist^2), data=nihills) summary(lm3) # Plot of the residuals and the fitted values par(mfrow=c(2,2)) plot(lm3) par(mfrow=c(1,1)) with(nihills, plot(dist, time, pch=19)) curve(predict(lm3, data.frame(dist=x)), col="red", lty="solid", lwd=2, add=TRUE) text(10,3,expression(time==hat(beta)[0]+hat(beta)[1]*dist+hat(beta)[1]*dist^2), col="red") points(nihills$dist, predict(lm3), col="red", pch=19, cex=0.8) # Model 4: log-transformation # Look the summary of data summary(nihills) # dist climb time timef # Min. : 2.500 Min. : 750 Min. :0.3247 Min. :0.4092 # 1st Qu.: 4.000 1st Qu.:1205 1st Qu.:0.4692 1st Qu.:0.6158 # Median : 4.500 Median :1500 Median :0.5506 Median :0.7017 # Mean : 5.778 Mean :2098 Mean :0.8358 Mean :1.1107 # 3rd Qu.: 5.800 3rd Qu.:2245 3rd Qu.:0.7857 3rd Qu.:1.0014 # Max. :18.900 Max. :8775 Max. :3.9028 Max. :5.9856 # Look the histograms of time, dist and climb par(mfrow=c(1,3)) hist(nihills$time, probability=TRUE, breaks=15) hist(nihills$climb, probability=TRUE, breaks=15) hist(nihills$dist, probability=TRUE, breaks=15) # Transform the variable in the log-scale #(add the variables to the nihills data.frame) nihills$log_time<-log(nihills$time) nihills$log_climb<-log(nihills$climb) nihills$log_dist<-log(nihills$dist) # See the scatterplot matrix chart.Correlation(nihills[,c("log_time","log_climb", "log_dist")]) lm4 <- lm(log_time ~ log_dist + log_climb, data=nihills) summary(lm4) #Call: #lm(formula = log_time ~ log_dist + log_climb, data = nihills) # #Residuals: # Min 1Q Median 3Q Max #-0.17223 -0.04229 -0.02538 0.05222 0.13150 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) -4.96113 0.27387 -18.11 7.09e-14 *** #log_dist 0.68136 0.05518 12.35 8.19e-11 *** #log_climb 0.46576 0.04530 10.28 1.98e-09 *** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 0.0766 on 20 degrees of freedom #Multiple R-squared: 0.9831, Adjusted R-squared: 0.9814 #F-statistic: 582.7 on 2 and 20 DF, p-value: < 2.2e-16 par(mfrow=c(2,2)) plot(lm4) summary(lm4)$adj.r.squared #[1] 0.981441 ####### # Compute AIC and BIC AIC <- rbind(extractAIC(lm1)[2],extractAIC(lm2)[2], extractAIC(lm3)[2],extractAIC(lm4)[2]) BIC <- rbind(extractAIC(lm1, k=log(n))[2],extractAIC(lm2,k=log(n))[2], extractAIC(lm3,k=log(n))[2],extractAIC(lm4,k=log(n))[2]) cbind(AIC, BIC) # [,1] [,2] #[1,] -73.64315 -71.37216 #[2,] -104.39068 -100.98419 #[3,] -98.42631 -95.01983 #[4,] -115.39756 -111.99108 # Evaluate the prediction accuracy for the log-model: # coverage intervals for predicted values coverage_log <- exp(predict(lm4, interval="confidence")) coverage_log[1:5,] freq_coverage_log <- mean(nihills$time >= coverage_log[,2] & nihills$time < coverage_log[,3]) ##### # Model Selection # Compare the summary of model 1 with anova() summary(lm1) anova(lm1) # Computing the F-value by hand m0<- lm(time ~ 1, data = nihills) m1<- lm(time ~ dist, data = nihills) F<-(sum(residuals(m0)^2)-sum(residuals(m1)^2))/(sum(residuals(m1)^2)/(n-2)) #[1] 321.6401 pf(F, 1,21,lower.tail=FALSE) #[1] 3.27848e-14 # Model selection for model 4: anova(lm4) #Analysis of Variance Table #Response: log_time # Df Sum Sq Mean Sq F value Pr(>F) #log_dist 1 6.2174 6.2174 1059.7 < 2.2e-16 *** #log_climb 1 0.6202 0.6202 105.7 1.981e-09 *** #Residuals 20 0.1173 0.0059 #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ########################### # Multicollinearity ########################### # Compute the variance inflation factor: vif() vif(lm4) # log_dist log_climb # 2.5543 2.5543 #by hand lm_log_dist_climb<- lm(log_dist ~ log_climb, data=nihills) 1/(1-summary(lm_log_dist_climb)$r.squared) # Naive excercise: include in the model the # predictor logLC = 4 + 3*log(dist) - 2*climb nihills$logLC = 4 + 3*nihills$log_dist - 2*nihills$climb lm5 <- lm(log_time ~ log_dist+log_climb+logLC ,data=nihills) vif(lm5) # log_dist log_climb logLC # 2.9491 8.0909 9.2378 # by hand lm_log1<- lm(log_dist ~ log_climb + logLC, data=nihills) 1/(1-summary(lm_log1)$r.squared) lm_log2<- lm(log_climb ~ log_dist+logLC, data=nihills) 1/(1-summary(lm_log2)$r.squared) lm_log3<- lm(logLC ~ log_dist+log_climb, data=nihills) 1/(1-summary(lm_log3)$r.squared) # Another example: generate # x1 = log(climb) + log(dist) # x2 = log(climb) - log(dist) #Estimate the model (log_time=beta0 + beta1*x1 +beta2*x2 ) nihills$sum <- nihills$log_climb + nihills$log_dist nihills$diff <- nihills$log_climb - nihills$log_dist lm6 <- lm(log_time ~ sum + diff, data=nihills) # Compute the vif vif(lm6) # sum diff #1.1006 1.1006 ######################################################### # Methods of regularization: Ridge and Lasso Regression # ######################################################### #Ridge regression library(MASS) # Tuning parameter = 0 implies least square estimates lm_ridge<- lm.ridge(log_time ~ log_dist + log_climb, lambda=0, data=nihills) coef(lm_ridge) # log_dist log_climb #-4.9611313 0.6813596 0.4657575 coef(lm4) #Call: #lm(formula = log_time ~ log_dist + log_climb, data = nihills) # #Coefficients: #(Intercept) log_dist log_climb # -4.9611 0.6814 0.4658 # select lambda by GCV in the model with logLC grid.ridge<-lm.ridge(log_time ~ log_dist + log_climb + logLC, lambda=seq(0.1,10,0.001), data=nihills) lambda_selected<-grid.ridge$lambda[which(grid.ridge$GCV==min(grid.ridge$GCV))] # or see the last line of the following code #select(lm.ridge(log_time ~ log_dist + log_climb + logLC, # lambda=seq(0.1,10,0.001), data=nihills)) lm_ridge_GCV <- lm.ridge(log_time ~ log_dist + log_climb + logLC, lambda=lambda_selected, data=nihills) coef(lm_ridge_GCV) # log_dist log_climb logLC #-4.097312e+00 6.295220e-01 3.467162e-01 -2.569142e-05 coef(lm5) # (Intercept) log_dist log_climb logLC #-4.278904e+00 6.500840e-01 3.695947e-01 -2.035603e-05 # LASSO library(lasso2) lm.lasso <- l1ce(log_time ~ log_dist + log_climb + logLC, data=nihills) summary(lm.lasso)$coef # Value Std. Error Z score Pr(>|Z|) #(Intercept) -2.304966e+00 2.141078e+00 -1.0765445 0.2816838 #log_dist 3.583981e-01 2.287124e-01 1.5670250 0.1171088 #log_climb 1.757895e-01 3.187876e-01 0.5514315 0.5813379 #logLC -7.166288e-06 5.617907e-05 -0.1275615 0.8984960 coef(lm.lasso) # (Intercept) log_dist log_climb logLC #-2.304966e+00 3.583981e-01 1.757895e-01 -7.166288e-06