#################################################
# Laboratory 4 - 24/11/2021: Linear Regression ~#
#################################################
# load the library DAAG with the dataset nihills
library(DAAG)
nihills
n <- nrow(nihills)

str(nihills)

############################
# Simple Linear Regression #
############################
# scatterplot: dist (x) against time (y)
plot(nihills$dist, nihills$time, pch=19, xlab="distance", ylab="time")
# Alternatively
with(nihills, plot(dist, time, pch=19))

# Fit the linear model 1: time = beta_0+beta_1*dist + epsilon
lm1 <- lm(time ~ dist, data=nihills)

summary(lm1)
#Call:
#lm(formula = time ~ dist, data = nihills)
#
#Residuals:
#     Min       1Q   Median       3Q      Max 
#-0.42812 -0.12193 -0.00250  0.09232  0.43028 
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)    
#(Intercept) -0.32530    0.07629  -4.264 0.000345 ***
#dist         0.20094    0.01120  17.934 3.28e-14 ***
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.1935 on 21 degrees of freedom
#Multiple R-squared:  0.9387,    Adjusted R-squared:  0.9358 
#F-statistic: 321.6 on 1 and 21 DF,  p-value: 3.278e-14


# plot of the residuals
par(mfrow=c(2,2))
plot(lm1)

# Plot of the fitted values
par(mfrow=c(1,1))
with(nihills, plot(dist, time, pch=19))
abline(coef(lm1), col="red", lty="solid")
# or
#curve(predict(lm1, data.frame(dist=x)), col="red", lty="solid", lwd=2, add=TRUE)

text(13,3,expression(time==hat(beta)[0]+hat(beta)[1]*dist), col="red")

points(nihills$dist, predict(lm1), col="red", pch=19, cex=0.8)
# or
#points(nihills$dist, fitted(lm1), col="red", pch=19, cex=0.8)

nihills[17,]
#                     dist climb     time    timef
#Flagstaff to Carling   11  3000 1.456944 2.034444

segments(nihills[17,]$dist,nihills[17,]$time, 
         nihills[17,]$dist,fitted(lm1)[17], lty="dashed")
# or
#segments(nihills[17,"dist"],nihills[17,"time"], 
#         nihills[17,"dist"],fitted(lm1)[17], lty="dashed")
# or 
#segments(nihills[17,1],nihills[17,3], 
#         nihills[17,1],fitted(lm1)[17], lty="dashed")

# R_sq
Tot_SS <- with(nihills, sum((time-mean(time))^2)) 
Res_SS <- with(nihills, sum((predict(lm1)-time)^2))
Mod_SS <- with(nihills, sum((predict(lm1)-mean(time))^2))

R_sq <- 1-Res_SS/Tot_SS
#[1] 0.9387112

Mod_SS/Tot_SS
#[1] 0.9387112

# In the simple linear regression (one predictor) the R-square is 
# equal to the square of the correlation coefficient

with(nihills, cor(time, dist))^2
#[1] 0.9387112

#############################################################

##############################
# Multiple Linear Regression #
##############################
# Scatterplot matrix

library(PerformanceAnalytics)
chart.Correlation(nihills[,c("dist", "climb", "time")])

# Model 2: time= beta0+beta1*dist+beta2*climb
lm2 <- lm(time ~ dist + climb, data = nihills)
summary(lm2)
#Call:
#lm(formula = time ~ dist + climb, data = nihills)
#
#Residuals:
#     Min       1Q   Median       3Q      Max 
#-0.19857 -0.04824  0.01701  0.05539  0.21083 
#
#Coefficients:
#              Estimate Std. Error t value Pr(>|t|)    
#(Intercept) -2.286e-01  4.025e-02  -5.679 1.47e-05 ***
#dist         1.008e-01  1.382e-02   7.293 4.72e-07 ***
#climb        2.298e-04  2.893e-05   7.941 1.31e-07 ***
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.0973 on 20 degrees of freedom
#Multiple R-squared:  0.9852,    Adjusted R-squared:  0.9838 
#F-statistic: 667.6 on 2 and 20 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lm2)

# Extract the coefficients

summary(lm2)$coef
#                Estimate   Std. Error   t value     Pr(>|t|)
#(Intercept) -0.2285705228 4.024640e-02 -5.679279 1.473367e-05
#dist         0.1007603621 1.381667e-02  7.292669 4.724521e-07
#climb        0.0002297608 2.893385e-05  7.940899 1.306900e-07

#or 
coef(lm2)

# R-squared
summary(lm2)$adj.r.squared
#[1] 0.9837661

# Plot the residuals against the dist variable
par(mfrow=c(1,1))
plot(nihills$dist, lm2$residuals)
abline(h=0, lty="dashed")

# Model 3: time= beta0+beta1*dist+beta2*dist^2
lm3 <- lm(time ~ dist + I(dist^2), data=nihills)

summary(lm3) 


# Plot of the residuals and the fitted values
par(mfrow=c(2,2))
plot(lm3)

par(mfrow=c(1,1))
with(nihills, plot(dist, time, pch=19))
curve(predict(lm3, data.frame(dist=x)), col="red", lty="solid", lwd=2, add=TRUE)
text(10,3,expression(time==hat(beta)[0]+hat(beta)[1]*dist+hat(beta)[1]*dist^2), col="red")
points(nihills$dist, predict(lm3), col="red", pch=19, cex=0.8)

# Model 4: log-transformation

# Look the summary of data
summary(nihills)
#      dist            climb           time            timef       
# Min.   : 2.500   Min.   : 750   Min.   :0.3247   Min.   :0.4092  
# 1st Qu.: 4.000   1st Qu.:1205   1st Qu.:0.4692   1st Qu.:0.6158  
# Median : 4.500   Median :1500   Median :0.5506   Median :0.7017  
# Mean   : 5.778   Mean   :2098   Mean   :0.8358   Mean   :1.1107  
# 3rd Qu.: 5.800   3rd Qu.:2245   3rd Qu.:0.7857   3rd Qu.:1.0014  
# Max.   :18.900   Max.   :8775   Max.   :3.9028   Max.   :5.9856  

# Look the histograms of time, dist and climb
par(mfrow=c(1,3))
hist(nihills$time, probability=TRUE, breaks=15)
hist(nihills$climb, probability=TRUE, breaks=15)
hist(nihills$dist, probability=TRUE, breaks=15)

# Transform the variable in the log-scale 
#(add the variables to the nihills data.frame)
nihills$log_time<-log(nihills$time)
nihills$log_climb<-log(nihills$climb)
nihills$log_dist<-log(nihills$dist)

# See the scatterplot matrix
chart.Correlation(nihills[,c("log_time","log_climb", "log_dist")])

lm4 <- lm(log_time ~ log_dist + log_climb, data=nihills)
summary(lm4)
#Call:
#lm(formula = log_time ~ log_dist + log_climb, data = nihills)
#
#Residuals:
#     Min       1Q   Median       3Q      Max 
#-0.17223 -0.04229 -0.02538  0.05222  0.13150 
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)    
#(Intercept) -4.96113    0.27387  -18.11 7.09e-14 ***
#log_dist     0.68136    0.05518   12.35 8.19e-11 ***
#log_climb    0.46576    0.04530   10.28 1.98e-09 ***
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.0766 on 20 degrees of freedom
#Multiple R-squared:  0.9831,    Adjusted R-squared:  0.9814 
#F-statistic: 582.7 on 2 and 20 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lm4)

summary(lm4)$adj.r.squared
#[1] 0.981441


#######
# Compute AIC and BIC
AIC <- rbind(extractAIC(lm1)[2],extractAIC(lm2)[2],
             extractAIC(lm3)[2],extractAIC(lm4)[2])

BIC <- rbind(extractAIC(lm1, k=log(n))[2],extractAIC(lm2,k=log(n))[2],
             extractAIC(lm3,k=log(n))[2],extractAIC(lm4,k=log(n))[2])
cbind(AIC, BIC)
#           [,1]       [,2]
#[1,]  -73.64315  -71.37216
#[2,] -104.39068 -100.98419
#[3,]  -98.42631  -95.01983
#[4,] -115.39756 -111.99108


# Evaluate the prediction accuracy for the log-model:
# coverage intervals for predicted values
coverage_log <- exp(predict(lm4, interval="confidence")) 
coverage_log[1:5,]

freq_coverage_log <- mean(nihills$time >= coverage_log[,2] & 
                          nihills$time < coverage_log[,3])

#####
# Model Selection

# Compare the summary of model 1 with anova()

summary(lm1)

anova(lm1)

# Computing the F-value by hand
m0<- lm(time ~ 1, data = nihills)
m1<- lm(time ~ dist, data = nihills)

F<-(sum(residuals(m0)^2)-sum(residuals(m1)^2))/(sum(residuals(m1)^2)/(n-2))
#[1] 321.6401

pf(F, 1,21,lower.tail=FALSE)
#[1] 3.27848e-14

# Model selection for model 4:

anova(lm4)
#Analysis of Variance Table

#Response: log_time
#          Df Sum Sq Mean Sq F value    Pr(>F)    
#log_dist   1 6.2174  6.2174  1059.7 < 2.2e-16 ***
#log_climb  1 0.6202  0.6202   105.7 1.981e-09 ***
#Residuals 20 0.1173  0.0059                      
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

###########################
# Multicollinearity
###########################
# Compute the variance inflation factor: vif()
vif(lm4)
# log_dist log_climb 
#   2.5543    2.5543 

#by hand
lm_log_dist_climb<- lm(log_dist ~ log_climb, data=nihills)
1/(1-summary(lm_log_dist_climb)$r.squared)


# Naive excercise: include in the model the 
#       predictor logLC = 4 + 3*log(dist) - 2*climb
nihills$logLC = 4 + 3*nihills$log_dist - 2*nihills$climb

lm5 <- lm(log_time ~ log_dist+log_climb+logLC ,data=nihills)
vif(lm5)
# log_dist log_climb     logLC 
#   2.9491    8.0909    9.2378 

# by hand

lm_log1<- lm(log_dist ~ log_climb + logLC, data=nihills)
1/(1-summary(lm_log1)$r.squared)

lm_log2<- lm(log_climb ~ log_dist+logLC, data=nihills)
1/(1-summary(lm_log2)$r.squared)

lm_log3<- lm(logLC ~ log_dist+log_climb, data=nihills)
1/(1-summary(lm_log3)$r.squared)

# Another example: generate 
# x1 = log(climb) + log(dist)
# x2 = log(climb) - log(dist)

#Estimate the model (log_time=beta0 + beta1*x1 +beta2*x2 )
nihills$sum <- nihills$log_climb + nihills$log_dist
nihills$diff <- nihills$log_climb - nihills$log_dist

lm6 <- lm(log_time ~ sum + diff, data=nihills)

# Compute the vif
vif(lm6)
#   sum   diff 
#1.1006 1.1006 


#########################################################
# Methods of regularization: Ridge and Lasso Regression #
#########################################################
#Ridge regression
library(MASS)
# Tuning parameter = 0 implies least square estimates
lm_ridge<- lm.ridge(log_time ~ log_dist + log_climb, lambda=0, data=nihills)
coef(lm_ridge)
#             log_dist  log_climb 
#-4.9611313  0.6813596  0.4657575 

coef(lm4)
#Call:
#lm(formula = log_time ~ log_dist + log_climb, data = nihills)
#
#Coefficients:
#(Intercept)     log_dist    log_climb  
#    -4.9611       0.6814       0.4658  

# select lambda by GCV in the model with logLC
grid.ridge<-lm.ridge(log_time ~ log_dist + log_climb + logLC, 
       lambda=seq(0.1,10,0.001), data=nihills)

lambda_selected<-grid.ridge$lambda[which(grid.ridge$GCV==min(grid.ridge$GCV))]

# or see the last line of the following code
#select(lm.ridge(log_time ~ log_dist + log_climb + logLC, 
#       lambda=seq(0.1,10,0.001), data=nihills))

lm_ridge_GCV <- lm.ridge(log_time ~ log_dist + log_climb + logLC, 
                         lambda=lambda_selected, data=nihills)

coef(lm_ridge_GCV)
#                   log_dist     log_climb         logLC 
#-4.097312e+00  6.295220e-01  3.467162e-01 -2.569142e-05 

coef(lm5)
#  (Intercept)      log_dist     log_climb         logLC 
#-4.278904e+00  6.500840e-01  3.695947e-01 -2.035603e-05 
 

# LASSO
library(lasso2)
lm.lasso <- l1ce(log_time ~ log_dist + log_climb + logLC, data=nihills)

summary(lm.lasso)$coef
#                    Value   Std. Error    Z score  Pr(>|Z|)
#(Intercept) -2.304966e+00 2.141078e+00 -1.0765445 0.2816838
#log_dist     3.583981e-01 2.287124e-01  1.5670250 0.1171088
#log_climb    1.757895e-01 3.187876e-01  0.5514315 0.5813379
#logLC       -7.166288e-06 5.617907e-05 -0.1275615 0.8984960

coef(lm.lasso)
#  (Intercept)      log_dist     log_climb         logLC 
#-2.304966e+00  3.583981e-01  1.757895e-01 -7.166288e-06