###############################################
# Lab 6: Splines - GLM - CART - Random Forest #
###############################################
setwd("C:/Users/Gioia/Desktop/Lab6_2122")

###########
# Splines #
###########
library(splines)
library(ggplot2)
library(MASS)
########################
# To do during the lab #
########################
# Load the dataset Boston in the MASS package
data(Boston)
str(Boston)

# Scatterplot of medv (y) and lstat (x) by using ggplot()
ggplot(Boston, aes(lstat, medv))+
 geom_point()

# Fit a simple linear model 
fit <- lm(medv ~ lstat, data = Boston)

# Analysis of the residual
par(mfrow=c(2,2))
plot(fit)

# Fit a linear model considering a quadratic term for lstat
# Two way to do this:
fit.poly2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
fit.poly2 <- lm(medv ~ poly(lstat, 2, raw = TRUE), data = Boston)

# Analysis of the residual 

plot(fit.poly2)


# Fit a linear model considering a polynomial of degree 5 for lstat
fit.poly5 <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = Boston)

# Plot the fitted curve by using ggplot
ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))+
 stat_smooth(method = lm, formula = y ~ x, col = "red") +
 stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE), col = "green") 

# Now we consider the splines, namely we consider B-splines
# We fix the (internal) knots at first, second and third quartile
# By default we consider a cubic regression spline 
knots <- quantile(Boston$lstat)[2:4]
fit.spline <- lm(medv ~ bs(lstat, knots=knots), data=Boston)

summary(fit.spline)

# Similar results could be obtain by specifing the degrees of freedom 
# equal to 6 as an alternative to the specification of the knots
fit.spline <- lm(medv ~ bs(lstat, df = 6), data=Boston)

summary(fit.spline)

# Plot the fitted lines comparing:
# model 1: polynomial of degree = 3 (BLUE)
# model 2: cubic B-splines with df = 6 (RED)
# model 3: cubic B-splines with df = 100 (GREEN)

ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ poly(x, 3, raw = TRUE) , se = FALSE)+
 stat_smooth(method = lm, formula = y ~ bs(x, df = 6), col = "red", se = FALSE) +
 stat_smooth(method = lm, formula = y ~ bs(x, df = 100), col = "green", se = FALSE ) 


########################
# To do during the lab #
########################
# Plot the fitted lines comparing three different spcification of 
# spline degrees (consider one knot on the median):
# model 1: cubic B-splines (BLUE)
# model 2: quadratic B-splines (RED)
# model 3: linear B-splines (GREEN)
knot <- knots[2]
# knot <- median(Boston$lstat)
ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot), se = FALSE)+
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 2), col = "red", se = FALSE) +
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 1), col = "green", se = FALSE ) 

########
# GAMs #
########
library(mgcv)
library(PerformanceAnalytics)
########################
# To do during the lab #
########################
# Consider the tree dataset
str(trees)
# Look the data
chart.Correlation(trees)

# Otherwise by using pairs
pairs(trees)
pairs(trees, panel = panel.smooth)

# Customize the diagonal panel (you can also customize the upper and diagonal panel)
panel.hist <- function(x, ...){
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts
    y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "grey", ...)
}

pairs(trees, panel = panel.smooth, main = "trees data",diag.panel = panel.hist)

########################
# To do during the lab #
########################
# Compare a GLM with a GAM considering a single predictor(Height)
# and the outcome (Volume)  distributed according a Gamma 
glm.1 <- glm(Volume ~ Height, data = trees,
             family = Gamma(link = "log")) 
gam.1 <- gam(Volume ~ s(Height), data = trees,
             family = Gamma(link = "log")) 

# Look the summary
summary(glm.1)
summary(gam.1)

# Look the coefficient of gam.1
coef(gam.1)

# Plot the predicted values for both models
par(mfrow=c(1,1))
with(trees, plot(Height, Volume, xlab = "Height", ylab = "Observed and predicted values"))
points(trees$Height, glm.1$fitted.values, pch=19, col="blue")
points(trees$Height, gam.1$fitted.values, pch=19, col="red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

########################
# To do during the lab #
########################
# Diagnostic plot: represents the smooth function and the partial residuals
plot(gam.1,residuals=TRUE,pch=19)

# The points represented are
pr_data<-cbind(trees$Height,gam.1$residuals +
               gam(Volume ~ s(Height), data = trees,
                   family = Gamma(link = "log"),
                   fit=FALSE)$X[,2:10] %*%coef(gam.1)[2:10]) 


########################
# To do during the lab #
########################
# Adding another predictor to the model (compare GLM with GAM)
glm.2 <- glm(Volume ~ Height + Girth, data = trees,
             family = Gamma(link = "log")) 
gam.2 <- gam(Volume ~ s(Height) + s(Girth), data = trees,
             family = Gamma(link = "log")) 
summary(gam.2)

# Plot the predicted values (w.r.t. the predictors) for both models
# Height
with(trees, plot(Height, Volume, xlab = "Height", ylab = "Observed and predicted values"))
points(trees$Height, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Height, gam.2$fitted.values, pch = 19, col = "red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

# Girth
with(trees, plot(Girth, Volume, xlab = "Girth", ylab = "Observed and predicted values"))
points(trees$Girth, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Girth, gam.2$fitted.values, pch = 19, col = "red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

# Diagnostic plot:
plot(gam.2, residuals = TRUE, pch = 19, pages = 1)


########################
# To do during the lab #
########################
# Fitted surface by using vis.gam() function
par(mfrow=c(1,2))
vis.gam(gam.2, theta = -45, color = "terrain", type = "response")
vis.gam(gam.2, theta = -45, color = "terrain", type = "link")

# Contour plot by using vis.gam() function
par(mfrow=c(1,2))
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "response")
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "link")


########################
# To do during the lab #
########################
# Compare the fitted models by using AIC
AIC(glm.1, gam.1, glm.2, gam.2) 

########################
# To do during the lab #
########################
# Fit the model setting the smoothing paramters to be 0s 8unpenalized regression spline)
gam.2.unp <- gam(Volume ~ s(Height, fx = TRUE) + s(Girth, fx = TRUE), data = trees,
             family = Gamma(link = "log")) 
gam.2.unp$sp
# Alternatively 
gam.2.unp <- gam(Volume ~ s(Height, sp = 0) + s(Girth, sp = 0), data = trees,
             family = Gamma(link = "log")) 
# Diagnostic plot
plot(gam.2.unp, residuals = TRUE, pages = 1, pch = 19)
# Fitted surface 
par(mfrow=c(1,1))
vis.gam(gam.2.unp, theta = -45, type = "link", color  = "terrain")

########################
# To do during the lab #
########################
# Look the smoothing parameter of gam.2
sp <- gam.2$sp
sp

# Let's look at the AIC/GCV and how the smoothing parameter is selected
# Consider a grid of value for (lambda1, lambda2) by analysing sp
lamb1 <- c(166396,166397,166398)
lamb2 <- seq(0.01,1, by = 0.0001)

sp.grid <- expand.grid(lamb1, lamb2 )
n.tuning <- nrow(sp.grid)


minus2loglik <- rep(NA, n.tuning) 
edf <- rep(NA, n.tuning) 
aic <- rep(NA, n.tuning) 
gcv <- rep(NA, n.tuning) 

for(i in 1:n.tuning){
 gamobj <- gam(Volume ~ s(Height) + s(Girth), data = trees,
             family = Gamma(link = "log"), sp = sp.grid[i,]) 
 minus2loglik[i] <- -2*logLik(gamobj)
 edf[i] <- sum(gamobj$edf) + 1
 aic[i] <- AIC(gamobj)
 gcv[i] <- gamobj$gcv.ubre
 print(i)
}

# Some Plots 
par(mfrow=c(2,2))
plot(minus2loglik, type = "b", main = "-2 log Likelihood")
plot( edf, type = "b", main = "effective number of parameter")
plot( aic, type = "b", main = "AIC")
plot(gcv, type = "b", main = "GCV")

# Find the minimum
opt.sp <- sp.grid[which.min(gcv),]
opt.sp 

sp

gam.2.opt <- gam(Volume ~ s(Height) + s(Girth),
                 data = trees, family = Gamma(link = "log"), sp = opt.sp) 
# Compare the AIC (gam.2 vs gam.2.opt)
AIC(gam.2, gam.2.opt)


########################################
# To do during the lab: It's your turn #
########################################
# 1) Simulate some Poisson data (consider gamSim function, with defaults eg=1, scale=0.1)
# 2) Fit a GAM and print the results. Interpret the fit
# 3) Produce some plots for each x_j plotted agains s(x_j), and comment. 
#    Do you may drop some covariates or specyfing it in a different way
# 4) Compute the AIC between your models and a GLM


########################################
library(reshape)
library(rpart)
library(rpart.plot)

###############################
# Tree - based classification #
###############################
# Load the dataset spambase
spam <-read.table("spambase.data", sep = ",")
# See the structure
str(spam)
# Take the columns
# 58: "yesno" 
# 57: "crl.tot"
# 53: "dollar"
# 52: "bang"
# 24: "money"
# 23: "n000"
#  1: "make" 
spam <- spam[, c(58,57,53,52,24,23,1)]

# Give the names to the columns
colnames(spam) <- c("yesno", "crl.tot", "dollar", "bang", "money", "n000", "make")

# Covert the yesno variable in a factor and assign "n" to 0 and "y" to 1
spam$yesno <- factor(spam$yesno, levels = c(0,1))
levels(spam$yesno) <- c("n","y")

# Produce the boxpolot
par(mfrow=c(2,3))
for(i in 2:7){
 boxplot(spam[,i] ~ yesno, data = spam, ylab=colnames(spam)[i])
}
########################
# To do during the lab #
########################
# Fit a glm 
glm.spam <- glm(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                data = spam, family = binomial("logit") )
summary(glm.spam)

########################
# To do during the lab #
########################
# Consider a classification tree
spam.rpart <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                    data = spam, method = "class")
# Plot the tree
par(mfrow=c(1,1))
plot(spam.rpart)
text(spam.rpart)

########################
# To do during the lab #
########################
# See the summary of the decision  tree
printcp(spam.rpart)

########################
# To do during the lab #
########################
# Fix cp = 0
spam.rpart.cp0 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                    data = spam, method = "class", cp=0)
# See the summary of the decision  tree
printcp(spam.rpart.cp0)

########################
# To do during the lab #
########################
# See the plot of the decision  tree
plotcp(spam.rpart.cp0)

########################
# To do during the lab #
########################
# Select the best split
best.cp <- spam.rpart.cp0$cptable[which.min(spam.rpart.cp0$cptable[,"xerror"]),]
best.cp

sd.rule<- best.cp["xerror"] + best.cp["xstd"]
cptable.sd.rule <- spam.rpart.cp0$cptable[spam.rpart.cp0$cptable[,"xerror"]<=sd.rule,]
best.cp.sd <- cptable.sd.rule[which.min(cptable.sd.rule[,"nsplit"]),]

# Plot the pruned tree 
# According to best.cp
tree.pruned <-prune(spam.rpart.cp0, cp=best.cp[1])
rpart.plot(tree.pruned, extra=106,box.palette="GnBu", branch.lty = 3,
           shadow.col = "gray", nn=TRUE)
printcp(tree.pruned)

# According to the standard deviation rule
tree.pruned.sd <-prune(spam.rpart.cp0, cp=best.cp.sd[1])
printcp(tree.pruned.sd)

rpart.plot(tree.pruned.sd, extra=106,box.palette="GnBu", branch.lty = 3,
           shadow.col = "gray", nn=TRUE)


#################
# Random Forest #
#################
library(randomForest)

########################
# To do during the lab #
########################
spam.rf <- randomForest(yesno ~ ., data = spam, importance =TRUE)
print(spam.rf)

importance(spam.rf)
varImpPlot(spam.rf, sort=TRUE)

help(rpart.plot)