###############################################
# Lab 6: Splines - GLM - CART - Random Forest #
###############################################

setwd("C:/Users/Gioia/Desktop/Lab6_2122")

###########
# Splines #
###########

library(splines)
library(ggplot2)
library(MASS)

########################
# To do during the lab #
########################

# Load the Boston dataset from the MASS package
data(Boston)
str(Boston)

# Scatterplot of medv (y) against lstat (x) using ggplot()
ggplot(Boston, aes(lstat, medv)) +
  geom_point()

# Fit a simple linear model
fit <- lm(medv ~ lstat, data = Boston)

# Analysis of the residuals
par(mfrow = c(2, 2))
plot(fit)

# Fit a linear model with a quadratic term for lstat
# Two ways to do this:
fit.poly2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
fit.poly2 <- lm(medv ~ poly(lstat, 2, raw = TRUE), data = Boston)

# Analysis of the residuals
plot(fit.poly2)

# Fit a linear model with a polynomial of degree 5 for lstat
fit.poly5 <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = Boston)

# Plot the fitted curves using ggplot
ggplot(Boston, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE)) +
  stat_smooth(method = lm, formula = y ~ x, col = "red") +
  stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE), col = "green")

# Now we consider splines, namely B-splines.
# We fix the (internal) knots at the first, second and third quartiles.
# By default bs() gives a cubic regression spline.
knots <- quantile(Boston$lstat)[2:4]
fit.spline <- lm(medv ~ bs(lstat, knots = knots), data = Boston)
summary(fit.spline)

# Similar results can be obtained by specifying the degrees of freedom
# (df = 6) as an alternative to specifying the knots
fit.spline <- lm(medv ~ bs(lstat, df = 6), data = Boston)
summary(fit.spline)

# Plot the fitted lines comparing:
# model 1: polynomial of degree 3         (BLUE)
# model 2: cubic B-splines with df = 6    (RED)
# model 3: cubic B-splines with df = 100  (GREEN)
ggplot(Boston, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 3, raw = TRUE), se = FALSE) +
  stat_smooth(method = lm, formula = y ~ bs(x, df = 6), col = "red", se = FALSE) +
  stat_smooth(method = lm, formula = y ~ bs(x, df = 100), col = "green", se = FALSE)

########################
# To do during the lab #
########################

# Plot the fitted lines comparing three different specifications of
# the spline degree (with one knot at the median):
# model 1: cubic B-splines     (BLUE)
# model 2: quadratic B-splines (RED)
# model 3: linear B-splines    (GREEN)
knot <- knots[2]  # equivalently: knot <- median(Boston$lstat)
ggplot(Boston, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ bs(x, knots = knot), se = FALSE) +
  stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 2),
              col = "red", se = FALSE) +
  stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 1),
              col = "green", se = FALSE)

########
# GAMs #
########

library(mgcv)
library(PerformanceAnalytics)

########################
# To do during the lab #
########################

# Consider the trees dataset
str(trees)

# Look at the data
chart.Correlation(trees)

# Alternatively, use pairs()
pairs(trees)
pairs(trees, panel = panel.smooth)

# Customize the diagonal panel (the upper and lower panels can be
# customized in the same way; see the sketch below)
panel.hist <- function(x, ...){
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "grey", ...)
}
pairs(trees, panel = panel.smooth, main = "trees data", diag.panel = panel.hist)
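# Optional sketch (not in the original lab): the off-diagonal panels can
# be customized too. panel.cor below is a hypothetical helper adapted
# from the examples in ?pairs; it prints the pairwise correlation in the
# upper panels.
panel.cor <- function(x, y, digits = 2, ...){
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y)
  text(0.5, 0.5, format(r, digits = digits), cex = 1.5)
}
pairs(trees, lower.panel = panel.smooth, upper.panel = panel.cor,
      diag.panel = panel.hist, main = "trees data")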
########################
# To do during the lab #
########################

# Compare a GLM with a GAM using a single predictor (Height),
# with the outcome (Volume) assumed to follow a Gamma distribution
glm.1 <- glm(Volume ~ Height, data = trees, family = Gamma(link = "log"))
gam.1 <- gam(Volume ~ s(Height), data = trees, family = Gamma(link = "log"))

# Look at the summaries
summary(glm.1)
summary(gam.1)

# Look at the coefficients of gam.1
coef(gam.1)

# Plot the predicted values for both models
par(mfrow = c(1, 1))
with(trees, plot(Height, Volume, xlab = "Height",
                 ylab = "Observed and predicted values"))
points(trees$Height, glm.1$fitted.values, pch = 19, col = "blue")
points(trees$Height, gam.1$fitted.values, pch = 19, col = "red", cex = 0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

########################
# To do during the lab #
########################

# Diagnostic plot: shows the estimated smooth function and the
# partial residuals
plot(gam.1, residuals = TRUE, pch = 19)

# The plotted points are the partial residuals: with the default basis
# dimension (k = 10), s(Height) contributes the coefficients [2:10]
pr_data <- cbind(trees$Height,
                 gam.1$residuals +
                   gam(Volume ~ s(Height), data = trees,
                       family = Gamma(link = "log"),
                       fit = FALSE)$X[, 2:10] %*% coef(gam.1)[2:10])

########################
# To do during the lab #
########################

# Add another predictor to the model (compare GLM with GAM)
glm.2 <- glm(Volume ~ Height + Girth, data = trees,
             family = Gamma(link = "log"))
gam.2 <- gam(Volume ~ s(Height) + s(Girth), data = trees,
             family = Gamma(link = "log"))
summary(gam.2)

# Plot the predicted values (against each predictor) for both models
# Height
with(trees, plot(Height, Volume, xlab = "Height",
                 ylab = "Observed and predicted values"))
points(trees$Height, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Height, gam.2$fitted.values, pch = 19, col = "red", cex = 0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))
# Girth
with(trees, plot(Girth, Volume, xlab = "Girth",
                 ylab = "Observed and predicted values"))
points(trees$Girth, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Girth, gam.2$fitted.values, pch = 19, col = "red", cex = 0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

# Diagnostic plots:
plot(gam.2, residuals = TRUE, pch = 19, pages = 1)

########################
# To do during the lab #
########################

# Fitted surface using vis.gam()
par(mfrow = c(1, 2))
vis.gam(gam.2, theta = -45, color = "terrain", type = "response")
vis.gam(gam.2, theta = -45, color = "terrain", type = "link")

# Contour plots using vis.gam()
par(mfrow = c(1, 2))
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "response")
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "link")

########################
# To do during the lab #
########################

# Compare the fitted models using the AIC
AIC(glm.1, gam.1, glm.2, gam.2)

########################
# To do during the lab #
########################

# Fit the model with the smoothing parameters set to zero
# (unpenalized regression splines)
gam.2.unp <- gam(Volume ~ s(Height, fx = TRUE) + s(Girth, fx = TRUE),
                 data = trees, family = Gamma(link = "log"))
gam.2.unp$sp

# Alternatively
gam.2.unp <- gam(Volume ~ s(Height, sp = 0) + s(Girth, sp = 0),
                 data = trees, family = Gamma(link = "log"))

# Diagnostic plots
plot(gam.2.unp, residuals = TRUE, pages = 1, pch = 19)

# Fitted surface
par(mfrow = c(1, 1))
vis.gam(gam.2.unp, theta = -45, type = "link", color = "terrain")
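# Optional sketch (not in the original lab): mgcv also provides
# gam.check(), which draws residual diagnostics and reports a check on
# the basis dimensions; useful for contrasting the penalized fit gam.2
# with the unpenalized gam.2.unp above
par(mfrow = c(2, 2))
gam.check(gam.2)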
= "terrain") ######################## # To do during the lab # ######################## # Look the smoothing parameter of gam.2 sp <- gam.2$sp sp # Let's look at the AIC/GCV and how the smoothing parameter is selected # Consider a grid of value for (lambda1, lambda2) by analysing sp lamb1 <- c(166396,166397,166398) lamb2 <- seq(0.01,1, by = 0.0001) sp.grid <- expand.grid(lamb1, lamb2 ) n.tuning <- nrow(sp.grid) minus2loglik <- rep(NA, n.tuning) edf <- rep(NA, n.tuning) aic <- rep(NA, n.tuning) gcv <- rep(NA, n.tuning) for(i in 1:n.tuning){ gamobj <- gam(Volume ~ s(Height) + s(Girth), data = trees, family = Gamma(link = "log"), sp = sp.grid[i,]) minus2loglik[i] <- -2*logLik(gamobj) edf[i] <- sum(gamobj$edf) + 1 aic[i] <- AIC(gamobj) gcv[i] <- gamobj$gcv.ubre print(i) } # Some Plots par(mfrow=c(2,2)) plot(minus2loglik, type = "b", main = "-2 log Likelihood") plot( edf, type = "b", main = "effective number of parameter") plot( aic, type = "b", main = "AIC") plot(gcv, type = "b", main = "GCV") # Find the minimum opt.sp <- sp.grid[which.min(gcv),] opt.sp sp gam.2.opt <- gam(Volume ~ s(Height) + s(Girth), data = trees, family = Gamma(link = "log"), sp = opt.sp) # Compare the AIC (gam.2 vs gam.2.opt) AIC(gam.2, gam.2.opt) ######################################## # To do during the lab: It's your turn # ######################################## # 1) Simulate some Poisson data (consider gamSim function, with defaults eg=1, scale=0.1) # 2) Fit a GAM and print the results. Interpret the fit # 3) Produce some plots for each x_j plotted agains s(x_j), and comment. # Do you may drop some covariates or specyfing it in a different way # 4) Compute the AIC between your models and a GLM ######################################## library(reshape) library(rpart) library(rpart.plot) ############################### # Tree - based classification # ############################### # Load the dataset spambase spam <-read.table("spambase.data", sep = ",") # See the structure str(spam) # Take the columns # 58: "yesno" # 57: "crl.tot" # 53: "dollar" # 52: "bang" # 24: "money" # 23: "n000" # 1: "make" spam <- spam[, c(58,57,53,52,24,23,1)] # Give the names to the columns colnames(spam) <- c("yesno", "crl.tot", "dollar", "bang", "money", "n000", "make") # Covert the yesno variable in a factor and assign "n" to 0 and "y" to 1 spam$yesno <- factor(spam$yesno, levels = c(0,1)) levels(spam$yesno) <- c("n","y") # Produce the boxpolot par(mfrow=c(2,3)) for(i in 2:7){ boxplot(spam[,i] ~ yesno, data = spam, ylab=colnames(spam)[i]) } ######################## # To do during the lab # ######################## # Fit a glm glm.spam <- glm(yesno ~ crl.tot + dollar + bang + money + n000 + make , data = spam, family = binomial("logit") ) summary(glm.spam) ######################## # To do during the lab # ######################## # Consider a classification tree spam.rpart <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make , data = spam, method = "class") # Plot the tree par(mfrow=c(1,1)) plot(spam.rpart) text(spam.rpart) ######################## # To do during the lab # ######################## # See the summary of the decision tree printcp(spam.rpart) ######################## # To do during the lab # ######################## # Fix cp = 0 spam.rpart.cp0 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make , data = spam, method = "class", cp=0) # See the summary of the decision tree printcp(spam.rpart.cp0) ######################## # To do during the lab # ######################## # See the plot of the decision tree 
########################
# To do during the lab #
########################

# Select the best split
best.cp <- spam.rpart.cp0$cptable[which.min(spam.rpart.cp0$cptable[, "xerror"]), ]
best.cp
# Standard deviation (one-SE) rule: keep the smallest tree whose
# cross-validated error is within one standard error of the minimum
sd.rule <- best.cp["xerror"] + best.cp["xstd"]
cptable.sd.rule <- spam.rpart.cp0$cptable[spam.rpart.cp0$cptable[, "xerror"] <= sd.rule, ]
best.cp.sd <- cptable.sd.rule[which.min(cptable.sd.rule[, "nsplit"]), ]

# Plot the pruned trees
# According to best.cp
tree.pruned <- prune(spam.rpart.cp0, cp = best.cp[1])
rpart.plot(tree.pruned, extra = 106, box.palette = "GnBu",
           branch.lty = 3, shadow.col = "gray", nn = TRUE)
printcp(tree.pruned)

# According to the standard deviation rule
tree.pruned.sd <- prune(spam.rpart.cp0, cp = best.cp.sd[1])
printcp(tree.pruned.sd)
rpart.plot(tree.pruned.sd, extra = 106, box.palette = "GnBu",
           branch.lty = 3, shadow.col = "gray", nn = TRUE)
# See help(rpart.plot) for the plotting options used above

#################
# Random Forest #
#################

library(randomForest)

########################
# To do during the lab #
########################

spam.rf <- randomForest(yesno ~ ., data = spam, importance = TRUE)
print(spam.rf)
importance(spam.rf)
varImpPlot(spam.rf, sort = TRUE)
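# Optional sketch (not part of the original lab): OOB error as a
# function of the number of trees, plus the OOB confusion matrix;
# predict() on a randomForest object without new data returns the
# out-of-bag predictions
plot(spam.rf)
pred.rf <- predict(spam.rf)
table(observed = spam$yesno, predicted = pred.rf)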