Spline functions

Splines are functions defined as piecewise polynomials of fixed degree; the points where the polynomial pieces join are called knots. Different basis expansions can be used to represent a spline function.
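
As a quick illustration of the basis-expansion idea, the bs() function in the splines package evaluates a B-spline basis at a set of points; the sketch below uses an arbitrary grid and arbitrary knot positions (illustrative choices, not tied to any dataset) and shows that a cubic basis with 3 internal knots has 3 + 3 = 6 columns, matching the six spline coefficients of the model fitted later on the Boston data.

# Sketch: evaluate and plot a cubic B-spline basis on an arbitrary grid
library(splines)
x <- seq(0, 1, length.out = 100)        # illustrative grid
B <- bs(x, knots = c(0.25, 0.5, 0.75))  # cubic by default (degree = 3)
dim(B)                                  # 100 x 6: columns = degree + number of internal knots
matplot(x, B, type = "l", lty = 1, ylab = "B-spline basis functions")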

We use a simple application to the Boston dataset, available in the MASS package, to compare the use of polynomials and splines. The dataset collects information on housing values in the suburbs of Boston; in particular, we model the continuous variable medv (median value of owner-occupied homes in $1000s) with a simple linear model that includes the continuous variable lstat (lower status of the population, in percent) as a predictor.

library(splines)
library(ggplot2)
library(MASS)
# Load the dataset Boston in the MASS package
data(Boston)

# Scatterplot of medv (y) and lstat (x) by using ggplot()
ggplot(Boston, aes(lstat, medv))+
 geom_point()

# Fit a simple linear model 
fit <- lm(medv ~ lstat, data = Boston)

# Analysis of the residuals
par(mfrow=c(2,2))
plot(fit)

# Fit a linear model considering a quadratic term for lstat
# Two ways to do this:
fit.poly2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
fit.poly2 <- lm(medv ~ poly(lstat, 2, raw = TRUE), data = Boston)

# Analysis of the residuals
plot(fit.poly2)

# Fit a linear model considering a polynomial of degree 5 for lstat
fit.poly5 <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = Boston)

# Plot the fitted curve by using ggplot
ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))+
 stat_smooth(method = lm, formula = y ~ x, col = "red") +
 stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE), col = "green") 

# Now we consider splines, namely B-splines
# We fix the (internal) knots at the first, second and third quartiles
# By default bs() produces a cubic spline basis (degree = 3)
knots <- quantile(Boston$lstat)[2:4]
fit.spline <- lm(medv ~ bs(lstat, knots=knots), data=Boston)

summary(fit.spline)
## 
## Call:
## lm(formula = medv ~ bs(lstat, knots = knots), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8071  -3.1502  -0.7389   2.1076  26.9529 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 50.628      2.582  19.608  < 2e-16 ***
## bs(lstat, knots = knots)1  -13.682      3.886  -3.521  0.00047 ***
## bs(lstat, knots = knots)2  -26.684      2.449 -10.894  < 2e-16 ***
## bs(lstat, knots = knots)3  -28.416      2.917  -9.743  < 2e-16 ***
## bs(lstat, knots = knots)4  -40.092      3.050 -13.144  < 2e-16 ***
## bs(lstat, knots = knots)5  -39.718      4.212  -9.431  < 2e-16 ***
## bs(lstat, knots = knots)6  -38.484      4.134  -9.308  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.206 on 499 degrees of freedom
## Multiple R-squared:  0.6833, Adjusted R-squared:  0.6795 
## F-statistic: 179.5 on 6 and 499 DF,  p-value: < 2.2e-16
# Similar results can be obtained by specifying the degrees of freedom
# equal to 6 as an alternative to specifying the knots
fit.spline <- lm(medv ~ bs(lstat, df = 6), data=Boston)

summary(fit.spline)
## 
## Call:
## lm(formula = medv ~ bs(lstat, df = 6), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8071  -3.1502  -0.7389   2.1076  26.9529 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          50.628      2.582  19.608  < 2e-16 ***
## bs(lstat, df = 6)1  -13.682      3.886  -3.521  0.00047 ***
## bs(lstat, df = 6)2  -26.684      2.449 -10.894  < 2e-16 ***
## bs(lstat, df = 6)3  -28.416      2.917  -9.743  < 2e-16 ***
## bs(lstat, df = 6)4  -40.092      3.050 -13.144  < 2e-16 ***
## bs(lstat, df = 6)5  -39.718      4.212  -9.431  < 2e-16 ***
## bs(lstat, df = 6)6  -38.484      4.134  -9.308  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.206 on 499 degrees of freedom
## Multiple R-squared:  0.6833, Adjusted R-squared:  0.6795 
## F-statistic: 179.5 on 6 and 499 DF,  p-value: < 2.2e-16
# Plot the fitted lines comparing:
# model 1: polynomial of degree = 3 (BLUE)
# model 2: cubic B-splines with df = 6 (RED)
# model 3: cubic B-splines with df = 100 (GREEN)

ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ poly(x, 3, raw = TRUE) , se = FALSE)+
 stat_smooth(method = lm, formula = y ~ bs(x, df = 6), col = "red", se = FALSE) +
 stat_smooth(method = lm, formula = y ~ bs(x, df = 100), col = "green", se = FALSE ) 

The next plot compares three spline functions with one knot at the median of the predictor and different spline degrees.

# Plot the fitted lines comparing three different specifications of
# the spline degree (consider one knot at the median):
# model 1: cubic B-splines (BLUE)
# model 2: quadratic B-splines (RED)
# model 3: linear B-splines (GREEN)
knot <- knots[2]
# knot <- median(Boston$lstat)
ggplot(Boston, aes(lstat, medv)) + 
 geom_point()+
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot), se = FALSE)+
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 2), col = "red", se = FALSE) +
 stat_smooth(method = lm, formula = y ~ bs(x, knots = knot, degree = 1), col = "green", se = FALSE ) 

Generalized additive models (GAM)

A generalized additive model (GAM) is a generalized linear model (GLM) in which the linear predictor is given by a user-specified sum of smooth functions of the covariates, plus a conventional parametric component. Given \(\mu_i=E[Y_i]\), a simple example is:

\[\log(\mu_i)= \beta_0+\sum_{j=1}^{p}s_j(x_{ij}), \ i=1,\ldots,n,\] where the dependent variable \(Y_{i} \sim \mathsf{Gamma}(\mu_i, \phi)\) and the \(s_j\) are smooth functions of the covariates \(x_j\). GAMs are estimated by maximizing a penalized log-likelihood of the form

\[ l(\boldsymbol{\beta}, \boldsymbol{s})-\lambda R(\boldsymbol{s}),\]

where \(R(\boldsymbol{s})\) is a measure of roughness.
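
A common choice for the roughness measure, assumed here just for concreteness, is the integrated squared second derivative of each smooth, with one smoothing parameter per term:

\[ l(\boldsymbol{\beta}, \boldsymbol{s})-\sum_{j=1}^{p}\lambda_j \int s_j''(x)^2\,dx, \]

where each \(\lambda_j \ge 0\) controls the trade-off between goodness of fit and smoothness of \(s_j\): \(\lambda_j \to \infty\) forces \(s_j\) towards a straight line, while \(\lambda_j = 0\) leaves it unpenalized.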

Trees dataset

As an example, consider the simple dataset trees, which contains measurements of the girth, height and volume of timber in 31 felled black cherry trees. Note that girth is the diameter of the tree (in inches) measured at 4 ft 6 in above the ground. First of all, we use a pairs plot (or chart.Correlation) to check for possible correlations between the three variables.

library(mgcv)
library(PerformanceAnalytics)
# Consider the tree dataset
data(trees)
# Look at the data
chart.Correlation(trees)

# Otherwise by using pairs
# pairs(trees)
# pairs(trees, panel = panel.smooth)

# Customize the diagonal panel (you can also customize the upper and lower panels)
panel.hist <- function(x, ...){
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts
    y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "grey", ...)
}

pairs(trees, panel = panel.smooth, main = "trees data",diag.panel = panel.hist)

The model

To give a general idea of the flexibility of GAMs, we start with one predictor. We fit a simple generalized linear model (GLM) and a GAM, both with \(Y_{i} \sim \mathsf{Gamma}(\mu_i, \phi)\), such that:

\[\begin{align*} \log(\mu_i)= & \beta_0+\beta_1 \mathsf{Height}_i\\ \log(\mu_i)= & \beta_0+ s_1(\mathsf{Height}_i), \end{align*}\]

for \(i=1,\ldots,n\). The question we pose is whether the GLM structure is good enough to fit these data. We start by printing the output of the GAM model; then we plot the predicted values for both models.

# Compare a GLM with a GAM considering a single predictor (Height)
# and the outcome (Volume) distributed according to a Gamma
glm.1 <- glm(Volume ~ Height, data = trees,
             family = Gamma(link = "log")) 
gam.1 <- gam(Volume ~ s(Height), data = trees,
             family = Gamma(link = "log")) 

# Look at the summary
summary(glm.1)
## 
## Call:
## glm(formula = Volume ~ Height, family = Gamma(link = "log"), 
##     data = trees)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6719  -0.3199  -0.1358   0.3714   0.5801  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.85467    0.88028  -0.971     0.34    
## Height       0.05532    0.01154   4.792 4.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1622999)
## 
##     Null deviance: 8.3172  on 30  degrees of freedom
## Residual deviance: 4.7723  on 29  degrees of freedom
## AIC: 239.67
## 
## Number of Fisher Scoring iterations: 4
summary(gam.1)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Volume ~ s(Height)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.34970    0.07236   46.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value    
## s(Height)   1      1 22.97 4.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.348   Deviance explained = 42.6%
## GCV = 0.17591  Scale est. = 0.1623    n = 31
# Look at the coefficients of gam.1
coef(gam.1)
##   (Intercept)   s(Height).1   s(Height).2   s(Height).3   s(Height).4 
##  3.349704e+00 -1.356217e-07  2.472441e-09 -1.396796e-09 -6.612916e-08 
##   s(Height).5   s(Height).6   s(Height).7   s(Height).8   s(Height).9 
## -1.861406e-08 -1.725038e-08  8.662929e-09 -8.462102e-08  3.467615e-01
# Plot the predicted values for both models
par(mfrow=c(1,1))
with(trees, plot(Height, Volume, xlab = "Height", ylab = "Observed and predicted values"))
points(trees$Height, glm.1$fitted.values, pch=19, col="blue")
points(trees$Height, gam.1$fitted.values, pch=19, col="red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

Diagnostic plots involve the representation of the smooth function and of the partial residuals, defined as \[\hat\epsilon^{part}_{ij}=\hat s_j(x_{ij})+\hat\epsilon_i^{P},\] where the \(\hat\epsilon^{P}_i\) are the Pearson residuals of the model. Looking at this plot we are interested in detecting non-linearity or wiggly behaviour of the smooth function, and in checking whether the partial residuals are evenly distributed around it.

# Diagnostic plot: represents the smooth function and the partial residuals
plot(gam.1,residuals=TRUE,pch=19)

# The points represented in the plot are the partial residuals:
pr_data <- cbind(trees$Height,
                 gam.1$residuals +
                   gam(Volume ~ s(Height), data = trees,
                       family = Gamma(link = "log"),
                       fit = FALSE)$X[, 2:10] %*% coef(gam.1)[2:10])

From the joint analysis of the summary output (note the value edf = 1), the prediction plots and the final plot displaying \(s(\mathsf{Height})\) versus \(\mathsf{Height}\), we may conclude that this simple single-predictor relationship is well explained by a GLM.
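
As an optional further check (not shown in the output above), mgcv provides gam.check(), which prints convergence information and a basis-dimension (k) check and draws residual diagnostic plots; here it can be used to verify whether the default basis dimension for s(Height) is adequate. A minimal sketch:

# Optional further diagnostics for gam.1: residual plots and basis dimension check
par(mfrow = c(2, 2))
gam.check(gam.1, pch = 19, cex = 0.5)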

Adding another predictor

Consider now adding the predictor \(\mathsf{Girth}\), keeping the same distributional assumption as before, \(Y_{i}=\mathsf{Volume}_i\sim \mathsf{Gamma}(\mu_i, \phi)\), so that:

\[\log(\mu_i)=\beta_0+s_1(\mathsf{Height}_i)+s_2(\mathsf{Girth}_i).\] As before, we fit the GLM and the GAM. We plot the fitted values of the two models together and then analyse the smooth terms \(s_1(\mathsf{Height})\) and \(s_2(\mathsf{Girth})\) separately.

# Adding another predictor to the model (compare GLM with GAM)
glm.2 <- glm(Volume ~ Height + Girth, data = trees,
             family = Gamma(link = "log")) 
gam.2 <- gam(Volume ~ s(Height) + s(Girth), data = trees,
             family = Gamma(link = "log")) 
summary(gam.2)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Volume ~ s(Height) + s(Girth)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.27570    0.01492   219.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df      F  p-value    
## s(Height) 1.000  1.000  31.32 7.07e-06 ***
## s(Girth)  2.422  3.044 219.28  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.973   Deviance explained = 97.8%
## GCV = 0.0080824  Scale est. = 0.006899  n = 31
# Plot the predicted values (w.r.t. the predictors) for both models
# Height
with(trees, plot(Height, Volume, xlab = "Height", ylab = "Observed and predicted values"))
points(trees$Height, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Height, gam.2$fitted.values, pch = 19, col = "red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

# Girth
with(trees, plot(Girth, Volume, xlab = "Girth", ylab = "Observed and predicted values"))
points(trees$Girth, glm.2$fitted.values, pch = 19, col = "blue")
points(trees$Girth, gam.2$fitted.values, pch = 19, col = "red", cex=0.5)
legend("topleft", c("GLM", "GAM"), pch = 19, col = c("blue", "red"))

# Diagnostic plot:
plot(gam.2, residuals = TRUE, pch = 19, pages = 1)

Options to visualize the fitted surface are collected in the function vis.gam, which offers perspective and contour plots for a fitted gam model.

# Fitted surface by using vis.gam() function
par(mfrow=c(1,2))
vis.gam(gam.2, theta = -45, color = "terrain", type = "response")
vis.gam(gam.2, theta = -45, color = "terrain", type = "link")

# Contour plot by using vis.gam() function
par(mfrow=c(1,2))
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "response")
vis.gam(gam.2, plot.type = "contour", color = "terrain", type = "link")

The output reports an edf equal to 2.422 for the \(\mathsf{Girth}\) predictor, suggesting that a straight line may be inadequate for this term. As a further check, we may compute the AIC for the four proposed models:

# Compare the fitted models by using AIC
AIC(glm.1, gam.1, glm.2, gam.2) 
##             df      AIC
## glm.1 3.000000 239.6724
## gam.1 3.000009 239.7580
## glm.2 4.000000 151.0081
## gam.2 5.422254 143.1907

The AICs of the single-predictor models essentially coincide. The AIC decreases dramatically when the second predictor is included, with the GAM favoured over the GLM.

An issue we should consider is the so-called optimal degree of smoothness. With the argument sp of the gam function we can manually control the degree of smoothness of each smooth function included in the model.

# Fit the model with unpenalized regression splines (equivalently, smoothing parameters set to zero)
gam.2.unp <- gam(Volume ~ s(Height, fx = TRUE) + s(Girth, fx = TRUE), data = trees,
             family = Gamma(link = "log")) 
gam.2.unp$sp
## named numeric(0)
# Alternatively 
gam.2.unp <- gam(Volume ~ s(Height, sp = 0) + s(Girth, sp = 0), data = trees,
             family = Gamma(link = "log")) 
# Diagnostic plot
plot(gam.2.unp, residuals = TRUE, pages = 1, pch = 19)

# Fitted surface 
par(mfrow=c(1,1))
vis.gam(gam.2.unp, theta = -45, type = "link", color  = "terrain")

In practice, \(\lambda\) is usually estimated by one of several methods, such as CV, AIC or BIC. By default, the gam function uses method = "GCV.Cp" with scale = 0, meaning that the \(\lambda\)s are estimated via GCV, or via UBRE (essentially AIC) when the family is binomial or Poisson. Let's take an explicit look at how \(\lambda\) is selected.
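
For reference, in the Gaussian case the GCV score being minimized has the form below; mgcv uses a deviance-based generalization of the same idea, with the effective degrees of freedom playing the role of the number of parameters:

\[ \mathcal{V}(\lambda) = \frac{n\sum_{i=1}^{n}(y_i-\hat{\mu}_i)^2}{\left[n-\mathrm{tr}(\mathbf{A}_\lambda)\right]^2}, \]

where \(\mathbf{A}_\lambda\) is the influence (hat) matrix of the penalized fit and \(\mathrm{tr}(\mathbf{A}_\lambda)\) its effective degrees of freedom.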

# Look at the smoothing parameters of gam.2
sp <- gam.2$sp
sp
##    s(Height)     s(Girth) 
## 1.663969e+05 3.427109e-01
# Let's look at the AIC/GCV and how the smoothing parameter is selected
# Consider a grid of values for (lambda1, lambda2) around the estimated sp
lamb1 <- c(166396,166397,166398)
lamb2 <- seq(0.01,1, by = 0.0001)

sp.grid <- expand.grid(lamb1, lamb2 )
n.tuning <- nrow(sp.grid)


minus2loglik <- rep(NA, n.tuning) 
edf <- rep(NA, n.tuning) 
aic <- rep(NA, n.tuning) 
gcv <- rep(NA, n.tuning) 

for(i in 1:n.tuning){
 gamobj <- gam(Volume ~ s(Height) + s(Girth), data = trees,
             family = Gamma(link = "log"), sp = sp.grid[i,]) 
 minus2loglik[i] <- -2*logLik(gamobj)
 edf[i] <- sum(gamobj$edf) + 1
 aic[i] <- AIC(gamobj)
 gcv[i] <- gamobj$gcv.ubre
}

# Some Plots 
par(mfrow=c(2,2))
plot(minus2loglik, type = "b", main = "-2 log Likelihood")
plot(edf, type = "b", main = "effective number of parameters")
plot( aic, type = "b", main = "AIC")
plot(gcv, type = "b", main = "GCV")

# Find the minimum
opt.sp <- sp.grid[which.min(gcv),]
opt.sp 
##        Var1   Var2
## 9984 166398 0.3427
sp
##    s(Height)     s(Girth) 
## 1.663969e+05 3.427109e-01
gam.2.opt <- gam(Volume ~ s(Height) + s(Girth),
                 data = trees, family = Gamma(link = "log"), sp = opt.sp) 
# Compare the AIC (gam.2 vs gam.2.opt)
AIC(gam.2, gam.2.opt)
##                 df      AIC
## gam.2     5.422254 143.1907
## gam.2.opt 5.422274 143.1907

It’s your turn!

1- Simulate some Poisson data with the gamSim function contained in the mgcv package (use eg = 1, the default, for the Gu and Wahba 4 univariate term example, and set scale = 0.1); a possible starting point is sketched after this list.

2- Fit a GAM and print the results. Interpret the fit.

3- Produce a plot of \(s(x_j)\) against each \(x_j\), and comment. Would you drop any covariates?

4- Compare the AIC of your final GAM with that of a GLM.
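
A possible starting point for step 1, assuming a sample size of n = 400 (an arbitrary choice) and the Poisson option of gamSim:

# Possible starting point for step 1: simulate Poisson data (Gu & Wahba example, eg = 1)
set.seed(123)   # arbitrary seed, for reproducibility
sim <- gamSim(eg = 1, n = 400, dist = "poisson", scale = 0.1)
str(sim)        # response y and covariates x0, x1, x2, x3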

Tree-based classification

We consider the example of detecting email spam. The data are available from the UCI repository of machine learning databases (http://www.ics.uci.edu/~mlearn/MLRepository.html) and collect information on 4601 email items, 1813 of which are classified as spam. 57 explanatory variables describe several characteristics of the emails. Following the example in Data Analysis and Graphics Using R, we will consider only 6 of them, mostly related to the frequency of specific words and characters in the email; they are listed in the comments of the code below.

Using these 6 explanatory variables, we want to build a decision tree model that correctly classifies each email as spam (y in the binary outcome variable yesno) or non-spam (n).

library(reshape)
library(rpart)
library(rpart.plot)

# Load the dataset spambase
spam <- read.table("spambase.data", sep = ",")
# Take the columns
# 58: "yesno" 
# 57: "crl.tot"
# 53: "dollar"
# 52: "bang"
# 24: "money"
# 23: "n000"
#  1: "make" 
spam <- spam[, c(58,57,53,52,24,23,1)]

# Give the names to the columns
colnames(spam) <- c("yesno", "crl.tot", "dollar", "bang", "money", "n000", "make")

# Convert the yesno variable into a factor and assign "n" to 0 and "y" to 1
spam$yesno <- factor(spam$yesno, levels = c(0,1))
levels(spam$yesno) <- c("n","y")

# Produce the boxplots
par(mfrow=c(2,3))
for(i in 2:7){
 boxplot(spam[,i] ~ yesno, data = spam, ylab=colnames(spam)[i])
}

# Fit a glm 
glm.spam <- glm(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                data = spam, family = binomial("logit") )
summary(glm.spam)
## 
## Call:
## glm(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, family = binomial("logit"), data = spam)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4904  -0.6153  -0.5816   0.4439   1.9323  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.700e+00  5.361e-02 -31.717  < 2e-16 ***
## crl.tot      6.917e-04  9.745e-05   7.098 1.27e-12 ***
## dollar       8.013e+00  6.175e-01  12.976  < 2e-16 ***
## bang         1.572e+00  1.115e-01  14.096  < 2e-16 ***
## money        2.142e+00  2.418e-01   8.859  < 2e-16 ***
## n000         4.149e+00  4.371e-01   9.492  < 2e-16 ***
## make         1.698e-02  1.434e-01   0.118    0.906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6170.2  on 4600  degrees of freedom
## Residual deviance: 4058.8  on 4594  degrees of freedom
## AIC: 4072.8
## 
## Number of Fisher Scoring iterations: 16

The tree can be fitted using the function rpart from the rpart package. The method = "class" option is selected by default when the outcome variable is a factor.

# Consider a classification tree
spam.rpart <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                    data = spam, method = "class")
# Plot the tree
par(mfrow=c(1,1))
plot(spam.rpart)
text(spam.rpart)

The summary of the fitted decision tree can be displayed using the function printcp.

# See the summary of the decision  tree
printcp(spam.rpart)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class")
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar 
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.476558      0   1.00000 1.00000 0.018282
## 2 0.075565      1   0.52344 0.54606 0.015375
## 3 0.011583      3   0.37231 0.38169 0.013374
## 4 0.010480      4   0.36073 0.38334 0.013398
## 5 0.010000      5   0.35025 0.37838 0.013326

The complexity parameter CP is a proxy for the number of splits. To avoid overly complex trees, the reduction in lack of fit from each additional split is evaluated against an increasing cost. When the cost outweighs the reduction, the algorithm stops.

A small complexity parameter CP leads to a large tree, and a large CP leads to a small tree. Let's try decreasing the default CP value for our model.
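
For reference, the pruning criterion used by rpart can be written, roughly in the notation of the rpart vignette, as a cost-complexity measure in which cp multiplies the number of splits:

\[ R_{cp}(T) = R(T) + cp \cdot |T| \cdot R(T_1), \]

where \(R(T)\) is the risk of tree \(T\), \(|T|\) its number of splits and \(T_1\) the root tree with no splits; an additional split is worthwhile only if it decreases this overall cost.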

# Fix cp = 0
spam.rpart.cp0 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make ,
                    data = spam, method = "class", cp=0)
# See the summary of the decision  tree
printcp(spam.rpart.cp0)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  make    money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##            CP nsplit rel error  xerror     xstd
## 1  4.7656e-01      0   1.00000 1.00000 0.018282
## 2  7.5565e-02      1   0.52344 0.54275 0.015341
## 3  1.1583e-02      3   0.37231 0.38445 0.013414
## 4  1.0480e-02      4   0.36073 0.38389 0.013406
## 5  6.3431e-03      5   0.35025 0.37066 0.013213
## 6  5.5157e-03     10   0.31660 0.35301 0.012947
## 7  4.4126e-03     11   0.31109 0.34473 0.012819
## 8  3.8610e-03     12   0.30667 0.33205 0.012617
## 9  2.7579e-03     16   0.29123 0.32377 0.012482
## 10 2.2063e-03     17   0.28847 0.32598 0.012518
## 11 1.9305e-03     18   0.28627 0.32598 0.012518
## 12 1.6547e-03     20   0.28240 0.32763 0.012545
## 13 9.1929e-04     25   0.27413 0.32488 0.012500
## 14 8.2736e-04     29   0.26917 0.32653 0.012527
## 15 5.5157e-04     46   0.25427 0.32708 0.012536
## 16 3.3094e-04     48   0.25317 0.32763 0.012545
## 17 2.7579e-04     53   0.25152 0.33094 0.012599
## 18 1.8386e-04     61   0.24931 0.33094 0.012599
## 19 6.8946e-05     64   0.24876 0.33536 0.012670
## 20 0.0000e+00     72   0.24821 0.33646 0.012688

The relative error shown in the summary decreases with every additional split, so it is not useful for evaluating the predictive accuracy of the model, while the xerror (which stands for cross-validated relative error) reaches a minimum. Since the xerror is computed by default with a 10-fold cross-validation procedure, its values change slightly every time we fit a new model; the relative error remains exactly the same.

# See the plot of the decision  tree
plotcp(spam.rpart.cp0)

Now that we have a very large (overfitted) tree, what is the best number of splits at which to prune it? Changes in the xerror are so small that running the model again would probably lead to a different choice of the number of splits if the choice were based on the tree with the absolute minimum xerror. To reduce instability in this choice we can use the one-standard-deviation rule. The horizontal dashed line in the plot shows the minimum xerror plus one standard deviation; choosing the smallest tree whose xerror is less than or equal to this value leads to a more conservative choice, i.e., the predictive power will on average be slightly less than optimal.

# Select the best split
best.cp <- spam.rpart.cp0$cptable[which.min(spam.rpart.cp0$cptable[,"xerror"]),]
best.cp
##          CP      nsplit   rel error      xerror        xstd 
##  0.00275786 16.00000000  0.29123001  0.32377275  0.01248199
# Select the best split according to standard deviation rule
sd.rule<- best.cp["xerror"] + best.cp["xstd"]
cptable.sd.rule <- spam.rpart.cp0$cptable[spam.rpart.cp0$cptable[,"xerror"]<=sd.rule,]
best.cp.sd <- cptable.sd.rule[which.min(cptable.sd.rule[,"nsplit"]),]


# Plot the pruned tree 
# According to best.cp
tree.pruned <-prune(spam.rpart.cp0, cp=best.cp[1])
rpart.plot(tree.pruned, extra=106,box.palette="GnBu", branch.lty = 3,
           shadow.col = "gray", nn=TRUE)

printcp(tree.pruned)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.4765582      0   1.00000 1.00000 0.018282
## 2 0.0755654      1   0.52344 0.54275 0.015341
## 3 0.0115830      3   0.37231 0.38445 0.013414
## 4 0.0104799      4   0.36073 0.38389 0.013406
## 5 0.0063431      5   0.35025 0.37066 0.013213
## 6 0.0055157     10   0.31660 0.35301 0.012947
## 7 0.0044126     11   0.31109 0.34473 0.012819
## 8 0.0038610     12   0.30667 0.33205 0.012617
## 9 0.0027579     16   0.29123 0.32377 0.012482
# According to the standard deviation rule
tree.pruned.sd <-prune(spam.rpart.cp0, cp=best.cp.sd[1])
printcp(tree.pruned.sd)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.4765582      0   1.00000 1.00000 0.018282
## 2 0.0755654      1   0.52344 0.54275 0.015341
## 3 0.0115830      3   0.37231 0.38445 0.013414
## 4 0.0104799      4   0.36073 0.38389 0.013406
## 5 0.0063431      5   0.35025 0.37066 0.013213
## 6 0.0055157     10   0.31660 0.35301 0.012947
## 7 0.0044126     11   0.31109 0.34473 0.012819
## 8 0.0038610     12   0.30667 0.33205 0.012617
rpart.plot(tree.pruned.sd, extra=106,box.palette="GnBu", branch.lty = 3,
           shadow.col = "gray", nn=TRUE)

The absolute cross-validation error for this last model is \(0.33756 \times 0.39404 = 0.133\), so the model achieves an error rate of about 13.3%. (The value 0.33756 may differ from run to run, since it depends on the cross-validation.)
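
The same quantity can be recomputed directly from the cptable of the pruned tree (a small sketch; the xerror value will again vary slightly across cross-validation runs):

# Recompute the absolute cross-validated error from the cptable of the pruned tree
root.err <- mean(spam$yesno == "y")       # root node error: 1813/4601
xerr <- tree.pruned.sd$cptable[nrow(tree.pruned.sd$cptable), "xerror"]
xerr * root.err                           # absolute cross-validated error rate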

Trying with randomForest...

library(randomForest)

spam.rf <- randomForest(yesno ~ ., data = spam, importance =TRUE)
print(spam.rf)
## 
## Call:
##  randomForest(formula = yesno ~ ., data = spam, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 11.8%
## Confusion matrix:
##      n    y class.error
## n 2637  151  0.05416069
## y  392 1421  0.21621622
importance(spam.rf)
##                n         y MeanDecreaseAccuracy MeanDecreaseGini
## crl.tot 48.69025  54.53833             72.71228        248.26758
## dollar  56.35331  54.17269             75.03002        421.59759
## bang    93.43421 108.61746            122.00472        585.32208
## money   32.88276  50.29765             51.72064        202.39625
## n000    61.31834  13.18710             65.33372        132.83226
## make    12.59409  21.80521             25.76215         40.42125
varImpPlot(spam.rf, sort=TRUE)
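
As a final usage note, calling predict() on the fitted forest without supplying newdata returns the out-of-bag predictions, which can be inspected as class probabilities (a small sketch):

# Out-of-bag class probabilities (no newdata supplied, so OOB votes are returned)
head(predict(spam.rf, type = "prob"))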