### Lab 4 (28 ottobre 2024)

## 1. simulo dati da una cubic spline
library(splines)

set.seed(101)
n <- 150
beta0 <- 1
beta1 <- 0.7
beta2 <- -0.1
beta3 <- 5
beta4 <- -1.5
beta5 <- 0.1
beta6 <- -0.3
#beta7 <- 0.1
beta <- c(beta0, beta1, beta2, beta3, beta4, beta5, beta6)
x <- runif(n, -1,1)
xlims <- range(x)
x.grid <- seq(xlims[1], xlims[2], length.out = n)

knots <- c(-0.7,0,0.7)      # 3 nodi

# metodo con basi di potenza troncate (truncated power basis)
basis <- matrix(NA, n, 7)   # K + 4 gdl = 7 gdl
basis[,1] <- rep(1,n)
basis[,2] <- x
basis[,3] <- x^2
basis[,4] <- x^3
basis[,5] <- 0*(x<=knots[1])+((x-knots[1])^3)*(x>knots[1]) # truncated basis creation
basis[,6] <- 0*(x<=knots[2])+((x-knots[2])^3)*(x>knots[2])
basis[,7] <- 0*(x<=knots[3])+((x-knots[3])^3)*(x>knots[3])

# metodo con b-splines
basis.2 <- bs(x, knots = knots, intercept = FALSE) # metodo alternativo per le basi (b-spline)
epsilon <- rnorm(n, 0,0.5)

# dati con metodo 1
y <- basis%*%beta + epsilon
# dati con metodo 2
y.2 <-  beta0 + basis.2%*%beta[2:7] + epsilon
par(mfrow=c(1,2))
plot(x,y)
plot(x, y.2)

# uso dati 1
par(mfrow=c(1,1))
plot(x,y)

## 2. Stimo spline cubica, naturale cubica e smoothing splines

#  cubic spline con funzione 'bs()' e nodi specificati
fit <- lm(y ~ bs(x, knots = c( -0.5, 0, 0.5)))  
pred <- predict(fit , newdata = list(x = x.grid), se = T)
lines(x.grid, pred$fit , lwd = 2, col = 4)
lines(x.grid , pred$fit + 2 * pred$se, lty = "dashed", col = 4)
lines(x.grid , pred$fit - 2 * pred$se, lty = "dashed", col = 4)


# cubic spline con funzione 'bs()' e nodi non specificati dall'utente
fit.b <- lm(y ~ bs(x, df = 7, intercept = TRUE))  
pred.b <- predict(fit.b , newdata = list(x = x.grid), se = T)
lines(x.grid, pred.b$fit , lwd = 2, col = "cyan")
lines(x.grid , pred.b$fit + 2 * pred$se, lty = "dashed", col = 4)
lines(x.grid , pred.b$fit - 2 * pred$se, lty = "dashed", col = 4)

#  natural cubic spline con funzione 'ns()' (K gdl)
fit.2 <- lm(y ~ ns(x, df = 3))  
pred.2 <- predict(fit.2 , newdata = list(x = x.grid), se = T)
lines(x.grid, pred.2$fit , lwd = 2, col = 2)
lines(x.grid , pred.2$fit + 2 * pred.2$se, lty = "dashed", col = 2)
lines(x.grid , pred.2$fit - 2 * pred.2$se, lty = "dashed", col = 2)

#  smoothing spline con criterio per lambda
fit.3 <- smooth.spline(x , y , cv = TRUE) 
lines(fit.3 , col = "green", lwd = 2) # 2 modi alternativi

#  smoothing spline con lambda = 0 (molto flessibile)
fit.3bis <- smooth.spline(x , y , lambda = 0 ) 
lines(fit.3bis , col = "darkgreen", lwd = 2)

# smoothing spline con 200 df
fit.4 <- smooth.spline(x , y , df = 200) 
lines(fit.4 , col = "darkolivegreen", lwd = 2)

# smoothing spline con 2 df (molto liscia)
fit.5 <- smooth.spline(x , y , df = 2) 
lines(fit.5 , col = "orange", lwd = 2)


legend(-0.2,-2, lty = 1, lwd =2, col=c(4, "cyan", 2,3, "darkgreen", "darkolivegreen", "orange"), 
       c("Cubic", "Cubic 2", "Natural", expression(paste("Smooth, ", lambda[opt])), 
         expression(paste("Smooth, ", lambda == 0)),
         expression(paste("Smooth, ", df==200)),
         expression(paste("Smooth, ", df==2))),
         cex =0.4)

## Consegna:

# 1. Ripetere punti precedenti usando 
#    i dati generati con b-splines, y.2

# 2. Ripetere punti precedenti usando
#    i dati contenuti in splines_data.csv

# 3. Esercizio con GAM: produrre codice
#    commentare. Trovate prima un esempio
#    con dati 'mcycle' che non serve riportare
#    nella consegna. La consegna è sui dati
#    tree.

## 3. GAM
library(MASS)
library(gam)

# the data measure the acceleration of the
# rider's head, against time, in a simulated motorcycle crash.

# Plot the acceleration against time, and use gam to t a univariate smooth to the data,
# selecting the smoothing parameter by GCV. Plot the resulting smooth, with partial resid-
# uals, but without standard errors.

data("mcycle")
plot(mcycle$times, mcycle$accel)

y <- mcycle$accel
x <- mcycle$times
xlims <- range(x)
x.grid <- seq(xlims[1], xlims[2], length.out = length(y))

fit.gam <- gam::gam(y ~ gam::s(x, 10) )
summary(fit.gam)
pred <- predict(fit.gam, newdata = list(x = x.grid ))
lines(x.grid, pred, col=2, lwd=2)
plot(fit.gam)

# It's possible to overstate the importance of penalization in explaining the improvement of
# the penalized regression spline, relative to the polynomial. Use gam to ret an unpenalized
# thin plate regression spline to the data, with basis dimension the same as that used for
# the polynomial, and again produce a plot for comparison with the previous two results.

# 6. GAM (trees data)
library(mgcv)

glm.2<-glm(Volume ~ Girth + Height, family = Gamma(link=log),
           data=trees)
gam.2 <- mgcv::gam(Volume ~ s(Height) + s(Girth), family=Gamma(link=log),
             data=trees)
summary(gam.2)
plot(gam.2,residuals=TRUE,pch=19)
plot.gam(gam.2,residuals=TRUE,pch=19)

# AIC tra modello GAM e GLM
AIC(glm.2, gam.2)

sp <- gam.2$sp
sp

# optimal level of smoothing
tuning.scale<-c(1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,
                1e3,1e4,1e5) 
scale.exponent <- log10(tuning.scale) 
n.tuning <- length(tuning.scale) 
edf <- rep(NA,n.tuning)  
min2ll <- rep(NA,n.tuning) 
aic <- rep(NA,n.tuning)  


for (i in 1:n.tuning) {   
  gamobj <- gam(Volume ~ s(Height) + s(Girth), family=Gamma(link=log),
                data=trees, sp=tuning.scale[i]*sp) 
  min2ll[i]<--2*logLik(gamobj) 
  edf[i]<-sum(gamobj$edf)+1  
  aic[i]<-AIC(gamobj)
}

par(mfrow=c(2,2)) 
plot(scale.exponent,min2ll,type="b",main="-2 log likelihood") 
plot(scale.exponent,edf,ylim=c(0,70),type="b",main="effective number of parameters")  
plot(scale.exponent,aic,type="b",main="AIC") 

# find the minimum
min.aic<-1e100 
opt.tuning.scale<-NULL 
for (i in 1:n.tuning) { 
  if (aic[i]<min.aic) { 
    min.aic<-aic[i] 
    opt.tuning.scale<-tuning.scale[i] 
  } 
} 
opt.sp<-opt.tuning.scale*sp
opt.sp