---
title: 'Semiparametric regression: spline'
output:
  html_document: default
  pdf_document: default
date: "2023-10-28"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Data


```{r dati}
d=read.table("lidar.dat",header=TRUE)
names(d)=c("x","y")
d=d[sort.list(d$x),]
d$x=(d$x-min(d$x))/(max(d$x)-min(d$x))
```

```{r simulated data}
n=200
x=sort(runif(n,0,1))
x=seq(0,1,length=n)
truef=function(x) sin(2*pi*x)
my=truef(x)
signaltonoise=2

simula=function(signalnoise=signaltonoise){
  y=rnorm(n,my,sd=sd(my)/signalnoise)
  return(data.frame(x=x,y=y,truef=my))
}
d=simula()
plot(d$x,d$y)
```



# Spline (tpb), unrestricted estimation


```{r}
grado=3
nnodi=10


tpb=function(xx,nodi=NULL,nnodi=NULL,degree=3){
  if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)]
  fbase=function(x,k,p) ifelse(x-k>0,(x-k)^p,0)
  X=outer(xx,0:degree,FUN="^")
  X=cbind(X,outer(xx,nodi,FUN=fbase,p=degree))
  return(list(nodi=nodi,degree=degree,X=X))
}

mybase=tpb(d$x,nnodi=5,degree=3)
matplot(d$x,mybase$X,type="l",lwd=2)
rug(mybase$nodi,col="red")
```
Estimate the spline

```{r}
fit=lm(d$y~mybase$X)
plot(d$x,d$y)
lines(d$x,predict(fit))

y.hat=mybase$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y
points(d$x,y.hat,pch="|",col="red")
```
Evaluate the spline on a new set of points

```{r}
newx=seq(0,1,length=10)
mybaseforpred=tpb(newx,nodi=mybase$nodi,degree=mybase$degree)

plot(d$x,d$y)
newy.hat=mybaseforpred$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y
points(newx,newy.hat,pch=20,col="red")
lines(d$x,y.hat,col="green")
```
# Spline (tpb), penalized estimation

Define a matrix for a ridge type penalization
```{r}
D=diag(c(rep(0,mybase$degree+1),rep(1,length(mybase$nodi))))
```

then the new $\hat{y}$ are

```{r}
lambda=0.000001
y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y
plot(d$x,d$y)
lines(d$x,y.hat,col="green")
lines(d$x,y.hat.pen,col="red")

```
```{r}
mybase=tpb(d$x,nnodi=25,degree=3)
y.hat=mybase$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y
D=diag(c(rep(0,mybase$degree+1),rep(1,length(mybase$nodi))))
lambda=0.000001
y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y
plot(d$x,d$y)
lines(d$x,y.hat,col="green",lwd=2)
lines(d$x,y.hat.pen,col="red",lwd=2)

lambda=0.000000001
y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y
lines(d$x,y.hat.pen,col="blue",lwd=2)



```



```{r}
basis.calc=function(xx,nodi=NULL,nnodi=NULL,degree=3){
  if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)]
  D=diag(c(rep(0,degree+1),rep(1,length(nodi))))
  fbase=function(x,k,p) ifelse(x-k>0,(x-k)^p,0)
  X=outer(xx,0:degree,FUN="^")
  X=cbind(X,outer(xx,nodi,FUN=fbase,p=degree))
  return(list(nodi=nodi,degree=degree,D=D,X=X))
}


splinetpb=function(x,xx,yy,h,nodi=NULL,nnodi=NULL,degree=3){
  mybase=basis.calc(xx,nodi=nodi,nnodi=nnodi,degree=degree)
  mybase.new=basis.calc(x,nodi=nodi,nnodi=nnodi,degree=degree)
  temp=solve(t(mybase$X)%*%mybase$X + h*mybase$D) %*% t(mybase$X)
  dof=sum(diag(mybase$X %*% temp))
  #Xnew=cbind(Xnew,outer(x,nodi,FUN=fbase,p=grado))
  coefs= temp %*% yy
  y=mybase.new$X %*% coefs
  return(list(dof=dof,nodi=nodi,D=D,y=y,base=mybase.new$X,coefs=coefs))
}
```



Estimate and plot the smooth estimate for a couple of penalizations


```{r}
plot(d$x,d$y)
lines(seq(0,1,length=100),splinetpb(seq(0,1,length=100),d$x,d$y,0.0001,nnodi=20)$y,col="red",lwd=2)
lines(seq(0,1,length=100),splinetpb(seq(0,1,length=100),d$x,d$y,10^(-10),nnodi=20)$y,col="green",lwd=2)
```

```{r}
fit1=splinetpb(d$x,d$x,d$y,4,nnodi=20)
fit2=splinetpb(d$x,d$x,d$y,10^(-10),nnodi=20)
par(mar=c(2,2,0,0))
layout(matrix(1:4,byrow=FALSE,ncol=2))
plot(d$x,d$y)
matlines(d$x,cbind(fit1$y,fit1$base*matrix(fit1$coefs,nrow=nrow(fit1$base),ncol=ncol(fit1$base),byrow=TRUE)),type="l",lty=1,lwd=2,xlab="",ylab="")
lines(d$x,fit1$y,col="black",lwd=3)
matplot(fit1$base,type="l",lty=1,lwd=2,xlab="",ylab="")

plot(d$x,d$y)
matlines(d$x,cbind(fit2$y,fit2$base*matrix(fit2$coefs,nrow=nrow(fit2$base),ncol=ncol(fit2$base),byrow=TRUE)),type="l",lty=1,lwd=2,xlab="",ylab="")
lines(d$x,fit2$y,col="black",lwd=3)
matplot(fit2$base,type="l",lty=1,lwd=2,xlab="",ylab="")
```

```{r}
basis.calc=function(xx,nodi=NULL,nnodi=NULL,degree=3){
  if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)]
  X=splines::bs(xx,knots=nodi,degree=degree)
  D=diag(rep(1,ncol(X)))
  return(list(nodi=nodi,degree=degree,D=D,X=X))
}
```


# Bias and variance via MC simulation


Perform a MC simulation to evaluate bias and variance of the estimator for a given bandwith as a function of $x$

```{r}
smoother=function(x,xx,yy,h,nodi=NULL,nnodi=NULL,degree=1){
  splinetpb(x,xx,yy,h,nodi=seq(0,1,length=27)[-c(1,27)],degree=degree)$y
}
h.vec=c(100,10,1,0.1,0.01,0.001,0.0001,0.00001)
```


```{r}

plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=10,nnodi=25),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=h.vec[floor(length(h.vec)*.25)],nnodi=25),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=h.vec[floor(length(h.vec)*.75)],nnodi=25),add=TRUE,lwd=2,col="darkgreen")
#legend(0,-0.6,legend=c("h=0.1","h=0.3"),lwd=2,col=c("red","darkgreen"))
```


```{r}
M=1000
hfix=h.vec[floor(length(h.vec)*.75)]
ff.hat=sapply(rep(1,M),FUN=function(x) {d.i=simula()
  smoother(d.i$x,d.i$x,d.i$y,h=hfix)})
biasf.hat=(apply(ff.hat,1,mean)-my)
varf.hat=apply(ff.hat,1,var)
#matplot(x,cbind(biasf.hat^2,varf.hat),col=c("blue","red"),pch=20)
plot(x,biasf.hat^2,pch=20)
plot(x,varf.hat,pch=20)
plot(x,biasf.hat^2+varf.hat,pch=20)
```

Perform a MC simulation to evaluate the overall bias and variance of the estimator (not as a function of $x$) for a range of bandwiths

```{r}
M=1000


onesim=function(z){
  d.i=simula()
  sapply(h.vec,FUN=function(x) smoother(d.i$x,d.i$x,d.i$y,x))
}
#onesim(0.1)
allsim=sapply(1:M,onesim,simplify="array")

var2=function(x) mean(x^2)-mean(x)^2

f.mean=apply(allsim,1:2,mean)
f.bias=apply(f.mean,2,FUN=function(x) x-my)
f.variance=apply(allsim,1:2,var2)

bias.overall=apply(abs(f.bias)^2,2,mean)
variance.overall=apply(f.variance,2,mean)
mse.overall=bias.overall+variance.overall
# # alternative calculation
# all.error=apply(allsim,2:3,FUN=function(x) x-my)
# f.mse.all=apply(all.error,1:2,FUN=function(x) mean(x^2))
# f.bias2=apply(all.error,1:2,mean)
# f.var2=apply(all.error,1:2,FUN=var2)

matplot(log(h.vec),cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2))
abline(v=h.vec[which.min(mse.overall)])



```


# Cross validation

Use $v-$fold cross validation to estimate the mean square error for the range of bandwiths and compare the estimate with the mse obtained via MC.

```{r}
V=n #number of folds
#d$fold=sample(1:V,n,replace=TRUE) #sample(c(rep(1:V,11),V))
if (n%%V>0) d$fold=sample(c(rep(1:V,n %/% V),1:(n%%V))) else d$fold=sample(rep(1:V,n %/% V))

msefold=function(v,h) {
  test=subset(d,fold==v)
  train=subset(d,fold!=v)
 # browser()
  y.hat=smoother(test$x,train$x,train$y,h)
  mean((test$y-y.hat)^2,na.rm=TRUE)
}
msefold(2,0.2)
rmse=mean(sapply(1:V,msefold,h=.5))
```
Ripetiamo l'operazione per tutte le bandwith


```{r}
msefold.v=Vectorize(msefold)
allmse=outer(1:V,h.vec,FUN=msefold.v)
rmse=apply(allmse,2,mean)

```

```{r}
matplot(log(h.vec),cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="")
abline(v=h.vec[which.min(mse.overall)])
rmse1=rmse-min(rmse)+min(bias.overall+variance.overall)
points(log(h.vec),rmse1)
points(log(h.vec)[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red")

```

The optimal estimate according to CV is

```{r}

plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(rmse)]),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(mse.overall)]),add=TRUE,lwd=2,col="darkgreen")
```