--- title: 'Semiparametric regression: spline' output: html_document: default pdf_document: default date: "2023-10-28" --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Data ```{r dati} d=read.table("lidar.dat",header=TRUE) names(d)=c("x","y") d=d[sort.list(d$x),] d$x=(d$x-min(d$x))/(max(d$x)-min(d$x)) ``` ```{r simulated data} n=200 x=sort(runif(n,0,1)) x=seq(0,1,length=n) truef=function(x) sin(2*pi*x) my=truef(x) signaltonoise=2 simula=function(signalnoise=signaltonoise){ y=rnorm(n,my,sd=sd(my)/signalnoise) return(data.frame(x=x,y=y,truef=my)) } d=simula() plot(d$x,d$y) ``` # Spline (tpb), unrestricted estimation ```{r} grado=3 nnodi=10 tpb=function(xx,nodi=NULL,nnodi=NULL,degree=3){ if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)] fbase=function(x,k,p) ifelse(x-k>0,(x-k)^p,0) X=outer(xx,0:degree,FUN="^") X=cbind(X,outer(xx,nodi,FUN=fbase,p=degree)) return(list(nodi=nodi,degree=degree,X=X)) } mybase=tpb(d$x,nnodi=5,degree=3) matplot(d$x,mybase$X,type="l",lwd=2) rug(mybase$nodi,col="red") ``` Estimate the spline ```{r} fit=lm(d$y~mybase$X) plot(d$x,d$y) lines(d$x,predict(fit)) y.hat=mybase$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y points(d$x,y.hat,pch="|",col="red") ``` Evaluate the spline on a new set of points ```{r} newx=seq(0,1,length=10) mybaseforpred=tpb(newx,nodi=mybase$nodi,degree=mybase$degree) plot(d$x,d$y) newy.hat=mybaseforpred$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y points(newx,newy.hat,pch=20,col="red") lines(d$x,y.hat,col="green") ``` # Spline (tpb), penalized estimation Define a matrix for a ridge type penalization ```{r} D=diag(c(rep(0,mybase$degree+1),rep(1,length(mybase$nodi)))) ``` then the new $\hat{y}$ are ```{r} lambda=0.000001 y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y plot(d$x,d$y) lines(d$x,y.hat,col="green") lines(d$x,y.hat.pen,col="red") ``` ```{r} mybase=tpb(d$x,nnodi=25,degree=3) y.hat=mybase$X %*% solve(t(mybase$X)%*%mybase$X) %*% t(mybase$X) %*% d$y D=diag(c(rep(0,mybase$degree+1),rep(1,length(mybase$nodi)))) lambda=0.000001 y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y plot(d$x,d$y) lines(d$x,y.hat,col="green",lwd=2) lines(d$x,y.hat.pen,col="red",lwd=2) lambda=0.000000001 y.hat.pen=mybase$X %*% solve(t(mybase$X)%*%mybase$X+lambda*D) %*% t(mybase$X) %*% d$y lines(d$x,y.hat.pen,col="blue",lwd=2) ``` ```{r} basis.calc=function(xx,nodi=NULL,nnodi=NULL,degree=3){ if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)] D=diag(c(rep(0,degree+1),rep(1,length(nodi)))) fbase=function(x,k,p) ifelse(x-k>0,(x-k)^p,0) X=outer(xx,0:degree,FUN="^") X=cbind(X,outer(xx,nodi,FUN=fbase,p=degree)) return(list(nodi=nodi,degree=degree,D=D,X=X)) } splinetpb=function(x,xx,yy,h,nodi=NULL,nnodi=NULL,degree=3){ mybase=basis.calc(xx,nodi=nodi,nnodi=nnodi,degree=degree) mybase.new=basis.calc(x,nodi=nodi,nnodi=nnodi,degree=degree) temp=solve(t(mybase$X)%*%mybase$X + h*mybase$D) %*% t(mybase$X) dof=sum(diag(mybase$X %*% temp)) #Xnew=cbind(Xnew,outer(x,nodi,FUN=fbase,p=grado)) coefs= temp %*% yy y=mybase.new$X %*% coefs return(list(dof=dof,nodi=nodi,D=D,y=y,base=mybase.new$X,coefs=coefs)) } ``` Estimate and plot the smooth estimate for a couple of penalizations ```{r} plot(d$x,d$y) lines(seq(0,1,length=100),splinetpb(seq(0,1,length=100),d$x,d$y,0.0001,nnodi=20)$y,col="red",lwd=2) lines(seq(0,1,length=100),splinetpb(seq(0,1,length=100),d$x,d$y,10^(-10),nnodi=20)$y,col="green",lwd=2) ``` ```{r} fit1=splinetpb(d$x,d$x,d$y,4,nnodi=20) fit2=splinetpb(d$x,d$x,d$y,10^(-10),nnodi=20) par(mar=c(2,2,0,0)) layout(matrix(1:4,byrow=FALSE,ncol=2)) plot(d$x,d$y) matlines(d$x,cbind(fit1$y,fit1$base*matrix(fit1$coefs,nrow=nrow(fit1$base),ncol=ncol(fit1$base),byrow=TRUE)),type="l",lty=1,lwd=2,xlab="",ylab="") lines(d$x,fit1$y,col="black",lwd=3) matplot(fit1$base,type="l",lty=1,lwd=2,xlab="",ylab="") plot(d$x,d$y) matlines(d$x,cbind(fit2$y,fit2$base*matrix(fit2$coefs,nrow=nrow(fit2$base),ncol=ncol(fit2$base),byrow=TRUE)),type="l",lty=1,lwd=2,xlab="",ylab="") lines(d$x,fit2$y,col="black",lwd=3) matplot(fit2$base,type="l",lty=1,lwd=2,xlab="",ylab="") ``` ```{r} basis.calc=function(xx,nodi=NULL,nnodi=NULL,degree=3){ if (is.null(nodi)) nodi=seq(min(xx),max(xx),length=nnodi+2)[-c(1,nnodi+2)] X=splines::bs(xx,knots=nodi,degree=degree) D=diag(rep(1,ncol(X))) return(list(nodi=nodi,degree=degree,D=D,X=X)) } ``` # Bias and variance via MC simulation Perform a MC simulation to evaluate bias and variance of the estimator for a given bandwith as a function of $x$ ```{r} smoother=function(x,xx,yy,h,nodi=NULL,nnodi=NULL,degree=1){ splinetpb(x,xx,yy,h,nodi=seq(0,1,length=27)[-c(1,27)],degree=degree)$y } h.vec=c(100,10,1,0.1,0.01,0.001,0.0001,0.00001) ``` ```{r} plot(d$x,d$y,pch=20,xlab="x",ylab="y") curve(smoother(x,d$x,d$y,h=10,nnodi=25),add=TRUE,lwd=2,col="red") curve(smoother(x,d$x,d$y,h=h.vec[floor(length(h.vec)*.25)],nnodi=25),add=TRUE,lwd=2,col="red") curve(smoother(x,d$x,d$y,h=h.vec[floor(length(h.vec)*.75)],nnodi=25),add=TRUE,lwd=2,col="darkgreen") #legend(0,-0.6,legend=c("h=0.1","h=0.3"),lwd=2,col=c("red","darkgreen")) ``` ```{r} M=1000 hfix=h.vec[floor(length(h.vec)*.75)] ff.hat=sapply(rep(1,M),FUN=function(x) {d.i=simula() smoother(d.i$x,d.i$x,d.i$y,h=hfix)}) biasf.hat=(apply(ff.hat,1,mean)-my) varf.hat=apply(ff.hat,1,var) #matplot(x,cbind(biasf.hat^2,varf.hat),col=c("blue","red"),pch=20) plot(x,biasf.hat^2,pch=20) plot(x,varf.hat,pch=20) plot(x,biasf.hat^2+varf.hat,pch=20) ``` Perform a MC simulation to evaluate the overall bias and variance of the estimator (not as a function of $x$) for a range of bandwiths ```{r} M=1000 onesim=function(z){ d.i=simula() sapply(h.vec,FUN=function(x) smoother(d.i$x,d.i$x,d.i$y,x)) } #onesim(0.1) allsim=sapply(1:M,onesim,simplify="array") var2=function(x) mean(x^2)-mean(x)^2 f.mean=apply(allsim,1:2,mean) f.bias=apply(f.mean,2,FUN=function(x) x-my) f.variance=apply(allsim,1:2,var2) bias.overall=apply(abs(f.bias)^2,2,mean) variance.overall=apply(f.variance,2,mean) mse.overall=bias.overall+variance.overall # # alternative calculation # all.error=apply(allsim,2:3,FUN=function(x) x-my) # f.mse.all=apply(all.error,1:2,FUN=function(x) mean(x^2)) # f.bias2=apply(all.error,1:2,mean) # f.var2=apply(all.error,1:2,FUN=var2) matplot(log(h.vec),cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2)) abline(v=h.vec[which.min(mse.overall)]) ``` # Cross validation Use $v-$fold cross validation to estimate the mean square error for the range of bandwiths and compare the estimate with the mse obtained via MC. ```{r} V=n #number of folds #d$fold=sample(1:V,n,replace=TRUE) #sample(c(rep(1:V,11),V)) if (n%%V>0) d$fold=sample(c(rep(1:V,n %/% V),1:(n%%V))) else d$fold=sample(rep(1:V,n %/% V)) msefold=function(v,h) { test=subset(d,fold==v) train=subset(d,fold!=v) # browser() y.hat=smoother(test$x,train$x,train$y,h) mean((test$y-y.hat)^2,na.rm=TRUE) } msefold(2,0.2) rmse=mean(sapply(1:V,msefold,h=.5)) ``` Ripetiamo l'operazione per tutte le bandwith ```{r} msefold.v=Vectorize(msefold) allmse=outer(1:V,h.vec,FUN=msefold.v) rmse=apply(allmse,2,mean) ``` ```{r} matplot(log(h.vec),cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="") abline(v=h.vec[which.min(mse.overall)]) rmse1=rmse-min(rmse)+min(bias.overall+variance.overall) points(log(h.vec),rmse1) points(log(h.vec)[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red") ``` The optimal estimate according to CV is ```{r} plot(d$x,d$y,pch=20,xlab="x",ylab="y") curve(smoother(x,d$x,d$y,h=h.vec[which.min(rmse)]),add=TRUE,lwd=2,col="red") curve(smoother(x,d$x,d$y,h=h.vec[which.min(mse.overall)]),add=TRUE,lwd=2,col="darkgreen") ```