--- title: 'Semiparametric regression: local methods' output: html_document: default pdf_document: default date: "2023-10-28" --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` In this note we program functions to compute some simple local smoothers, for some of them we illustrate functions from existing `R` packages. We will then perform some simulation studies to explore the behaviour of the smoothers as estimators of the unknown (or known) function $f$. # Data The smoothers will be applied to either a real or simulated dataset, in any case the data must be in a data.frame with columns `x` and `y` for the independent and dependent variables observations respectively. Real dataset (lidar) loading ```{r dati} d=read.table("lidar.dat",header=TRUE) names(d)=c("x","y") d=d[sort.list(d$x),] d$x=(d$x-min(d$x))/(max(d$x)-min(d$x)) #d$x=(d$x-mean(d$x))/sd(d$x) ``` Simulation of a dataset \[ y_i = f(x_i)+\varepsilon_i \] for $i=1,\ldots,n$ and where $\varepsilon_i\thicksim N(0,\sigma^2)$ independent. ```{r simulated data} n=200 #x=sort(runif(n,0,1)) #x=rbeta(n,1,6) truef=function(x) sin(2*pi*x) signaltonoise=2 simula=function(x=seq(0,1,length=n),signalnoise=signaltonoise){ my=truef(x) y=rnorm(n,my,sd=sd(my)/signalnoise) return(data.frame(x=x,y=y,truef=my)) } d=simula() plot(d$x,d$y) lines(d$x,d$truef,col="darkred",lwd=2) ``` # The smoothers Define (or find) functions to compute the smoothers, the function we create has the following structure ```{r echo=TRUE,eval=FALSE} smoother(x,xx,yy,h){ ... return(y) } ``` where - `x` are the points in which the smoother is calculated, possibly a vector - `xx`, `yy` are the vectors of observations for the independent and dependent variable respectively - `h` is the bandwith (or equivalent) - `y` (the returned values) are the value of the smoother in the points in `x` (a vector of the same length as `x`) ## Running mean Define a function which computes the running mean in a point \[ \hat{f}(x) = \frac{\sum_{i=1}^n y_i I_{h}(|x-x_i|)}{\sum_{i=1}^n I_{h}(|x-x_i|)} \] ```{r} runmean0=function(x,xx,yy,h) mean(yy[abs(xx-x)0) d$fold=sample(c(rep(1:V,n %/% V),1:(n%%V))) else d$fold=sample(rep(1:V,n %/% V)) msefold=function(v,h) { test=subset(d,fold==v) train=subset(d,fold!=v) y.hat=smoother(test$x,train$x,train$y,h) mean((test$y-y.hat)^2,na.rm=TRUE) } msefold(2,0.2) rmse=mean(sapply(1:V,msefold,h=.5)) ``` The same calculation is then repeated for all bandwiths ```{r} msefold.v=Vectorize(msefold) allmse=outer(1:V,h.vec,FUN=msefold.v) rmse=apply(allmse,2,mean) ``` ```{r} matplot(h.vec,cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="") abline(v=h.vec[which.min(mse.overall)]) rmse1=rmse-min(rmse)+min(bias.overall+variance.overall) points(h.vec,rmse1) points(h.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red") ``` The optimal estimate according to CV is ```{r} plot(d$x,d$y,pch=20,xlab="x",ylab="y") curve(smoother(x,d$x,d$y,h=h.vec[which.min(rmse)]),add=TRUE,lwd=2,col="red") curve(smoother(x,d$x,d$y,h=h.vec[which.min(mse.overall)]),add=TRUE,lwd=2,col="darkgreen") ``` # Smoothing matrix We obtain the smoothing matrix and consequently the equivalent degrees of freedom of the smoother for each bandwidth, we can then plot the mse as a function of the DOF instead of the bandwith. ```{r} y.hat=smoother(d$x,d$x,d$y,h=hfix) zeri=rep(0,length=length(d$y)) columnj=function(j,h) { zeri[j]=1 smoother(d$x,d$x,zeri,h=h) } columnj(1,hfix) L=sapply(1:n,columnj,h=hfix) ##just a check that it works y.hat1=L%*%d$y plot(y.hat1,y.hat) identical(y.hat1,y.hat) max(abs(y.hat1-y.hat)) sum(diag(L)) ``` We can then compute the equivalent degrees of freedom for each bandwith ```{r} computeDOF=function(h){ L=sapply(1:n,columnj,h=h) sum(diag(L)) } computeDOF(hfix) dof.vec=sapply(h.vec,computeDOF) ``` ```{r} matplot(dof.vec,cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="") abline(v=h.vec[which.min(mse.overall)]) rmse1=rmse-min(rmse)+min(bias.overall+variance.overall) points(dof.vec,rmse1) points(dof.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red") ```