In this note we program functions to compute some simple local smoothers; for some of them we also illustrate functions from existing R packages. We then perform simulation studies to explore the behaviour of the smoothers as estimators of the regression function \(f\) (unknown in practice, known in the simulations).

Data

The smoothers will be applied to either a real or a simulated dataset; in either case the data must be stored in a data.frame with columns x and y containing the observations of the independent and the dependent variable, respectively.

Loading the real dataset (lidar)

d=read.table("lidar.dat",header=TRUE)   # read the lidar data
names(d)=c("x","y")                     # rename the columns
d=d[sort.list(d$x),]                    # sort the observations by x
d$x=(d$x-min(d$x))/(max(d$x)-min(d$x))  # rescale x to [0,1]
#d$x=(d$x-mean(d$x))/sd(d$x)            # alternative: standardize x

Simulation of a dataset \[ y_i = f(x_i)+\varepsilon_i \] for \(i=1,\ldots,n\), where the \(\varepsilon_i\sim N(0,\sigma^2)\) are independent.

n=200
#x=sort(runif(n,0,1))  # alternative: uniform random design

#x=rbeta(n,1,6)        # alternative: skewed random design
truef=function(x) sin(2*pi*x)  # the true regression function
signaltonoise=2                # ratio sd(f(x))/sigma

simula=function(x=seq(0,1,length=n),signalnoise=signaltonoise){
  my=truef(x)                                  # true mean at each x
  y=rnorm(length(x),my,sd=sd(my)/signalnoise)  # add Gaussian noise
  return(data.frame(x=x,y=y,truef=my))
}
d=simula()
plot(d$x,d$y)
lines(d$x,d$truef,col="darkred",lwd=2)

The smoothers

Define (or find) functions to compute the smoothers. Each function we create has the following structure

smoother=function(x,xx,yy,h){
  ...
  return(y)
}

where x is the vector of points at which the estimate is computed, xx and yy are the vectors of observed values of the independent and dependent variables, and h is the smoothing parameter (a bandwidth or, for the nearest neighbours smoother, the number of neighbours).

Running mean

Define a function which computes the running mean at a single point \[ \hat{f}(x) = \frac{\sum_{i=1}^n y_i I_{h}(|x-x_i|)}{\sum_{i=1}^n I_{h}(|x-x_i|)} \] where \(I_h(u)=1\) if \(u<h\) and \(0\) otherwise.

runmean0=function(x,xx,yy,h)
  mean(yy[abs(xx-x)<h])  # average the y's whose x is within h of the target point
runmean0(0.5,d$x,d$y,h=0.1)
## [1] 0.1114741

Use the first function to define a new one that computes the smoother for a vector of values.

runmean=function(x,xx,yy,h)  
  mapply(runmean0,x,MoreArgs=list(xx=xx,yy=yy,h=h))  # apply runmean0 to each element of x

# alternative
#runmean=Vectorize(runmean0,"x")


Using this, we obtain estimates of \(f(x)\) at a vector of points

runmean(c(0.5,0.6),d$x,d$y,h=0.1)
## [1]  0.1114741 -0.3564263

Mean of nearest neighbours

Let \[ N_k(x)=\{x_i: |x-x_i|\leq d_{(k)}\} \] where \(d_i=|x-x_i|\) and \(d_{(1)}\leq \ldots\leq d_{(n)}\) are the ordered distances; then \[ \hat{f}(x) = \frac{1}{k} \sum_{i=1}^n y_i I_{N_k(x)}(x_i). \] In the code below the argument h plays the role of \(k\), the number of neighbours.

hneighmean0=function(x,xx,yy,h){
  mean(yy[abs(xx-x)<=sort(abs(xx-x))[h]])  # average the h nearest neighbours (<= so the h-th neighbour is included)
}
hneighmean=function(x,xx,yy,h)  
  mapply(hneighmean0,x,MoreArgs=list(xx=xx,yy=yy,h=h))

# alternative
# hneighmean=Vectorize(hneighmean0,"x")
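
As for the running mean, the smoother can be evaluated at a vector of points; for instance (output omitted, since it depends on the simulated data):

hneighmean(c(0.5,0.6),d$x,d$y,h=20)  # average of the 20 nearest neighbours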

Kernel regression

Kernel regression is defined as \[ \hat{f}(x) = \sum_{i=1}^n \ell_i(x) y_i \] in which \[ \ell_i(x) = \frac{K\left(\frac{x-x_i}{h}\right)}{\sum_{j=1}^n K\left(\frac{x-x_j}{h}\right)} \] \(K(\cdot)\) being a kernel.

kernelreg0=function(x,xx,yy,h){
  weights=dnorm((x-xx)/h)  # normal kernel weights
  return(sum(weights*yy)/sum(weights))
}

kernelreg=function(x,xx,yy,h)  
  mapply(kernelreg0,x,MoreArgs=list(xx=xx,yy=yy,h=h))

# alternative
#kernelreg=Vectorize(kernelreg0,"x")
kernelreg0(c(0.5),d$x,d$y,h=0.1)
## [1] 0.07901042
kernelreg0(c(0.6),d$x,d$y,h=0.1)
## [1] -0.3510991
kernelreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1]  0.07901042 -0.35109905
The same smoother is implemented in base R by ksmooth. Note that ksmooth rescales the kernel so that its quartiles are at \(\pm 0.25\cdot\mathrm{bandwidth}\); to match a normal kernel with standard deviation \(h\), the bandwidth must therefore be multiplied by \(\mathrm{qnorm}(0.75)/0.25\), as the second call below shows.

kernelreg=function(x,xx,yy,h,kernel="normal") ksmooth(xx,yy,kernel=kernel,bandwidth=h,x.points=x)$y

kernelreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1]  0.09107379 -0.36749274
kernelreg(c(0.5,0.6),d$x,d$y,h=0.1*qnorm(0.75)/0.25)
## [1]  0.07900592 -0.35113222

Loess

A local polynomial fit of degree degree is computed at each point by weighted least squares, with weights given by a normal kernel; the estimate \(\hat{f}(x)\) is the intercept of the local fit.

loessreg0=function(x,xx,yy,h,degree=2){
  W=diag(dnorm((x-xx)/h))  # kernel weights
  X=outer(xx-x,0:degree,FUN=function(x,y) (x^y/factorial(y)))  # local polynomial basis centred at x
  solve(t(X)%*%W%*%X,t(X)%*%W%*%yy)[1]  # the intercept is the estimate at x
}

loessreg=function(x,xx,yy,h,...)  
  mapply(loessreg0,x,MoreArgs=list(xx=xx,yy=yy,h=h,...))

# alternative
#loessreg=Vectorize(loessreg0,"x")
loessreg0(c(0.5),d$x,d$y,h=0.1)
## [1] 0.1001812
loessreg0(c(0.6),d$x,d$y,h=0.1)
## [1] -0.3927242
loessreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1]  0.1001812 -0.3927242
The loess function in base R computes a similar local polynomial fit, but with a tricube kernel and with span interpreted as the fraction of observations used in each local fit, so its estimates differ from those of loessreg:

loessreg2=function(x,xx,yy,h,degree=2,family="gaussian"){
  fit=loess(yy~xx,span=h,surface="direct",degree=degree,family=family)  # family is now passed on
  predict(fit,newdata=data.frame(xx=x))
}
#stima=loess(y~x,data=d,span=0.1)
loessreg2(c(0.5,0.6),d$x,d$y,0.1)
## [1] -0.007266302 -0.391545715

(See also the KernSmooth package and its function locpoly, which computes kernel-weighted local polynomial fits: degree 0 gives kernel regression, higher degrees give loess-type fits.)
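
A minimal sketch of a wrapper with the same smoother(x,xx,yy,h) interface as above, assuming KernSmooth is installed (locpolyreg and its defaults are our own choices, not part of the original note): locpoly evaluates the fit on a regular grid, so we interpolate the grid values at the requested points.

library(KernSmooth)
locpolyreg=function(x,xx,yy,h,degree=0){        # degree=0 gives kernel regression
  fit=locpoly(xx,yy,degree=degree,bandwidth=h)  # fit on a regular grid
  approx(fit$x,fit$y,xout=x)$y                  # interpolate at the requested points
}
locpolyreg(c(0.5,0.6),d$x,d$y,h=0.1)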

Bin smoother

The bin smoother partitions the range of x into bins of width roughly h and estimates f by the mean of the observations falling in each bin; this can be computed as a least squares regression on the bin indicators.

binsmoother=function(x,xx,yy,h){
  breaks=seq(min(xx),max(xx),length=(max(xx)-min(xx))/h)    # bin boundaries
  X=model.matrix(yy~cut(xx,breaks,include.lowest = TRUE))   # bin indicators for the data
  Xnew=model.matrix(x~cut(x,breaks,include.lowest = TRUE))  # bin indicators for the new points
  Xnew %*% solve(t(X)%*%X,t(X)%*%yy)                        # bin means at the new points
}
binsmoother(c(0.5,0.6),d$x,d$y,h=0.1)
##         [,1]
## 1  0.1027698
## 2 -0.4451618
Choose the smoother and the corresponding grid of smoothing parameters for the experiments that follow (uncomment the desired pair):

#smoother=hneighmean
#h.vec=1:(floor(n/2))

#smoother=runmean
#h.vec=seq(0,0.7,length=51)[-1]

smoother=kernelreg #runmean
h.vec=seq(0.05,0.3,length=51)[-1]

#smoother=loessreg
#h.vec=seq(0.05,1,length=5)

#smoother=binsmoother
#h.vec=seq(0.05,1,length=5)

Estimate and plot the smoothed curve for a couple of bandwidths (red: h=0.01, undersmoothed; dark green: h=0.4, oversmoothed).

plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=0.01),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=0.4),add=TRUE,lwd=2,col="darkgreen")

Bias and variance via MC simulation

We perform a MC simulation to evaluate the bias and variance of the estimator for a given bandwidth, as a function of \(x\): \[ MSE_x = R(f(x),\hat{f}(x)) = E(L(f(x),\hat{f}(x))) = (f(x)-E(\hat{f}(x)))^2 + V(\hat{f}(x)) \] where \(L\) is the squared error loss.

M=1000
hfix=0.1
#ris=matrix...
#for (i in 1:M) {
#  d.i=simula()
#  ris[] = smoother(d.i$x,d.i$x,d.i$y,h=hfix)
#}
n=200 #sample size
xobs=seq(0,1,length=n) # observed x values
xseq=seq(0,1,length=100) # points at which bias and variance are evaluated
truefseq=truef(xseq) # true values of f
ff.hat=replicate(M,{                 # one column per MC replication
  d.i=simula(x=xobs)
  smoother(xseq,d.i$x,d.i$y,h=hfix)
})
biasf.hat=apply(ff.hat,1,mean)-truefseq  # pointwise bias
varf.hat=apply(ff.hat,1,var)             # pointwise variance
#matplot(xseq,cbind(biasf.hat^2,varf.hat),col=c("blue","red"),pch=20)
plot(xseq,biasf.hat^2,pch=20)

plot(xseq,varf.hat,pch=20)

plot(xseq,biasf.hat^2+varf.hat,pch=20)
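
To see where the bias comes from, it can help to overlay the MC mean of the estimates on the true function; this extra plot is just a sketch, not part of the original analysis:

plot(xseq,truefseq,type="l",lwd=2,col="darkred",xlab="x",ylab="f")  # true function
lines(xseq,apply(ff.hat,1,mean),lwd=2,col="blue")                   # MC mean of the estimates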

Perform a MC simulation to evaluate the overall bias and variance of the estimator (averaged over \(x\) rather than as a function of \(x\)) for a range of bandwidths \[R(f,\hat{f}) = \frac{1}{n}\sum_{i=1}^n R(f(x_i),\hat{f}(x_i)) \]

M=1000


onesim=function(z){  # z is just the replication index
  d.i=simula(x=xobs)
  sapply(h.vec,FUN=function(x) smoother(xobs,d.i$x,d.i$y,x))  # one column per bandwidth
}
#onesim(1)
allsim=sapply(1:M,onesim,simplify="array")  # array of dimension n x length(h.vec) x M

var2=function(x) mean(x^2)-mean(x)^2  # variance without the n/(n-1) correction
f.mean=apply(allsim,1:2,mean)         # MC mean of the estimates
f.bias=apply(f.mean,2,FUN=function(x) x-truef(xobs))  # pointwise bias for each bandwidth
f.variance=apply(allsim,1:2,var2)     # MC variance of the estimates

bias.overall=apply(f.bias^2,2,mean)        # squared bias averaged over x
variance.overall=apply(f.variance,2,mean)  # variance averaged over x
mse.overall=bias.overall+variance.overall


# # alternative calculation
# all.error=apply(allsim,2:3,FUN=function(x) x-truef(xobs))
# f.mse.all=apply(all.error,1:2,FUN=function(x) mean(x^2))
# f.bias2=apply(all.error,1:2,mean)
# f.var2=apply(all.error,1:2,FUN=var2)

# blue: squared bias, red: variance, black (dashed): mse
matplot(h.vec,cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2))
abline(v=h.vec[which.min(mse.overall)])  # MC-optimal bandwidth

Cross validation

Use \(v\)-fold cross validation to estimate the mean squared error for the range of bandwidths and compare the estimate with the MSE obtained via MC.

V=10 #number of folds
#d$fold=sample(1:V,n,replace=TRUE) #sample(c(rep(1:V,11),V))
# assign each observation to one of V (nearly) equal-sized folds
if (n%%V>0) d$fold=sample(c(rep(1:V,n %/% V),1:(n%%V))) else d$fold=sample(rep(1:V,n %/% V))
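
As a quick sanity check (not in the original code), the fold sizes can be inspected:

table(d$fold)  # each fold should contain about n/V observations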

msefold=function(v,h) {
  test=subset(d,fold==v)   # fold v is the test set
  train=subset(d,fold!=v)  # the remaining folds form the training set
  y.hat=smoother(test$x,train$x,train$y,h)
  mean((test$y-y.hat)^2,na.rm=TRUE)  # na.rm: some smoothers return NA outside the data range
}
msefold(2,0.2)
## [1] 0.1180701
rmse=mean(sapply(1:V,msefold,h=.5))  # CV estimate of the mse for a single bandwidth

The same calculation is then repeated for all bandwidths.

msefold.v=Vectorize(msefold)
allmse=outer(1:V,h.vec,FUN=msefold.v)  # V x length(h.vec) matrix of fold-wise mse's
rmse=apply(allmse,2,mean)              # CV estimate of the mse for each bandwidth
matplot(h.vec,cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="")
abline(v=h.vec[which.min(mse.overall)])
rmse1=rmse-min(rmse)+min(mse.overall)  # shift the CV curve to make it comparable with the MC mse
points(h.vec,rmse1)
points(h.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red")  # CV-optimal bandwidth

The optimal estimate according to CV (red) is shown below, together with the MC-optimal one (dark green).

plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(rmse)]),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(mse.overall)]),add=TRUE,lwd=2,col="darkgreen")

Smoothing matrix

We obtain the smoothing matrix and, from it, the equivalent degrees of freedom of the smoother for each bandwidth; we can then plot the MSE as a function of the degrees of freedom instead of the bandwidth.

y.hat=smoother(d$x,d$x,d$y,h=hfix)
zeri=rep(0,length=length(d$y))
# the j-th column of the smoothing matrix is obtained by smoothing the
# j-th indicator vector (1 in position j, 0 elsewhere)
columnj=function(j,h) {
  zeri[j]=1
  smoother(d$x,d$x,zeri,h=h)
}
columnj(1,hfix)
##   [1] 1.026294e-01 9.230404e-02 8.239528e-02 7.295366e-02 6.402809e-02
##   [6] 5.566407e-02 4.790182e-02 4.077420e-02 3.430484e-02 2.850641e-02
##  [11] 2.337938e-02 1.891137e-02 1.507733e-02 1.184044e-02 9.153937e-03
##  [16] 6.963447e-03 5.209838e-03 3.832150e-03 2.770389e-03 1.967912e-03
##  [21] 1.373241e-03 9.412234e-04 6.335631e-04 4.187913e-04 2.718232e-04
##  [26] 1.732350e-04 1.084002e-04 6.659786e-05 4.017160e-05 2.379043e-05
## (entries [31] to [200] are all 0.000000e+00)
L=sapply(1:n,columnj,h=hfix)  # assemble the full smoothing matrix
## just a check that the smoothing matrix reproduces the fitted values
y.hat1=L%*%d$y
plot(y.hat1,y.hat)

identical(y.hat1,y.hat)
## [1] FALSE
max(abs(y.hat1-y.hat))  # the two agree up to floating point error
## [1] 7.771561e-16
sum(diag(L))  # trace of the smoothing matrix = equivalent degrees of freedom
## [1] 11.28414
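
Since the smoother is linear and, for kernel weights, deleting an observation just renormalizes the remaining weights, the leave-one-out residual satisfies \(y_i-\hat{f}^{(-i)}(x_i)=(y_i-\hat{f}(x_i))/(1-L_{ii})\). As a sketch (valid for the kernel smoother; not part of the original analysis), this gives a CV estimate of the mse without refitting:

loocv=function(h){
  L=sapply(1:n,columnj,h=h)          # smoothing matrix for bandwidth h
  y.hat=L%*%d$y                      # fitted values
  mean(((d$y-y.hat)/(1-diag(L)))^2)  # leave-one-out shortcut formula
}
loocv(hfix)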

We can then compute the equivalent degrees of freedom for each bandwidth.

computeDOF=function(h){
  L=sapply(1:n,columnj,h=h)  # smoothing matrix for bandwidth h
  sum(diag(L))               # equivalent degrees of freedom
}
computeDOF(hfix)
## [1] 11.28414
dof.vec=sapply(h.vec,computeDOF)
matplot(dof.vec,cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="")
abline(v=dof.vec[which.min(mse.overall)])  # MC-optimal bandwidth, now on the DOF scale
rmse1=rmse-min(rmse)+min(mse.overall)
points(dof.vec,rmse1)
points(dof.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red")