In this note we program functions to compute some simple local smoothers; for some of them we also illustrate functions from existing R packages. We then perform some simulation studies to explore the behaviour of the smoothers as estimators of the unknown (or known) function \(f\).
The smoothers will be applied to either a real or a simulated dataset; in either case the data must be in a data.frame with columns x and y containing the observations of the independent and dependent variables respectively.
Loading of the real dataset (lidar)
d=read.table("lidar.dat",header=TRUE)
names(d)=c("x","y")
d=d[sort.list(d$x),]                      # order the observations by x
d$x=(d$x-min(d$x))/(max(d$x)-min(d$x))    # rescale x to [0,1]
#d$x=(d$x-mean(d$x))/sd(d$x)              # alternative: standardize x
Simulation of a dataset \[ y_i = f(x_i)+\varepsilon_i \] for \(i=1,\ldots,n\) and where \(\varepsilon_i\thicksim N(0,\sigma^2)\) independent.
n=200
#x=sort(runif(n,0,1))
#x=rbeta(n,1,6)
truef=function(x) sin(2*pi*x)
signaltonoise=2
simula=function(x=seq(0,1,length=n),signalnoise=signaltonoise){
  my=truef(x)                                   # true regression function at the design points
  y=rnorm(length(x),my,sd=sd(my)/signalnoise)   # noise level set through the signal-to-noise ratio
  return(data.frame(x=x,y=y,truef=my))
}
d=simula()
plot(d$x,d$y)
lines(d$x,d$truef,col="darkred",lwd=2)
Define (or find) functions to compute the smoothers. The functions we create have the following structure:
smoother=function(x,xx,yy,h){
  ...
  return(y)
}
where
- x are the points at which the smoother is evaluated, possibly a vector;
- xx, yy are the vectors of observations for the independent and dependent variables respectively;
- h is the bandwidth (or an equivalent tuning parameter);
- y (the returned value) contains the values of the smoother at the points in x (a vector of the same length as x).
Define a function which computes the running mean at a single point \[ \hat{f}(x) = \frac{\sum_{i=1}^n y_i I_{h}(|x-x_i|)}{\sum_{i=1}^n I_{h}(|x-x_i|)} \] where \(I_{h}(u)\) is the indicator that \(u<h\).
runmean0=function(x,xx,yy,h)
mean(yy[abs(xx-x)<h])
runmean0(0.5,d$x,d$y,h=0.1)
## [1] 0.1114741
Use the first function to define a new one which computes the smoother for a vector of values
runmean=function(x,xx,yy,h)
mapply(runmean0,x,MoreArgs=list(xx=xx,yy=yy,h=h))
# alternative
#runmean=Vectorize(runmean0,"x")
Using this, we obtain estimates of \(f(x)\) at a vector of values
runmean(c(0.5,0.6),d$x,d$y,h=0.1)
## [1] 0.1114741 -0.3564263
For the \(k\) nearest neighbours running mean, let \[ N_k(x)=\{x_i: |x-x_i|\leq d_{(k)}\} \] where \(d_i=|x-x_i|\) and \(d_{(1)}\leq \ldots\leq d_{(n)}\) are the ordered distances; then \[ \hat{f}(x) = \frac{1}{k} \sum_{i=1}^n y_i I_{N_k(x)}(x_i). \] In the code below the argument h plays the role of \(k\).
hneighmean0=function(x,xx,yy,h){
  # use <= so that exactly the h nearest observations enter the mean, as in the formula
  mean(yy[abs(xx-x)<=sort(abs(xx-x))[h]])
}
hneighmean=function(x,xx,yy,h)
mapply(hneighmean0,x,MoreArgs=list(xx=xx,yy=yy,h=h))
# alternative
# hneighmean=Vectorize(hneighmean0,"x")
The kernel regression (Nadaraya-Watson) estimator is defined as \[ \hat{f}(x) = \sum_{i=1}^n \ell_i(x) Y_i \] in which \[ \ell_i(x) = \frac{K\left(\frac{x-x_i}{h}\right)}{\sum_{j=1}^n K\left(\frac{x-x_j}{h}\right)} \] \(K()\) being a kernel.
kernelreg0=function(x,xx,yy,h){
  weights=dnorm((x-xx)/h)   # Gaussian kernel weights K((x-x_i)/h)
  return(sum(weights*yy)/sum(weights))
}
kernelreg=function(x,xx,yy,h)
mapply(kernelreg0,x,MoreArgs=list(xx=xx,yy=yy,h=h))
# alternative
#kernelreg=Vectorize(kernelreg0,"x")
kernelreg0(c(0.5),d$x,d$y,h=0.1)
## [1] 0.07901042
kernelreg0(c(0.6),d$x,d$y,h=0.1)
## [1] -0.3510991
kernelreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1] 0.07901042 -0.35109905
We now redefine kernelreg using the ksmooth function from the stats package; note its different bandwidth convention (the kernels are scaled so that their quartiles are at \(\pm 0.25\) bandwidth).
kernelreg=function(x,xx,yy,h,kernel="normal") ksmooth(xx,yy,kernel=kernel,bandwidth=h,x.points=x)$y
kernelreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1] 0.09107379 -0.36749274
kernelreg(c(0.5,0.6),d$x,d$y,h=0.1*qnorm(0.75)/0.25) # rescaled so that the normal kernel has standard deviation 0.1 under ksmooth's convention
## [1] 0.07900592 -0.35113222
Local polynomial regression: at each point \(x\) a polynomial of degree \(p\) in \((x_i-x)\) is fitted by weighted least squares with kernel weights \(K\left(\frac{x-x_i}{h}\right)\) (here a Gaussian kernel), and \(\hat{f}(x)\) is the fitted intercept.
loessreg0=function(x,xx,yy,h,degree=2){
  W=diag(dnorm((x-xx)/h))                                        # kernel weights
  X=outer(xx-x,0:degree,FUN=function(x,y) (x^y/factorial(y)))    # local polynomial (Taylor) basis
  solve(t(X)%*%W%*%X,t(X)%*%W%*%yy)[1]                           # intercept of the weighted LS fit
}
loessreg=function(x,xx,yy,h,...)
mapply(loessreg0,x,MoreArgs=list(xx=xx,yy=yy,h=h,...))
# alternative
#loessreg=Vectorize(loessreg0,"x")
loessreg0(c(0.5),d$x,d$y,h=0.1)
## [1] 0.1001812
loessreg0(c(0.6),d$x,d$y,h=0.1)
## [1] -0.3927242
loessreg(c(0.5,0.6),d$x,d$y,h=0.1)
## [1] 0.1001812 -0.3927242
A similar estimate can be obtained with the loess function; note that its span argument is the fraction of observations used in each local fit, not a bandwidth on the x scale.
loessreg2 = function(x,xx,yy,h,degree=2,family="gaussian") {
  fit=loess(yy~xx,span=h,surface="direct",degree=degree,family=family)
  predict(fit,newdata=data.frame(xx=x))
}
#stima=loess(y~x,data=d,span=0.1)
loessreg2(c(0.5,0.6),d$x,d$y,0.1)
## [1] -0.007266302 -0.391545715
(See also the KernSmooth package and its locpoly function, for both kernel regression and local polynomial fits; a sketch of its use is given below.)
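As an indication of how locpoly might be used (a minimal sketch, not run in this note; the bandwidth value is only illustrative and not a choice made here): degree=0 gives a kernel regression estimate, while degree=1 or 2 gives local polynomial estimates, evaluated on a regular grid rather than at arbitrary x values.
# sketch: kernel regression and local polynomial fits via KernSmooth::locpoly
library(KernSmooth)
fit0=locpoly(d$x,d$y,degree=0,bandwidth=0.1)   # local constant fit = kernel regression (bandwidth is illustrative)
fit2=locpoly(d$x,d$y,degree=2,bandwidth=0.1)   # local quadratic fit
plot(d$x,d$y,pch=20,xlab="x",ylab="y")
lines(fit0,col="red",lwd=2)                    # locpoly returns a list with components x and y
lines(fit2,col="darkgreen",lwd=2)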
The bin smoother (regressogram) partitions the range of the x observations into equal-width bins and estimates \(f\) by the mean of the \(y_i\) in each bin; here this is computed through a regression on the bin indicators.
binsmoother=function(x,xx,yy,h){
  breaks=seq(min(xx),max(xx),length=(max(xx)-min(xx))/h)   # bin boundaries (bins of width approximately h)
  X=model.matrix(yy~cut(xx,breaks,include.lowest = TRUE))
  Xnew=model.matrix(x~cut(x,breaks,include.lowest = TRUE))
  Xnew %*% solve(t(X)%*%X,t(X)%*%yy)
}
binsmoother(c(0.5,0.6),d$x,d$y,h=0.1)
## [,1]
## 1 0.1027698
## 2 -0.4451618
Choose the smoother to be studied and a corresponding grid of bandwidth values (the commented lines give suitable grids for the other smoothers).
#smoother=hneighmean
#h.vec=1:(floor(n/2))
#smoother=runmean
#h.vec=seq(0,0.7,length=51)[-1]
smoother=kernelreg #runmean
h.vec=seq(0.05,0.3,length=51)[-1]
#smoother=loessreg
#h.vec=seq(0.05,1,length=5)
#smoother=binsmoother
#h.vec=seq(0.05,1,length=5)
Compute and plot the smoothed estimates for a couple of bandwidths
plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=0.01),add=TRUE,lwd=2,col="red")        # very small bandwidth: undersmoothing
curve(smoother(x,d$x,d$y,h=0.4),add=TRUE,lwd=2,col="darkgreen")   # large bandwidth: oversmoothing
We perform a Monte Carlo simulation to evaluate the bias and variance of the estimator for a given bandwidth, as a function of \(x\), \[ MSE_x = R(f(x),\hat{f}(x)) = E(L(f(x),\hat{f}(x))) = (f(x)-E(\hat{f}(x)))^2 + V(\hat{f}(x)) \] where \(L\) is the squared error loss.
M=1000
hfix=0.1
#ris=matrix(NA,nrow=n,ncol=M)   # one column per simulated dataset
#for (i in 1:M) {
#  d.i=simula()
#  ris[,i]=smoother(d.i$x,d.i$x,d.i$y,h=hfix)
#}
n=200 #sample size
xobs=seq(0,1,length=n) # observed x values
xseq=seq(0,1,length=100) # values of x in which the bias and variance are calculated
truefseq=truef(xseq) # true values of f
ff.hat=sapply(rep(1,M),
FUN=function(x) {
d.i=simula(x=xobs)
smoother(xseq,d.i$x,d.i$y,h=hfix)
}
)
biasf.hat=(apply(ff.hat,1,mean)-truefseq)
varf.hat=apply(ff.hat,1,var)
#matplot(xseq,cbind(biasf.hat^2,varf.hat),col=c("blue","red"),pch=20)
plot(xseq,biasf.hat^2,pch=20)
plot(xseq,varf.hat,pch=20)
plot(xseq,biasf.hat^2+varf.hat,pch=20)
Perform a Monte Carlo simulation to evaluate the overall bias and variance of the estimator (not as a function of \(x\)) for a range of bandwidths \[R(f,\hat{f}) = \frac{1}{n}\sum_{i=1}^n R(f(x_i),\hat{f}(x_i)) \]
M=1000
onesim=function(z){
d.i=simula(x=xobs)
sapply(h.vec,FUN=function(x) smoother(xobs,d.i$x,d.i$y,x))
}
#onesim(0.1)
allsim=sapply(1:M,onesim,simplify="array")
var2=function(x) mean(x^2)-mean(x)^2
f.mean=apply(allsim,1:2,mean)
f.bias=apply(f.mean,2,FUN=function(x) x-truef(xobs))
f.variance=apply(allsim,1:2,var2)
bias.overall=apply(abs(f.bias)^2,2,mean)
variance.overall=apply(f.variance,2,mean)
mse.overall=bias.overall+variance.overall
# # alternative calculation
# all.error=apply(allsim,2:3,FUN=function(x) x-truef(xobs))
# f.mse.all=apply(all.error,1:2,FUN=function(x) mean(x^2))
# f.bias2=apply(all.error,1:2,mean)
# f.var2=apply(all.error,1:2,FUN=var2)
matplot(h.vec,cbind(bias.overall,variance.overall,mse.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2))
abline(v=h.vec[which.min(mse.overall)])
Use \(v\)-fold cross-validation to estimate the mean square error over the range of bandwidths and compare the estimate with the MSE obtained via the MC simulation.
V=10 #number of folds
#d$fold=sample(1:V,n,replace=TRUE) #sample(c(rep(1:V,11),V))
if (n%%V>0) d$fold=sample(c(rep(1:V,n %/% V),1:(n%%V))) else d$fold=sample(rep(1:V,n %/% V))  # balanced random assignment of the n observations to V folds
msefold=function(v,h) {
test=subset(d,fold==v)
train=subset(d,fold!=v)
y.hat=smoother(test$x,train$x,train$y,h)
mean((test$y-y.hat)^2,na.rm=TRUE)
}
msefold(2,0.2)
## [1] 0.1180701
rmse=mean(sapply(1:V,msefold,h=.5))
The same calculation is then repeated for all bandwidths
msefold.v=Vectorize(msefold)
allmse=outer(1:V,h.vec,FUN=msefold.v)
rmse=apply(allmse,2,mean)
matplot(h.vec,cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="")
abline(v=h.vec[which.min(mse.overall)])
rmse1=rmse-min(rmse)+min(bias.overall+variance.overall)  # CV estimates sigma^2+MSE, so shift it vertically to compare with the MC mse curve
points(h.vec,rmse1)
points(h.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red")
The optimal estimate according to CV is shown below in red, together with the estimate minimizing the MC mse in dark green.
plot(d$x,d$y,pch=20,xlab="x",ylab="y")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(rmse)]),add=TRUE,lwd=2,col="red")
curve(smoother(x,d$x,d$y,h=h.vec[which.min(mse.overall)]),add=TRUE,lwd=2,col="darkgreen")
We obtain the smoothing matrix and hence the equivalent degrees of freedom of the smoother for each bandwidth; we can then plot the mse as a function of the degrees of freedom instead of the bandwidth. For a linear smoother \(\hat{y}=Ly\) the equivalent degrees of freedom are \(\mathrm{tr}(L)\), and the \(j\)-th column of \(L\) is obtained by applying the smoother to the \(j\)-th unit vector.
y.hat=smoother(d$x,d$x,d$y,h=hfix)
zeri=rep(0,length=length(d$y))
columnj=function(j,h) {
  zeri[j]=1                    # j-th unit vector
  smoother(d$x,d$x,zeri,h=h)   # its smoothed values form the j-th column of L
}
columnj(1,hfix)
## [1] 1.026294e-01 9.230404e-02 8.239528e-02 7.295366e-02 6.402809e-02
## [6] 5.566407e-02 4.790182e-02 4.077420e-02 3.430484e-02 2.850641e-02
## [11] 2.337938e-02 1.891137e-02 1.507733e-02 1.184044e-02 9.153937e-03
## [16] 6.963447e-03 5.209838e-03 3.832150e-03 2.770389e-03 1.967912e-03
## [21] 1.373241e-03 9.412234e-04 6.335631e-04 4.187913e-04 2.718232e-04
## [26] 1.732350e-04 1.084002e-04 6.659786e-05 4.017160e-05 2.379043e-05
## [31] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [36] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [41] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [46] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [51] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [56] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [61] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [66] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [71] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [76] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [81] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [86] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [91] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [96] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [101] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [106] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [111] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [116] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [121] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [126] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [131] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [136] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [141] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [146] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [151] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [156] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [161] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [166] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [171] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [176] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [181] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [186] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [191] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [196] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
L=sapply(1:n,columnj,h=hfix)
##just a check that it works
y.hat1=L%*%d$y
plot(y.hat1,y.hat)
identical(y.hat1,y.hat)
## [1] FALSE
max(abs(y.hat1-y.hat))
## [1] 7.771561e-16
sum(diag(L))   # equivalent degrees of freedom = tr(L)
## [1] 11.28414
We can then compute the equivalent degrees of freedom for each bandwidth
computeDOF=function(h){
L=sapply(1:n,columnj,h=h)
sum(diag(L))
}
computeDOF(hfix)
## [1] 11.28414
dof.vec=sapply(h.vec,computeDOF)
matplot(dof.vec,cbind(bias.overall,variance.overall,bias.overall+variance.overall),col=c("blue","red","black"),lwd=c(1,1,2),type="l",lty=c(1,1,2),ylab="")
abline(v=dof.vec[which.min(mse.overall)])
rmse1=rmse-min(rmse)+min(bias.overall+variance.overall)
points(dof.vec,rmse1)
points(dof.vec[which.min(rmse1)],rmse1[which.min(rmse1)],pch=20,col="red")