--- title: "Health-pollution model" output: html_document: default pdf_document: default date: "2023-11-15" --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(fig.width=6, fig.height=5) ``` # Epidemiology: relationship between pollutants and mortality It is widely believed that high pollutant concentrations may lead to higher mortality. In analyzing such a relationship, it must be kept in mind that mortality is also affected by other factors such as, for example, the temperature. # Data on air pollution and death in Chicago The following variables were observed daily from 1/1/1987 to 31/12/2000 - `death` total deaths (per day). - `pm10medianmedian` particles in $2.5-10$ per cubic m - `pm25medianmedian` particles $<2.5$ mg per cubic m (more dangerous). - `o3medianOzone` in parts per billion - `so2median` Median Sulpher dioxide measurement - `time` time in days - `tmpd` temperature in fahrenheit It is convenient to define a date variable and add it to a data frame, also, we convert temperatures from fahrenheit to Celsius ```{r} library(gamair) data(chicago) head(chicago) chicago$data=seq(as.Date("1987/1/1"), as.Date("2000/12/31"), "days") chicago$tmpd=(chicago$tmpd-32)*5/9 ``` We can then have a first look at the data ```{r} plot(chicago$data,chicago$death,pch=".") ``` ```{r} par(mfrow=c(2,2)) plot(chicago$pm10median,chicago$death,pch=".") plot(chicago$pm25median,chicago$death,pch=".") plot(chicago$so2median,chicago$death,pch=".") plot(chicago$o3median,chicago$death,pch=".") plot(chicago$tmpd,chicago$death,pch=".") ``` # Modeling A very first model ```{r} library(mgcv) fit1=gam(death~s(time,bs="cr",k=200)+ pm10median+so2median+o3median+ tmpd, data=chicago,family=poisson) plot(fit1,pages=1) summary(fit1) ``` ```{r} gam.check(fit1) ``` We notice there are some very high residuals (https://en.wikipedia.org/wiki/1995_Chicago_heat_wave) ```{r} fit1$model[sort.list(resid(fit1),decreasing=TRUE)[1:4],] chicago[3111:3120,] par(mfrow=c(1,2)) plot(fit1$model$tmpd,resid(fit1)) plot(fit1$model$o3median,resid(fit1)) ``` ```{r} par(mfrow=c(1,2)) rsd <- residuals(fit1,type="deviance") plot(fit1$model$tmpd,rsd) fit.res=gam(rsd~s(tmpd,k=20)-1,data=fit1$model,select=TRUE) plot(fit.res,residuals=FALSE) ``` ```{r} fit2=gam(death~s(time,bs="cr",k=200)+ s(pm10median)+s(so2median)+s(o3median)+ s(tmpd), data=chicago,family=poisson) plot(fit2,pages=1) summary(fit2) ``` There is some evidence of a non linear effect of `o3median` and a relatively clear evidence of a non linear effect of temperature `tmpd`, thus a reasonable model may be ```{r} fit3=gam(death~s(time,bs="cr",k=200)+ pm10median+so2median+s(o3median,k=20)+ s(tmpd,k=40), data=chicago,family=poisson) plot(fit3,pages=1) summary(fit3) ``` ```{r} gam.check(fit3) ``` ## Lagged data The effect of pollution and temperature on health may be lagged, so it may be relevant to use lagged concentrations/temperature as covariates, we then compute 3-days means \[ x_{i} = \frac{1}{3}(x_{i-2}+x_{i-1}+x_{i}),\;\;i=3,\ldots,n \] ```{r} meanlag3=function(x){ n=length(x) c(NA,NA,NA,(x[1:(n-3)]+x[2:(n-2)]+x[3:(n-1)]+x[4:n])/4) } chicago$pm10medianLag=meanlag3(chicago$pm10median) chicago$pm25medianLag=meanlag3(chicago$pm25median) chicago$o3medianLag=meanlag3(chicago$o3median) chicago$so2medianLag=meanlag3(chicago$so2median) chicago$tmpdLag=meanlag3(chicago$tmpd) ``` ```{r} par(mfrow=c(2,2)) plot(chicago$pm10medianLag,chicago$death,pch=".") plot(chicago$so2medianLag,chicago$death,pch=".") plot(chicago$o3medianLag,chicago$death,pch=".") plot(chicago$tmpdLag,chicago$death,pch=".") ``` We then consider the alternative model ```{r} fitLag=gam(death~s(time,bs="cr",k=200)+ pm10medianLag+so2medianLag+s(o3medianLag,k=20)+ s(tmpdLag,k=40), data=chicago,family=poisson) ``` ```{r} plot(fitLag,pages=1) ``` ```{r} summary(fitLag) gam.check(fitLag) ``` ## Alternative time model ```{r} chicago$yearday=as.POSIXlt(chicago$data)$yday chicago$weekday=as.POSIXlt(chicago$data)$wday chicago$year=as.POSIXlt(chicago$data)$year ``` ```{r} par(mfrow=c(1,3)) plot(chicago$yearday,chicago$death,pch=".") plot(as.factor(chicago$weekday),chicago$death,pch=".") plot(as.factor(chicago$year),chicago$death,pch=".") ``` ```{r} fitYday=gam(death~s(yearday,bs="cr",k=40)+ so2median+pm10median+ s(o3median,k=20)+s(tmpd,k=40), data=chicago,family=poisson) plot(fitYday,pages=1) summary(fitYday) ``` ```{r} fitYday2=gam(death~s(yearday,bs="cr",k=40)+ so2median+pm10median+ s(o3median,k=20)+s(tmpd,k=40)+as.factor(year), data=chicago,family=poisson) plot(fitYday2,pages=1) summary(fitYday2) ``` ```{r} fitLagYday=gam(death~s(yearday,bs="cr",k=40)+ pm10medianLag+so2medianLag+s(o3medianLag,k=20)+ s(tmpdLag,k=40), data=chicago,family=poisson) plot(fitLagYday,pages=1) summary(fitLagYday) ``` ```{r} fitLagYday2=gam(death~s(yearday,bs="cr",k=40)+ pm10medianLag+so2medianLag+s(o3medianLag,k=20)+ s(tmpdLag,k=40)+as.factor(year), data=chicago,family=poisson) plot(fitLagYday2,pages=1) summary(fitLagYday2) ``` Which one is most appropriate? Let's have at look at temperatures. ```{r} par(mfrow=c(1,2)) plot(predict(fitLag),predict(fitLagYday2)) abline(0,1) plot(residuals(fitLag),residuals(fitLagYday2)) abline(0,1) ``` ```{r} par(mfrow=c(1,2)) plot(fitLag,select=3) plot(fitLagYday2,select=3) ``` ```{r} nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100)) pr1=predict(fitLag,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE) pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE) par(mfrow=c(1,1)) matr.pred=cbind(pr1$fit,pr2$fit, pr1$fit-1.96*pr1$se.fit,pr2$fit-1.96*pr2$se.fit, pr1$fit+1.96*pr1$se.fit,pr2$fit+1.96*pr2$se.fit) matplot(nd$tmpdLag,matr.pred,type="l",col=c("black","red"),lwd=c(2,2,1,1,1,1),lty=c(1,1,2,2,2,2)) ``` ```{r} sum(chicago$death) sum(predict(fitLag,newdata=chicago,type="response"),na.rm=TRUE) chicago2=chicago chicago2$tmpdLag=chicago$tmpdLag+1 sum(predict(fitLag,newdata=chicago2,type="response"),na.rm=TRUE) sum(predict(fitLagYday2,newdata=chicago2,type="response"),na.rm=TRUE) ``` ## Alternative exploration: summer and winter model. Summer versus winter behaviour (especially related to temperature, for pollutants there are likely not enough data) ```{r} chicago.summer=subset(chicago,(yearday>150) & (yearday<244)) fitLag.summer=gam(death~s(time,bs="cr",k=200)+ pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + s(tmpdLag,k=40)+as.factor(year), data=chicago.summer,family=poisson) plot(fitLag.summer,pages=1) summary(fitLag.summer) ``` ```{r} chicago.summer=subset(chicago,(yearday>150) & (yearday<244)) chicago.summer$time2=1:nrow(chicago.summer) ##provided it is ordered! fitLag.summer=gam(death~s(time2,bs="cr",k=200)+ pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + s(tmpdLag,k=40)+as.factor(year), data=chicago.summer,family=poisson) plot(fitLag.summer,pages=1) summary(fitLag.summer) ``` ```{r} fitLagYday2.summer=gam(death~s(yearday,bs="cr",k=40)+ pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + s(tmpdLag,k=40)+as.factor(year), data=chicago.summer,family=poisson) plot(fitLagYday2.summer,pages=1) summary(fitLagYday2.summer) ``` ```{r} chicago.winter=subset(chicago,(yearday>334) | (yearday<60)) chicago.winter$yearday2=chicago.winter$yearday chicago.winter$yearday2[chicago.winter$yearday2>334]=chicago.winter$yearday2[chicago.winter$yearday2>334]-366 fitLagYday2.winter=gam(death~s(yearday2,bs="cr",k=40)+ pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + s(tmpdLag,k=40)+as.factor(year), data=chicago.winter,family=poisson) plot(fitLagYday2.winter,pages=1) summary(fitLagYday2.winter) ``` ```{r} par(mfrow=c(1,3)) plot(fitLagYday2,select=3) plot(fitLagYday2.winter,select=3) plot(fitLagYday2.summer,select=3) #par(mfrow=c(1,3)) #plot(fitLagYday2,type="terms",select="s(tmpdLag)") #plot(fitLagYday2.winter,type="terms",,select="s(tmpdLag)") #plot(fitLagYday2.summer,type="terms",,select="s(tmpdLag)") ``` ```{r} nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100)) pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE) nd.summer=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago.summer$tmpdLag,na.rm=TRUE),max(chicago.summer$tmpdLag,na.rm=TRUE),length=100)) pr2.summer=predict(fitLagYday2.summer,newdata=nd.summer,type="terms",term="s(tmpdLag)",se.fit=TRUE) nd.winter=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday2=1,year=89,tmpdLag=seq(min(chicago.winter$tmpdLag,na.rm=TRUE),max(chicago.winter$tmpdLag,na.rm=TRUE),length=100)) pr2.winter=predict(fitLagYday2.winter,newdata=nd.winter,type="terms",term="s(tmpdLag)",se.fit=TRUE) par(mfrow=c(1,1)) matr.pred=cbind(pr2$fit,pr2$fit-1.96*pr2$se.fit,pr2$fit+1.96*pr2$se.fit) matr.pred.winter=cbind(pr2.winter$fit,pr2.winter$fit-1.96*pr2.winter$se.fit,pr2.winter$fit+1.96*pr2.winter$se.fit) matr.pred.summer=cbind(pr2.summer$fit,pr2.summer$fit-1.96*pr2.summer$se.fit,pr2.summer$fit+1.96*pr2.summer$se.fit) matplot(nd$tmpdLag,matr.pred,type="l",col=c("black"),lwd=c(2,1,1),lty=c(1,2,2), ylim=range(matr.pred,matr.pred.winter,matr.pred.summer)) matlines(nd.winter$tmpdLag,matr.pred.winter,type="l",col=c("blue"),lwd=c(2,1,1),lty=c(1,2,2)) matlines(nd.summer$tmpdLag,matr.pred.summer,type="l",col=c("red"),lwd=c(2,1,1),lty=c(1,2,2)) ``` ```{r} fitLagYday2Biv=gam(death~s(yearday,tmpdLag)+ pm10medianLag+so2medianLag+s(o3medianLag,k=20)+as.factor(year), data=chicago,family=poisson) plot(fitLagYday2Biv) vis.gam(fitLagYday2Bivc,c("yearday","tmpdLag")) ``` ## Bivariate spline ```{r} fitLagBiv=gam(death~s(time,bs="cr",k=200)+ so2medianLag+pm10medianLag+ s(o3medianLag,tmpdLag,k=40), data=chicago,family=poisson) fitLagYdayBiv=gam(death~s(yearday,bs="cr",k=40)+ so2medianLag+pm10medianLag+ s(o3medianLag,tmpdLag,k=40), data=chicago,family=poisson) ``` ```{r} vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07) vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07) points(fitLagBiv$model$o3medianLag,fitLagBiv$model$tmpdLag) ``` ```{r} vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07) vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07) points(fitLagYdayBiv$model$o3medianLag,fitLagBiv$model$tmpdLag) ``` ## Questions - which model is best? Perhaps via Cross Validation. - Additional covariates: calendar year (for the model with `yday`)