Epidemiology: relationship between pollutants and mortality

It is widely believed that high pollutant concentrations may lead to higher mortality.

In analyzing such a relationship, it must be kept in mind that mortality is also affected by other factors such as, for example, the temperature.

Data on air pollution and death in Chicago

The following variables were observed daily from 1/1/1987 to 31/12/2000

It is convenient to define a date variable and add it to a data frame, also, we convert temperatures from fahrenheit to Celsius

library(gamair)
data(chicago)
head(chicago)
##   death pm10median pm25median  o3median  so2median    time tmpd
## 1   130 -7.4335443         NA -19.59234  1.9280426 -2556.5 31.5
## 2   150         NA         NA -19.03861 -0.9855631 -2555.5 33.0
## 3   101 -0.8265306         NA -20.21734 -1.8914161 -2554.5 33.0
## 4   135  5.5664557         NA -19.67567  6.1393413 -2553.5 29.0
## 5   126         NA         NA -19.21734  2.2784649 -2552.5 32.0
## 6   130  6.5664557         NA -17.63400  9.8585839 -2551.5 40.0
chicago$data=seq(as.Date("1987/1/1"), as.Date("2000/12/31"), "days")
chicago$tmpd=(chicago$tmpd-32)*5/9

We can then have a first look at the data

plot(chicago$data,chicago$death,pch=".")

par(mfrow=c(2,2))
plot(chicago$pm10median,chicago$death,pch=".")
plot(chicago$so2median,chicago$death,pch=".")
plot(chicago$o3median,chicago$death,pch=".")
plot(chicago$tmpd,chicago$death,pch=".")

Modeling

A very first model

library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
fit1=gam(death~s(time,bs="cr",k=200)+
           pm10median+so2median+o3median+
           tmpd,
         data=chicago,family=poisson)
plot(fit1,pages=1)

summary(fit1)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10median + so2median + 
##     o3median + tmpd
## 
## Parametric coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) 4.7304559  0.0036477 1296.825  < 2e-16 ***
## pm10median  0.0004321  0.0000889    4.861 1.17e-06 ***
## so2median   0.0008184  0.0005523    1.482    0.138    
## o3median    0.0009306  0.0002032    4.581 4.63e-06 ***
## tmpd        0.0016772  0.0003211    5.223 1.76e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df Chi.sq p-value    
## s(time) 168.6    188   2090  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.351   Deviance explained = 38.7%
## UBRE = 0.25467  Scale est. = 1         n = 4841
gam.check(fit1)

## 
## Method: UBRE   Optimizer: outer newton
## full convergence after 3 iterations.
## Gradient range [3.514596e-08,3.514596e-08]
## (score 0.2546689 & scale 1).
## Hessian positive definite, eigenvalue range [0.004247567,0.004247567].
## Model rank =  204 / 204 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k' edf k-index p-value    
## s(time) 199 169    0.92  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We notice there are some very high residuals (https://en.wikipedia.org/wiki/1995_Chicago_heat_wave)

fit1$model[sort.list(resid(fit1),decreasing=TRUE)[1:4],]
##      death pm10median  so2median  o3median     tmpd  time
## 3118   411  14.798103  0.6976599 28.115091 30.00000 560.5
## 3119   287  -8.333333 -0.9330126 21.115009 28.33333 561.5
## 3120   228  -3.232732 -2.3158882  5.649732 25.83333 562.5
## 3117   226  20.941667  2.2685856 29.703545 33.05556 559.5
chicago[3111:3120,]
##      death pm10median pm25median  o3median  so2median  time     tmpd       data
## 3111   112   3.807231         NA  4.206649  2.5440348 553.5 21.38889 1995-07-08
## 3112    97   0.732566         NA 18.518625 -1.0277339 554.5 22.77778 1995-07-09
## 3113   122  16.211505         NA  2.523175  2.7234298 555.5 23.05556 1995-07-10
## 3114   119  19.716615         NA 18.141378 -0.6733466 556.5 26.11111 1995-07-11
## 3115   116  41.248516         NA 24.160425 -1.0226071 557.5 29.16667 1995-07-12
## 3116   121  56.363682         NA 36.745545  1.2718564 558.5 33.33333 1995-07-13
## 3117   226  20.941667         NA 29.703545  2.2685856 559.5 33.05556 1995-07-14
## 3118   411  14.798103         NA 28.115091  0.6976599 560.5 30.00000 1995-07-15
## 3119   287  -8.333333         NA 21.115009 -0.9330126 561.5 28.33333 1995-07-16
## 3120   228  -3.232732         NA  5.649732 -2.3158882 562.5 25.83333 1995-07-17
par(mfrow=c(1,2))
plot(fit1$model$tmpd,resid(fit1))
plot(fit1$model$o3median,resid(fit1))

par(mfrow=c(1,2))
rsd <- residuals(fit1,type="deviance")
plot(fit1$model$tmpd,rsd)
fit.res=gam(rsd~s(tmpd,k=20)-1,data=fit1$model,select=TRUE)
plot(fit.res,residuals=FALSE)

fit2=gam(death~s(time,bs="cr",k=200)+
           s(pm10median)+s(so2median)+s(o3median)+
           s(tmpd),
         data=chicago,family=poisson)
plot(fit2,pages=1)

summary(fit2)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time, bs = "cr", k = 200) + s(pm10median) + s(so2median) + 
##     s(o3median) + s(tmpd)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 4.744649   0.001342    3534   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf  Ref.df   Chi.sq p-value    
## s(time)       166.555 186.622 1828.269  <2e-16 ***
## s(pm10median)   1.001   1.002    5.658  0.0174 *  
## s(so2median)    1.000   1.001    1.789  0.1811    
## s(o3median)     8.023   8.752   17.177  0.0236 *  
## s(tmpd)         7.992   8.685  100.199  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.361   Deviance explained = 39.7%
## UBRE = 0.23928  Scale est. = 1         n = 4841

There is some evidence of a non linear effect of o3median and a relatively clear evidence of a non linear effect of temperature tmpd, thus a reasonable model may be

fit3=gam(death~s(time,bs="cr",k=200)+
           pm10median+so2median+s(o3median,k=20)+
           s(tmpd,k=40),
         data=chicago,family=poisson)
plot(fit3,pages=1)

summary(fit3)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10median + so2median + 
##     s(o3median, k = 20) + s(tmpd, k = 40)
## 
## Parametric coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) 4.745e+00  1.386e-03 3423.588   <2e-16 ***
## pm10median  2.284e-04  9.526e-05    2.398   0.0165 *  
## so2median   7.575e-04  5.605e-04    1.352   0.1765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df  Chi.sq p-value    
## s(time)     166.48 186.55 1818.02  <2e-16 ***
## s(o3median)  10.18  12.51   22.28  0.0434 *  
## s(tmpd)      12.91  16.13  108.16  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.363   Deviance explained = 39.9%
## UBRE = 0.23846  Scale est. = 1         n = 4841
gam.check(fit3)

## 
## Method: UBRE   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [1.838621e-10,1.942568e-09]
## (score 0.238464 & scale 1).
## Hessian positive definite, eigenvalue range [0.0003795557,0.004281439].
## Model rank =  260 / 260 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                k'   edf k-index p-value    
## s(time)     199.0 166.5    0.94  <2e-16 ***
## s(o3median)  19.0  10.2    1.00    0.41    
## s(tmpd)      39.0  12.9    1.03    0.99    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lagged data

The effect of pollution and temperature on health may be lagged, so it may be relevant to use lagged concentrations/temperature as covariates, we then compute 3-days means \[ x_{i} = \frac{1}{3}(x_{i-2}+x_{i-1}+x_{i}),\;\;i=3,\ldots,n \]

meanlag3=function(x){
  n=length(x)
  c(NA,NA,NA,(x[1:(n-3)]+x[2:(n-2)]+x[3:(n-1)]+x[4:n])/4)
}
chicago$pm10medianLag=meanlag3(chicago$pm10median)
chicago$pm25medianLag=meanlag3(chicago$pm25median)
chicago$o3medianLag=meanlag3(chicago$o3median)
chicago$so2medianLag=meanlag3(chicago$so2median)
chicago$tmpdLag=meanlag3(chicago$tmpd)
par(mfrow=c(2,2))
plot(chicago$pm10medianLag,chicago$death,pch=".")
plot(chicago$so2medianLag,chicago$death,pch=".")
plot(chicago$o3medianLag,chicago$death,pch=".")
plot(chicago$tmpdLag,chicago$death,pch=".")

We then consider the alternative model

fitLag=gam(death~s(time,bs="cr",k=200)+
           pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
           s(tmpdLag,k=40),
         data=chicago,family=poisson)
plot(fitLag,pages=1)

summary(fitLag)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40)
## 
## Parametric coefficients:
##                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)   4.7435162  0.0015644 3032.088  < 2e-16 ***
## pm10medianLag 0.0006183  0.0001761    3.511 0.000446 ***
## so2medianLag  0.0021204  0.0010432    2.033 0.042086 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf  Ref.df  Chi.sq  p-value    
## s(time)        147.178 170.479 1146.90  < 2e-16 ***
## s(o3medianLag)   1.003   1.005   11.38 0.000762 ***
## s(tmpdLag)      28.746  33.451  685.21  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.451   Deviance explained = 46.1%
## UBRE = 0.15416  Scale est. = 1         n = 4325
gam.check(fitLag)

## 
## Method: UBRE   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-4.146838e-07,-1.051935e-07]
## (score 0.154161 & scale 1).
## Hessian positive definite, eigenvalue range [4.144902e-07,0.004488623].
## Model rank =  260 / 260 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'   edf k-index p-value
## s(time)        199.0 147.2    1.02    0.86
## s(o3medianLag)  19.0   1.0    1.01    0.78
## s(tmpdLag)      39.0  28.8    1.02    0.85

Alternative time model

chicago$yearday=as.POSIXlt(chicago$data)$yday
chicago$weekday=as.POSIXlt(chicago$data)$wday
chicago$year=as.POSIXlt(chicago$data)$year
par(mfrow=c(1,3))
plot(chicago$yearday,chicago$death,pch=".")
plot(as.factor(chicago$weekday),chicago$death,pch=".")
plot(as.factor(chicago$year),chicago$death,pch=".")

fitYday=gam(death~s(yearday,bs="cr",k=40)+
           so2median+pm10median+
           s(o3median,k=20)+s(tmpd,k=40),
         data=chicago,family=poisson)
plot(fitYday,pages=1)

summary(fitYday)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + so2median + pm10median + 
##     s(o3median, k = 20) + s(tmpd, k = 40)
## 
## Parametric coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)  4.746e+00  1.382e-03 3434.743  < 2e-16 ***
## so2median   -3.389e-04  5.341e-04   -0.634 0.525835    
## pm10median   3.278e-04  9.133e-05    3.590 0.000331 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value    
## s(yearday)  25.15  30.03  553.3 < 2e-16 ***
## s(o3median) 14.84  17.10   36.3 0.00368 ** 
## s(tmpd)     17.46  21.53  174.4 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.242   Deviance explained = 26.2%
## UBRE = 0.44885  Scale est. = 1         n = 4841
fitYday2=gam(death~s(yearday,bs="cr",k=40)+
           so2median+pm10median+
           s(o3median,k=20)+s(tmpd,k=40)+as.factor(year),
         data=chicago,family=poisson)
plot(fitYday2,pages=1)

summary(fitYday2)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + so2median + pm10median + 
##     s(o3median, k = 20) + s(tmpd, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4.754e+00  5.368e-03 885.574  < 2e-16 ***
## so2median          -3.917e-04  5.352e-04  -0.732 0.464225    
## pm10median          3.502e-04  9.176e-05   3.817 0.000135 ***
## as.factor(year)88   1.782e-02  7.520e-03   2.370 0.017784 *  
## as.factor(year)89   1.698e-02  7.692e-03   2.208 0.027235 *  
## as.factor(year)90  -5.274e-03  7.356e-03  -0.717 0.473441    
## as.factor(year)91   4.609e-03  7.352e-03   0.627 0.530741    
## as.factor(year)92  -1.665e-02  7.413e-03  -2.246 0.024733 *  
## as.factor(year)93   2.647e-02  7.315e-03   3.619 0.000296 ***
## as.factor(year)94   1.247e-02  7.328e-03   1.702 0.088682 .  
## as.factor(year)95   2.753e-02  7.201e-03   3.823 0.000132 ***
## as.factor(year)96  -1.617e-02  7.348e-03  -2.201 0.027744 *  
## as.factor(year)97  -4.844e-02  7.374e-03  -6.569 5.08e-11 ***
## as.factor(year)98  -5.636e-02  7.333e-03  -7.685 1.53e-14 ***
## as.factor(year)99  -1.706e-02  7.287e-03  -2.341 0.019208 *  
## as.factor(year)100 -5.130e-02  7.391e-03  -6.942 3.88e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value    
## s(yearday)  24.98  29.86 590.78  <2e-16 ***
## s(o3median) 11.70  14.17  28.69  0.0126 *  
## s(tmpd)     16.35  20.23 130.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.281   Deviance explained = 30.5%
## UBRE = 0.36933  Scale est. = 1         n = 4841
fitLagYday=gam(death~s(yearday,bs="cr",k=40)+
           pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
           s(tmpdLag,k=40),
         data=chicago,family=poisson)
plot(fitLagYday,pages=1)

summary(fitLagYday)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40)
## 
## Parametric coefficients:
##                 Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)    4.7421413  0.0015359 3087.495  < 2e-16 ***
## pm10medianLag  0.0008678  0.0001604    5.409 6.35e-08 ***
## so2medianLag  -0.0014050  0.0009221   -1.524    0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df Chi.sq p-value    
## s(yearday)     24.263 29.117  217.1 < 2e-16 ***
## s(o3medianLag)  2.867  3.728   17.3 0.00186 ** 
## s(tmpdLag)     30.090 34.606  860.9 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.352   Deviance explained = 34.8%
## UBRE = 0.32155  Scale est. = 1         n = 4325
fitLagYday2=gam(death~s(yearday,bs="cr",k=40)+
           pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
           s(tmpdLag,k=40)+as.factor(year),
         data=chicago,family=poisson)
plot(fitLagYday2,pages=1)

summary(fitLagYday2)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4.7606143  0.0062134 766.179  < 2e-16 ***
## pm10medianLag       0.0009592  0.0001613   5.948 2.72e-09 ***
## so2medianLag       -0.0012851  0.0009275  -1.386 0.165876    
## as.factor(year)88  -0.0005706  0.0095104  -0.060 0.952157    
## as.factor(year)89   0.0059884  0.0096690   0.619 0.535693    
## as.factor(year)90  -0.0108492  0.0081832  -1.326 0.184910    
## as.factor(year)91   0.0018837  0.0083722   0.225 0.821986    
## as.factor(year)92  -0.0299982  0.0081961  -3.660 0.000252 ***
## as.factor(year)93   0.0144891  0.0080285   1.805 0.071121 .  
## as.factor(year)94   0.0028798  0.0080824   0.356 0.721613    
## as.factor(year)95   0.0126773  0.0078474   1.615 0.106209    
## as.factor(year)96  -0.0297703  0.0079878  -3.727 0.000194 ***
## as.factor(year)97  -0.0603151  0.0080442  -7.498 6.48e-14 ***
## as.factor(year)98  -0.0535246  0.0080103  -6.682 2.36e-11 ***
## as.factor(year)99  -0.0215681  0.0079537  -2.712 0.006694 ** 
## as.factor(year)100 -0.0570526  0.0080773  -7.063 1.63e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df Chi.sq  p-value    
## s(yearday)     23.30 28.093 236.30  < 2e-16 ***
## s(o3medianLag)  1.59  2.025  15.95 0.000342 ***
## s(tmpdLag)     30.01 34.542 832.42  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.385   Deviance explained = 38.5%
## UBRE = 0.25308  Scale est. = 1         n = 4325

Which one is most appropriate? Let’s have at look at temperatures.

par(mfrow=c(1,2))
plot(predict(fitLag),predict(fitLagYday2))
abline(0,1)
plot(residuals(fitLag),residuals(fitLagYday2))
abline(0,1)

par(mfrow=c(1,2))
plot(fitLag,select=3)
plot(fitLagYday2,select=3)

nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100))
pr1=predict(fitLag,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)
pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)
par(mfrow=c(1,1))
matr.pred=cbind(pr1$fit,pr2$fit,
                pr1$fit-1.96*pr1$se.fit,pr2$fit-1.96*pr2$se.fit,
                pr1$fit+1.96*pr1$se.fit,pr2$fit+1.96*pr2$se.fit)
matplot(nd$tmpdLag,matr.pred,type="l",col=c("black","red"),lwd=c(2,2,1,1,1,1),lty=c(1,1,2,2,2,2))

sum(chicago$death)
## [1] 590252
sum(predict(fitLag,newdata=chicago,type="response"),na.rm=TRUE)
## [1] 497805
chicago2=chicago
chicago2$tmpdLag=chicago$tmpdLag+1
sum(predict(fitLag,newdata=chicago2,type="response"),na.rm=TRUE)
## [1] 498691.4
sum(predict(fitLagYday2,newdata=chicago2,type="response"),na.rm=TRUE)
## [1] 498172

Alternative exploration: summer and winter model.

Summer versus winter behaviour (especially related to temperature, for pollutants there are likely not enough data)

chicago.summer=subset(chicago,(yearday>150) & (yearday<244))
fitLag.summer=gam(death~s(time,bs="cr",k=200)+
                    pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + 
                    s(tmpdLag,k=40)+as.factor(year),
               data=chicago.summer,family=poisson)
plot(fitLag.summer,pages=1)

summary(fitLag.summer)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)         7.3665502 11.8939072   0.619    0.536
## pm10medianLag       0.0004324  0.0004505   0.960    0.337
## so2medianLag        0.0001718  0.0034530   0.050    0.960
## as.factor(year)88  -0.9909888  5.2978044  -0.187    0.852
## as.factor(year)89  -3.0809707  7.9609623  -0.387    0.699
## as.factor(year)90  -3.5231714  9.4742458  -0.372    0.710
## as.factor(year)91  -3.1206241 10.8517321  -0.288    0.774
## as.factor(year)92  -1.9791295 12.0408914  -0.164    0.869
## as.factor(year)93  -3.8073017 13.1451271  -0.290    0.772
## as.factor(year)94  -3.3476138 14.1352308  -0.237    0.813
## as.factor(year)95  -1.6493649 15.0650236  -0.109    0.913
## as.factor(year)96  -2.5921615 15.9479418  -0.163    0.871
## as.factor(year)97  -3.1252909 16.7825334  -0.186    0.852
## as.factor(year)98  -3.1622694 17.5739907  -0.180    0.857
## as.factor(year)99  -3.0409563 18.3274894  -0.166    0.868
## as.factor(year)100 -3.6995166 19.0517829  -0.194    0.846
## 
## Approximate significance of smooth terms:
##                   edf Ref.df  Chi.sq p-value    
## s(time)        78.170 95.586 136.142 0.00434 ** 
## s(o3medianLag)  1.001  1.001   5.552 0.01853 *  
## s(tmpdLag)     10.824 13.493 383.761 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.514   Deviance explained = 50.7%
## UBRE = 0.26025  Scale est. = 1         n = 1080
chicago.summer=subset(chicago,(yearday>150) & (yearday<244))
chicago.summer$time2=1:nrow(chicago.summer) ##provided it is ordered!
fitLag.summer=gam(death~s(time2,bs="cr",k=200)+
                    pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + 
                    s(tmpdLag,k=40)+as.factor(year),
               data=chicago.summer,family=poisson)
plot(fitLag.summer,pages=1)

summary(fitLag.summer)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(time2, bs = "cr", k = 200) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4.906e+00  3.487e-01  14.067   <2e-16 ***
## pm10medianLag       4.459e-04  4.488e-04   0.994    0.320    
## so2medianLag       -1.795e-05  3.444e-03  -0.005    0.996    
## as.factor(year)88  -9.382e-02  1.086e-01  -0.864    0.388    
## as.factor(year)89  -4.596e-01  3.428e-01  -1.341    0.180    
## as.factor(year)90  -4.080e-01  3.519e-01  -1.159    0.246    
## as.factor(year)91  -3.954e-01  3.634e-01  -1.088    0.277    
## as.factor(year)92  -2.903e-01  3.727e-01  -0.779    0.436    
## as.factor(year)93  -2.997e-01  3.834e-01  -0.782    0.434    
## as.factor(year)94  -2.901e-01  3.935e-01  -0.737    0.461    
## as.factor(year)95  -1.586e-01  4.019e-01  -0.395    0.693    
## as.factor(year)96  -2.284e-01  4.152e-01  -0.550    0.582    
## as.factor(year)97  -1.894e-01  4.261e-01  -0.445    0.657    
## as.factor(year)98  -1.894e-01  4.376e-01  -0.433    0.665    
## as.factor(year)99  -1.887e-01  4.449e-01  -0.424    0.672    
## as.factor(year)100 -1.973e-01  4.548e-01  -0.434    0.664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df  Chi.sq p-value    
## s(time2)       77.24 95.536 139.702 0.00209 ** 
## s(o3medianLag)  1.00  1.001   5.367 0.02056 *  
## s(tmpdLag)     10.83 13.500 382.608 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.515   Deviance explained = 50.7%
## UBRE = 0.25824  Scale est. = 1         n = 1080
fitLagYday2.summer=gam(death~s(yearday,bs="cr",k=40)+
                 pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + 
                    s(tmpdLag,k=40)+as.factor(year),
               data=chicago.summer,family=poisson)
plot(fitLagYday2.summer,pages=1)

summary(fitLagYday2.summer)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4.7090660  0.0115313 408.372  < 2e-16 ***
## pm10medianLag       0.0005376  0.0003857   1.394  0.16333    
## so2medianLag        0.0013250  0.0026751   0.495  0.62040    
## as.factor(year)88  -0.0620248  0.0201959  -3.071  0.00213 ** 
## as.factor(year)89  -0.0167532  0.0195758  -0.856  0.39210    
## as.factor(year)90  -0.0156428  0.0153908  -1.016  0.30945    
## as.factor(year)91  -0.0332092  0.0160556  -2.068  0.03860 *  
## as.factor(year)92  -0.0381858  0.0160413  -2.380  0.01729 *  
## as.factor(year)93   0.0167911  0.0155992   1.076  0.28175    
## as.factor(year)94  -0.0056347  0.0151205  -0.373  0.70941    
## as.factor(year)95   0.0124583  0.0145577   0.856  0.39212    
## as.factor(year)96  -0.0600682  0.0152927  -3.928 8.57e-05 ***
## as.factor(year)97  -0.0738696  0.0152273  -4.851 1.23e-06 ***
## as.factor(year)98  -0.0686377  0.0148950  -4.608 4.06e-06 ***
## as.factor(year)99  -0.0741995  0.0148715  -4.989 6.06e-07 ***
## as.factor(year)100 -0.0752074  0.0154491  -4.868 1.13e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df  Chi.sq p-value    
## s(yearday)     17.566 21.606  33.360 0.05311 .  
## s(o3medianLag)  1.001  1.002   9.913 0.00165 ** 
## s(tmpdLag)     12.322 15.318 633.628 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.475   Deviance explained = 45.1%
## UBRE = 0.27295  Scale est. = 1         n = 1080
chicago.winter=subset(chicago,(yearday>334) | (yearday<60))
chicago.winter$yearday2=chicago.winter$yearday
chicago.winter$yearday2[chicago.winter$yearday2>334]=chicago.winter$yearday2[chicago.winter$yearday2>334]-366
fitLagYday2.winter=gam(death~s(yearday2,bs="cr",k=40)+
                 pm10medianLag + so2medianLag + s(o3medianLag, k = 20) + 
                    s(tmpdLag,k=40)+as.factor(year),
               data=chicago.winter,family=poisson)
plot(fitLagYday2.winter,pages=1)

summary(fitLagYday2.winter)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## death ~ s(yearday2, bs = "cr", k = 40) + pm10medianLag + so2medianLag + 
##     s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
## 
## Parametric coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         4.8067503  0.0143517 334.926  < 2e-16 ***
## pm10medianLag       0.0000729  0.0004443   0.164  0.86967    
## so2medianLag       -0.0009371  0.0015493  -0.605  0.54526    
## as.factor(year)88   0.0067639  0.0188049   0.360  0.71908    
## as.factor(year)89   0.0644631  0.0207877   3.101  0.00193 ** 
## as.factor(year)90   0.0168657  0.0179773   0.938  0.34816    
## as.factor(year)91   0.0539994  0.0176569   3.058  0.00223 ** 
## as.factor(year)92   0.0065195  0.0173288   0.376  0.70675    
## as.factor(year)93  -0.0015626  0.0179901  -0.087  0.93078    
## as.factor(year)94   0.0541696  0.0180292   3.005  0.00266 ** 
## as.factor(year)95   0.0810445  0.0171135   4.736 2.18e-06 ***
## as.factor(year)96   0.0101169  0.0172220   0.587  0.55691    
## as.factor(year)97  -0.0366707  0.0175437  -2.090  0.03660 *  
## as.factor(year)98   0.0187132  0.0172832   1.083  0.27892    
## as.factor(year)99   0.0792005  0.0170875   4.635 3.57e-06 ***
## as.factor(year)100  0.0001118  0.0177545   0.006  0.99498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df Chi.sq  p-value    
## s(yearday2)     7.293  9.104 82.472  < 2e-16 ***
## s(o3medianLag)  1.611  2.040  1.361    0.512    
## s(tmpdLag)     17.351 21.368 61.876 7.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.202   Deviance explained = 23.3%
## UBRE = 0.24345  Scale est. = 1         n = 1066
par(mfrow=c(1,3))
plot(fitLagYday2,select=3)
plot(fitLagYday2.winter,select=3)
plot(fitLagYday2.summer,select=3)

#par(mfrow=c(1,3))
#plot(fitLagYday2,type="terms",select="s(tmpdLag)")
#plot(fitLagYday2.winter,type="terms",,select="s(tmpdLag)")
#plot(fitLagYday2.summer,type="terms",,select="s(tmpdLag)")
nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100))
pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)

nd.summer=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago.summer$tmpdLag,na.rm=TRUE),max(chicago.summer$tmpdLag,na.rm=TRUE),length=100))
pr2.summer=predict(fitLagYday2.summer,newdata=nd.summer,type="terms",term="s(tmpdLag)",se.fit=TRUE)

nd.winter=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday2=1,year=89,tmpdLag=seq(min(chicago.winter$tmpdLag,na.rm=TRUE),max(chicago.winter$tmpdLag,na.rm=TRUE),length=100))
pr2.winter=predict(fitLagYday2.winter,newdata=nd.winter,type="terms",term="s(tmpdLag)",se.fit=TRUE)

par(mfrow=c(1,1))
matr.pred=cbind(pr2$fit,pr2$fit-1.96*pr2$se.fit,pr2$fit+1.96*pr2$se.fit)
matr.pred.winter=cbind(pr2.winter$fit,pr2.winter$fit-1.96*pr2.winter$se.fit,pr2.winter$fit+1.96*pr2.winter$se.fit)
matr.pred.summer=cbind(pr2.summer$fit,pr2.summer$fit-1.96*pr2.summer$se.fit,pr2.summer$fit+1.96*pr2.summer$se.fit)
matplot(nd$tmpdLag,matr.pred,type="l",col=c("black"),lwd=c(2,1,1),lty=c(1,2,2),
        ylim=range(matr.pred,matr.pred.winter,matr.pred.summer))
matlines(nd.winter$tmpdLag,matr.pred.winter,type="l",col=c("blue"),lwd=c(2,1,1),lty=c(1,2,2))
matlines(nd.summer$tmpdLag,matr.pred.summer,type="l",col=c("red"),lwd=c(2,1,1),lty=c(1,2,2))

Bivariate spline

fitLagBiv=gam(death~s(time,bs="cr",k=200)+
           so2medianLag+pm10medianLag+
           s(o3medianLag,tmpdLag,k=40),
         data=chicago,family=poisson)
fitLagYdayBiv=gam(death~s(yearday,bs="cr",k=40)+
           so2medianLag+pm10medianLag+
           s(o3medianLag,tmpdLag,k=40),
         data=chicago,family=poisson)
vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07)

vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07)
points(fitLagBiv$model$o3medianLag,fitLagBiv$model$tmpdLag)

vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07)

vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07)
points(fitLagYdayBiv$model$o3medianLag,fitLagBiv$model$tmpdLag)

## Questions

  • which model is best? Perhaps via Cross Validation.

  • Additional covariates: calendar year (for the model with yday)