It is widely believed that high pollutant concentrations may lead to higher mortality.
In analyzing such a relationship, it must be kept in mind that mortality is also affected by other factors such as, for example, the temperature.
The following variables were observed daily from 1/1/1987 to 31/12/2000
death
total deaths (per day).
pm10medianmedian
particles in \(2.5-10\) per cubic m
pm25medianmedian
particles \(<2.5\) mg per cubic m (more
dangerous).
o3medianOzone
in parts per billion
so2median
Median Sulpher dioxide
measurement
time
time in days
tmpd
temperature in fahrenheit
It is convenient to define a date variable and add it to a data frame, also, we convert temperatures from fahrenheit to Celsius
library(gamair)
data(chicago)
head(chicago)
## death pm10median pm25median o3median so2median time tmpd
## 1 130 -7.4335443 NA -19.59234 1.9280426 -2556.5 31.5
## 2 150 NA NA -19.03861 -0.9855631 -2555.5 33.0
## 3 101 -0.8265306 NA -20.21734 -1.8914161 -2554.5 33.0
## 4 135 5.5664557 NA -19.67567 6.1393413 -2553.5 29.0
## 5 126 NA NA -19.21734 2.2784649 -2552.5 32.0
## 6 130 6.5664557 NA -17.63400 9.8585839 -2551.5 40.0
chicago$data=seq(as.Date("1987/1/1"), as.Date("2000/12/31"), "days")
chicago$tmpd=(chicago$tmpd-32)*5/9
We can then have a first look at the data
plot(chicago$data,chicago$death,pch=".")
par(mfrow=c(2,2))
plot(chicago$pm10median,chicago$death,pch=".")
plot(chicago$so2median,chicago$death,pch=".")
plot(chicago$o3median,chicago$death,pch=".")
plot(chicago$tmpd,chicago$death,pch=".")
A very first model
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
fit1=gam(death~s(time,bs="cr",k=200)+
pm10median+so2median+o3median+
tmpd,
data=chicago,family=poisson)
plot(fit1,pages=1)
summary(fit1)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10median + so2median +
## o3median + tmpd
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7304559 0.0036477 1296.825 < 2e-16 ***
## pm10median 0.0004321 0.0000889 4.861 1.17e-06 ***
## so2median 0.0008184 0.0005523 1.482 0.138
## o3median 0.0009306 0.0002032 4.581 4.63e-06 ***
## tmpd 0.0016772 0.0003211 5.223 1.76e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 168.6 188 2090 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.351 Deviance explained = 38.7%
## UBRE = 0.25467 Scale est. = 1 n = 4841
gam.check(fit1)
##
## Method: UBRE Optimizer: outer newton
## full convergence after 3 iterations.
## Gradient range [3.514596e-08,3.514596e-08]
## (score 0.2546689 & scale 1).
## Hessian positive definite, eigenvalue range [0.004247567,0.004247567].
## Model rank = 204 / 204
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(time) 199 169 0.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We notice there are some very high residuals (https://en.wikipedia.org/wiki/1995_Chicago_heat_wave)
fit1$model[sort.list(resid(fit1),decreasing=TRUE)[1:4],]
## death pm10median so2median o3median tmpd time
## 3118 411 14.798103 0.6976599 28.115091 30.00000 560.5
## 3119 287 -8.333333 -0.9330126 21.115009 28.33333 561.5
## 3120 228 -3.232732 -2.3158882 5.649732 25.83333 562.5
## 3117 226 20.941667 2.2685856 29.703545 33.05556 559.5
chicago[3111:3120,]
## death pm10median pm25median o3median so2median time tmpd data
## 3111 112 3.807231 NA 4.206649 2.5440348 553.5 21.38889 1995-07-08
## 3112 97 0.732566 NA 18.518625 -1.0277339 554.5 22.77778 1995-07-09
## 3113 122 16.211505 NA 2.523175 2.7234298 555.5 23.05556 1995-07-10
## 3114 119 19.716615 NA 18.141378 -0.6733466 556.5 26.11111 1995-07-11
## 3115 116 41.248516 NA 24.160425 -1.0226071 557.5 29.16667 1995-07-12
## 3116 121 56.363682 NA 36.745545 1.2718564 558.5 33.33333 1995-07-13
## 3117 226 20.941667 NA 29.703545 2.2685856 559.5 33.05556 1995-07-14
## 3118 411 14.798103 NA 28.115091 0.6976599 560.5 30.00000 1995-07-15
## 3119 287 -8.333333 NA 21.115009 -0.9330126 561.5 28.33333 1995-07-16
## 3120 228 -3.232732 NA 5.649732 -2.3158882 562.5 25.83333 1995-07-17
par(mfrow=c(1,2))
plot(fit1$model$tmpd,resid(fit1))
plot(fit1$model$o3median,resid(fit1))
par(mfrow=c(1,2))
rsd <- residuals(fit1,type="deviance")
plot(fit1$model$tmpd,rsd)
fit.res=gam(rsd~s(tmpd,k=20)-1,data=fit1$model,select=TRUE)
plot(fit.res,residuals=FALSE)
fit2=gam(death~s(time,bs="cr",k=200)+
s(pm10median)+s(so2median)+s(o3median)+
s(tmpd),
data=chicago,family=poisson)
plot(fit2,pages=1)
summary(fit2)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time, bs = "cr", k = 200) + s(pm10median) + s(so2median) +
## s(o3median) + s(tmpd)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.744649 0.001342 3534 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 166.555 186.622 1828.269 <2e-16 ***
## s(pm10median) 1.001 1.002 5.658 0.0174 *
## s(so2median) 1.000 1.001 1.789 0.1811
## s(o3median) 8.023 8.752 17.177 0.0236 *
## s(tmpd) 7.992 8.685 100.199 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.361 Deviance explained = 39.7%
## UBRE = 0.23928 Scale est. = 1 n = 4841
There is some evidence of a non linear effect of
o3median
and a relatively clear evidence of a non linear
effect of temperature tmpd
, thus a reasonable model may
be
fit3=gam(death~s(time,bs="cr",k=200)+
pm10median+so2median+s(o3median,k=20)+
s(tmpd,k=40),
data=chicago,family=poisson)
plot(fit3,pages=1)
summary(fit3)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10median + so2median +
## s(o3median, k = 20) + s(tmpd, k = 40)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.745e+00 1.386e-03 3423.588 <2e-16 ***
## pm10median 2.284e-04 9.526e-05 2.398 0.0165 *
## so2median 7.575e-04 5.605e-04 1.352 0.1765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 166.48 186.55 1818.02 <2e-16 ***
## s(o3median) 10.18 12.51 22.28 0.0434 *
## s(tmpd) 12.91 16.13 108.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.363 Deviance explained = 39.9%
## UBRE = 0.23846 Scale est. = 1 n = 4841
gam.check(fit3)
##
## Method: UBRE Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [1.838621e-10,1.942568e-09]
## (score 0.238464 & scale 1).
## Hessian positive definite, eigenvalue range [0.0003795557,0.004281439].
## Model rank = 260 / 260
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(time) 199.0 166.5 0.94 <2e-16 ***
## s(o3median) 19.0 10.2 1.00 0.41
## s(tmpd) 39.0 12.9 1.03 0.99
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The effect of pollution and temperature on health may be lagged, so it may be relevant to use lagged concentrations/temperature as covariates, we then compute 3-days means \[ x_{i} = \frac{1}{3}(x_{i-2}+x_{i-1}+x_{i}),\;\;i=3,\ldots,n \]
meanlag3=function(x){
n=length(x)
c(NA,NA,NA,(x[1:(n-3)]+x[2:(n-2)]+x[3:(n-1)]+x[4:n])/4)
}
chicago$pm10medianLag=meanlag3(chicago$pm10median)
chicago$pm25medianLag=meanlag3(chicago$pm25median)
chicago$o3medianLag=meanlag3(chicago$o3median)
chicago$so2medianLag=meanlag3(chicago$so2median)
chicago$tmpdLag=meanlag3(chicago$tmpd)
par(mfrow=c(2,2))
plot(chicago$pm10medianLag,chicago$death,pch=".")
plot(chicago$so2medianLag,chicago$death,pch=".")
plot(chicago$o3medianLag,chicago$death,pch=".")
plot(chicago$tmpdLag,chicago$death,pch=".")
We then consider the alternative model
fitLag=gam(death~s(time,bs="cr",k=200)+
pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
s(tmpdLag,k=40),
data=chicago,family=poisson)
plot(fitLag,pages=1)
summary(fitLag)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7435162 0.0015644 3032.088 < 2e-16 ***
## pm10medianLag 0.0006183 0.0001761 3.511 0.000446 ***
## so2medianLag 0.0021204 0.0010432 2.033 0.042086 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 147.178 170.479 1146.90 < 2e-16 ***
## s(o3medianLag) 1.003 1.005 11.38 0.000762 ***
## s(tmpdLag) 28.746 33.451 685.21 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.451 Deviance explained = 46.1%
## UBRE = 0.15416 Scale est. = 1 n = 4325
gam.check(fitLag)
##
## Method: UBRE Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-4.146838e-07,-1.051935e-07]
## (score 0.154161 & scale 1).
## Hessian positive definite, eigenvalue range [4.144902e-07,0.004488623].
## Model rank = 260 / 260
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(time) 199.0 147.2 1.02 0.86
## s(o3medianLag) 19.0 1.0 1.01 0.78
## s(tmpdLag) 39.0 28.8 1.02 0.85
chicago$yearday=as.POSIXlt(chicago$data)$yday
chicago$weekday=as.POSIXlt(chicago$data)$wday
chicago$year=as.POSIXlt(chicago$data)$year
par(mfrow=c(1,3))
plot(chicago$yearday,chicago$death,pch=".")
plot(as.factor(chicago$weekday),chicago$death,pch=".")
plot(as.factor(chicago$year),chicago$death,pch=".")
fitYday=gam(death~s(yearday,bs="cr",k=40)+
so2median+pm10median+
s(o3median,k=20)+s(tmpd,k=40),
data=chicago,family=poisson)
plot(fitYday,pages=1)
summary(fitYday)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + so2median + pm10median +
## s(o3median, k = 20) + s(tmpd, k = 40)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.746e+00 1.382e-03 3434.743 < 2e-16 ***
## so2median -3.389e-04 5.341e-04 -0.634 0.525835
## pm10median 3.278e-04 9.133e-05 3.590 0.000331 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday) 25.15 30.03 553.3 < 2e-16 ***
## s(o3median) 14.84 17.10 36.3 0.00368 **
## s(tmpd) 17.46 21.53 174.4 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.242 Deviance explained = 26.2%
## UBRE = 0.44885 Scale est. = 1 n = 4841
fitYday2=gam(death~s(yearday,bs="cr",k=40)+
so2median+pm10median+
s(o3median,k=20)+s(tmpd,k=40)+as.factor(year),
data=chicago,family=poisson)
plot(fitYday2,pages=1)
summary(fitYday2)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + so2median + pm10median +
## s(o3median, k = 20) + s(tmpd, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.754e+00 5.368e-03 885.574 < 2e-16 ***
## so2median -3.917e-04 5.352e-04 -0.732 0.464225
## pm10median 3.502e-04 9.176e-05 3.817 0.000135 ***
## as.factor(year)88 1.782e-02 7.520e-03 2.370 0.017784 *
## as.factor(year)89 1.698e-02 7.692e-03 2.208 0.027235 *
## as.factor(year)90 -5.274e-03 7.356e-03 -0.717 0.473441
## as.factor(year)91 4.609e-03 7.352e-03 0.627 0.530741
## as.factor(year)92 -1.665e-02 7.413e-03 -2.246 0.024733 *
## as.factor(year)93 2.647e-02 7.315e-03 3.619 0.000296 ***
## as.factor(year)94 1.247e-02 7.328e-03 1.702 0.088682 .
## as.factor(year)95 2.753e-02 7.201e-03 3.823 0.000132 ***
## as.factor(year)96 -1.617e-02 7.348e-03 -2.201 0.027744 *
## as.factor(year)97 -4.844e-02 7.374e-03 -6.569 5.08e-11 ***
## as.factor(year)98 -5.636e-02 7.333e-03 -7.685 1.53e-14 ***
## as.factor(year)99 -1.706e-02 7.287e-03 -2.341 0.019208 *
## as.factor(year)100 -5.130e-02 7.391e-03 -6.942 3.88e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday) 24.98 29.86 590.78 <2e-16 ***
## s(o3median) 11.70 14.17 28.69 0.0126 *
## s(tmpd) 16.35 20.23 130.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.281 Deviance explained = 30.5%
## UBRE = 0.36933 Scale est. = 1 n = 4841
fitLagYday=gam(death~s(yearday,bs="cr",k=40)+
pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
s(tmpdLag,k=40),
data=chicago,family=poisson)
plot(fitLagYday,pages=1)
summary(fitLagYday)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7421413 0.0015359 3087.495 < 2e-16 ***
## pm10medianLag 0.0008678 0.0001604 5.409 6.35e-08 ***
## so2medianLag -0.0014050 0.0009221 -1.524 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday) 24.263 29.117 217.1 < 2e-16 ***
## s(o3medianLag) 2.867 3.728 17.3 0.00186 **
## s(tmpdLag) 30.090 34.606 860.9 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.352 Deviance explained = 34.8%
## UBRE = 0.32155 Scale est. = 1 n = 4325
fitLagYday2=gam(death~s(yearday,bs="cr",k=40)+
pm10medianLag+so2medianLag+s(o3medianLag,k=20)+
s(tmpdLag,k=40)+as.factor(year),
data=chicago,family=poisson)
plot(fitLagYday2,pages=1)
summary(fitLagYday2)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7606143 0.0062134 766.179 < 2e-16 ***
## pm10medianLag 0.0009592 0.0001613 5.948 2.72e-09 ***
## so2medianLag -0.0012851 0.0009275 -1.386 0.165876
## as.factor(year)88 -0.0005706 0.0095104 -0.060 0.952157
## as.factor(year)89 0.0059884 0.0096690 0.619 0.535693
## as.factor(year)90 -0.0108492 0.0081832 -1.326 0.184910
## as.factor(year)91 0.0018837 0.0083722 0.225 0.821986
## as.factor(year)92 -0.0299982 0.0081961 -3.660 0.000252 ***
## as.factor(year)93 0.0144891 0.0080285 1.805 0.071121 .
## as.factor(year)94 0.0028798 0.0080824 0.356 0.721613
## as.factor(year)95 0.0126773 0.0078474 1.615 0.106209
## as.factor(year)96 -0.0297703 0.0079878 -3.727 0.000194 ***
## as.factor(year)97 -0.0603151 0.0080442 -7.498 6.48e-14 ***
## as.factor(year)98 -0.0535246 0.0080103 -6.682 2.36e-11 ***
## as.factor(year)99 -0.0215681 0.0079537 -2.712 0.006694 **
## as.factor(year)100 -0.0570526 0.0080773 -7.063 1.63e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday) 23.30 28.093 236.30 < 2e-16 ***
## s(o3medianLag) 1.59 2.025 15.95 0.000342 ***
## s(tmpdLag) 30.01 34.542 832.42 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.385 Deviance explained = 38.5%
## UBRE = 0.25308 Scale est. = 1 n = 4325
Which one is most appropriate? Let’s have at look at temperatures.
par(mfrow=c(1,2))
plot(predict(fitLag),predict(fitLagYday2))
abline(0,1)
plot(residuals(fitLag),residuals(fitLagYday2))
abline(0,1)
par(mfrow=c(1,2))
plot(fitLag,select=3)
plot(fitLagYday2,select=3)
nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100))
pr1=predict(fitLag,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)
pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)
par(mfrow=c(1,1))
matr.pred=cbind(pr1$fit,pr2$fit,
pr1$fit-1.96*pr1$se.fit,pr2$fit-1.96*pr2$se.fit,
pr1$fit+1.96*pr1$se.fit,pr2$fit+1.96*pr2$se.fit)
matplot(nd$tmpdLag,matr.pred,type="l",col=c("black","red"),lwd=c(2,2,1,1,1,1),lty=c(1,1,2,2,2,2))
sum(chicago$death)
## [1] 590252
sum(predict(fitLag,newdata=chicago,type="response"),na.rm=TRUE)
## [1] 497805
chicago2=chicago
chicago2$tmpdLag=chicago$tmpdLag+1
sum(predict(fitLag,newdata=chicago2,type="response"),na.rm=TRUE)
## [1] 498691.4
sum(predict(fitLagYday2,newdata=chicago2,type="response"),na.rm=TRUE)
## [1] 498172
Summer versus winter behaviour (especially related to temperature, for pollutants there are likely not enough data)
chicago.summer=subset(chicago,(yearday>150) & (yearday<244))
fitLag.summer=gam(death~s(time,bs="cr",k=200)+
pm10medianLag + so2medianLag + s(o3medianLag, k = 20) +
s(tmpdLag,k=40)+as.factor(year),
data=chicago.summer,family=poisson)
plot(fitLag.summer,pages=1)
summary(fitLag.summer)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time, bs = "cr", k = 200) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.3665502 11.8939072 0.619 0.536
## pm10medianLag 0.0004324 0.0004505 0.960 0.337
## so2medianLag 0.0001718 0.0034530 0.050 0.960
## as.factor(year)88 -0.9909888 5.2978044 -0.187 0.852
## as.factor(year)89 -3.0809707 7.9609623 -0.387 0.699
## as.factor(year)90 -3.5231714 9.4742458 -0.372 0.710
## as.factor(year)91 -3.1206241 10.8517321 -0.288 0.774
## as.factor(year)92 -1.9791295 12.0408914 -0.164 0.869
## as.factor(year)93 -3.8073017 13.1451271 -0.290 0.772
## as.factor(year)94 -3.3476138 14.1352308 -0.237 0.813
## as.factor(year)95 -1.6493649 15.0650236 -0.109 0.913
## as.factor(year)96 -2.5921615 15.9479418 -0.163 0.871
## as.factor(year)97 -3.1252909 16.7825334 -0.186 0.852
## as.factor(year)98 -3.1622694 17.5739907 -0.180 0.857
## as.factor(year)99 -3.0409563 18.3274894 -0.166 0.868
## as.factor(year)100 -3.6995166 19.0517829 -0.194 0.846
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 78.170 95.586 136.142 0.00434 **
## s(o3medianLag) 1.001 1.001 5.552 0.01853 *
## s(tmpdLag) 10.824 13.493 383.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.514 Deviance explained = 50.7%
## UBRE = 0.26025 Scale est. = 1 n = 1080
chicago.summer=subset(chicago,(yearday>150) & (yearday<244))
chicago.summer$time2=1:nrow(chicago.summer) ##provided it is ordered!
fitLag.summer=gam(death~s(time2,bs="cr",k=200)+
pm10medianLag + so2medianLag + s(o3medianLag, k = 20) +
s(tmpdLag,k=40)+as.factor(year),
data=chicago.summer,family=poisson)
plot(fitLag.summer,pages=1)
summary(fitLag.summer)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(time2, bs = "cr", k = 200) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.906e+00 3.487e-01 14.067 <2e-16 ***
## pm10medianLag 4.459e-04 4.488e-04 0.994 0.320
## so2medianLag -1.795e-05 3.444e-03 -0.005 0.996
## as.factor(year)88 -9.382e-02 1.086e-01 -0.864 0.388
## as.factor(year)89 -4.596e-01 3.428e-01 -1.341 0.180
## as.factor(year)90 -4.080e-01 3.519e-01 -1.159 0.246
## as.factor(year)91 -3.954e-01 3.634e-01 -1.088 0.277
## as.factor(year)92 -2.903e-01 3.727e-01 -0.779 0.436
## as.factor(year)93 -2.997e-01 3.834e-01 -0.782 0.434
## as.factor(year)94 -2.901e-01 3.935e-01 -0.737 0.461
## as.factor(year)95 -1.586e-01 4.019e-01 -0.395 0.693
## as.factor(year)96 -2.284e-01 4.152e-01 -0.550 0.582
## as.factor(year)97 -1.894e-01 4.261e-01 -0.445 0.657
## as.factor(year)98 -1.894e-01 4.376e-01 -0.433 0.665
## as.factor(year)99 -1.887e-01 4.449e-01 -0.424 0.672
## as.factor(year)100 -1.973e-01 4.548e-01 -0.434 0.664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time2) 77.24 95.536 139.702 0.00209 **
## s(o3medianLag) 1.00 1.001 5.367 0.02056 *
## s(tmpdLag) 10.83 13.500 382.608 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.515 Deviance explained = 50.7%
## UBRE = 0.25824 Scale est. = 1 n = 1080
fitLagYday2.summer=gam(death~s(yearday,bs="cr",k=40)+
pm10medianLag + so2medianLag + s(o3medianLag, k = 20) +
s(tmpdLag,k=40)+as.factor(year),
data=chicago.summer,family=poisson)
plot(fitLagYday2.summer,pages=1)
summary(fitLagYday2.summer)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday, bs = "cr", k = 40) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7090660 0.0115313 408.372 < 2e-16 ***
## pm10medianLag 0.0005376 0.0003857 1.394 0.16333
## so2medianLag 0.0013250 0.0026751 0.495 0.62040
## as.factor(year)88 -0.0620248 0.0201959 -3.071 0.00213 **
## as.factor(year)89 -0.0167532 0.0195758 -0.856 0.39210
## as.factor(year)90 -0.0156428 0.0153908 -1.016 0.30945
## as.factor(year)91 -0.0332092 0.0160556 -2.068 0.03860 *
## as.factor(year)92 -0.0381858 0.0160413 -2.380 0.01729 *
## as.factor(year)93 0.0167911 0.0155992 1.076 0.28175
## as.factor(year)94 -0.0056347 0.0151205 -0.373 0.70941
## as.factor(year)95 0.0124583 0.0145577 0.856 0.39212
## as.factor(year)96 -0.0600682 0.0152927 -3.928 8.57e-05 ***
## as.factor(year)97 -0.0738696 0.0152273 -4.851 1.23e-06 ***
## as.factor(year)98 -0.0686377 0.0148950 -4.608 4.06e-06 ***
## as.factor(year)99 -0.0741995 0.0148715 -4.989 6.06e-07 ***
## as.factor(year)100 -0.0752074 0.0154491 -4.868 1.13e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday) 17.566 21.606 33.360 0.05311 .
## s(o3medianLag) 1.001 1.002 9.913 0.00165 **
## s(tmpdLag) 12.322 15.318 633.628 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.475 Deviance explained = 45.1%
## UBRE = 0.27295 Scale est. = 1 n = 1080
chicago.winter=subset(chicago,(yearday>334) | (yearday<60))
chicago.winter$yearday2=chicago.winter$yearday
chicago.winter$yearday2[chicago.winter$yearday2>334]=chicago.winter$yearday2[chicago.winter$yearday2>334]-366
fitLagYday2.winter=gam(death~s(yearday2,bs="cr",k=40)+
pm10medianLag + so2medianLag + s(o3medianLag, k = 20) +
s(tmpdLag,k=40)+as.factor(year),
data=chicago.winter,family=poisson)
plot(fitLagYday2.winter,pages=1)
summary(fitLagYday2.winter)
##
## Family: poisson
## Link function: log
##
## Formula:
## death ~ s(yearday2, bs = "cr", k = 40) + pm10medianLag + so2medianLag +
## s(o3medianLag, k = 20) + s(tmpdLag, k = 40) + as.factor(year)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.8067503 0.0143517 334.926 < 2e-16 ***
## pm10medianLag 0.0000729 0.0004443 0.164 0.86967
## so2medianLag -0.0009371 0.0015493 -0.605 0.54526
## as.factor(year)88 0.0067639 0.0188049 0.360 0.71908
## as.factor(year)89 0.0644631 0.0207877 3.101 0.00193 **
## as.factor(year)90 0.0168657 0.0179773 0.938 0.34816
## as.factor(year)91 0.0539994 0.0176569 3.058 0.00223 **
## as.factor(year)92 0.0065195 0.0173288 0.376 0.70675
## as.factor(year)93 -0.0015626 0.0179901 -0.087 0.93078
## as.factor(year)94 0.0541696 0.0180292 3.005 0.00266 **
## as.factor(year)95 0.0810445 0.0171135 4.736 2.18e-06 ***
## as.factor(year)96 0.0101169 0.0172220 0.587 0.55691
## as.factor(year)97 -0.0366707 0.0175437 -2.090 0.03660 *
## as.factor(year)98 0.0187132 0.0172832 1.083 0.27892
## as.factor(year)99 0.0792005 0.0170875 4.635 3.57e-06 ***
## as.factor(year)100 0.0001118 0.0177545 0.006 0.99498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(yearday2) 7.293 9.104 82.472 < 2e-16 ***
## s(o3medianLag) 1.611 2.040 1.361 0.512
## s(tmpdLag) 17.351 21.368 61.876 7.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.202 Deviance explained = 23.3%
## UBRE = 0.24345 Scale est. = 1 n = 1066
par(mfrow=c(1,3))
plot(fitLagYday2,select=3)
plot(fitLagYday2.winter,select=3)
plot(fitLagYday2.summer,select=3)
#par(mfrow=c(1,3))
#plot(fitLagYday2,type="terms",select="s(tmpdLag)")
#plot(fitLagYday2.winter,type="terms",,select="s(tmpdLag)")
#plot(fitLagYday2.summer,type="terms",,select="s(tmpdLag)")
nd=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago$tmpdLag,na.rm=TRUE),max(chicago$tmpdLag,na.rm=TRUE),length=100))
pr2=predict(fitLagYday2,newdata=nd,type="terms",term="s(tmpdLag)",se.fit=TRUE)
nd.summer=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday=1,year=89,tmpdLag=seq(min(chicago.summer$tmpdLag,na.rm=TRUE),max(chicago.summer$tmpdLag,na.rm=TRUE),length=100))
pr2.summer=predict(fitLagYday2.summer,newdata=nd.summer,type="terms",term="s(tmpdLag)",se.fit=TRUE)
nd.winter=data.frame(time=1,pm10medianLag=1,so2medianLag=1,o3medianLag=1,yearday2=1,year=89,tmpdLag=seq(min(chicago.winter$tmpdLag,na.rm=TRUE),max(chicago.winter$tmpdLag,na.rm=TRUE),length=100))
pr2.winter=predict(fitLagYday2.winter,newdata=nd.winter,type="terms",term="s(tmpdLag)",se.fit=TRUE)
par(mfrow=c(1,1))
matr.pred=cbind(pr2$fit,pr2$fit-1.96*pr2$se.fit,pr2$fit+1.96*pr2$se.fit)
matr.pred.winter=cbind(pr2.winter$fit,pr2.winter$fit-1.96*pr2.winter$se.fit,pr2.winter$fit+1.96*pr2.winter$se.fit)
matr.pred.summer=cbind(pr2.summer$fit,pr2.summer$fit-1.96*pr2.summer$se.fit,pr2.summer$fit+1.96*pr2.summer$se.fit)
matplot(nd$tmpdLag,matr.pred,type="l",col=c("black"),lwd=c(2,1,1),lty=c(1,2,2),
ylim=range(matr.pred,matr.pred.winter,matr.pred.summer))
matlines(nd.winter$tmpdLag,matr.pred.winter,type="l",col=c("blue"),lwd=c(2,1,1),lty=c(1,2,2))
matlines(nd.summer$tmpdLag,matr.pred.summer,type="l",col=c("red"),lwd=c(2,1,1),lty=c(1,2,2))
fitLagBiv=gam(death~s(time,bs="cr",k=200)+
so2medianLag+pm10medianLag+
s(o3medianLag,tmpdLag,k=40),
data=chicago,family=poisson)
fitLagYdayBiv=gam(death~s(yearday,bs="cr",k=40)+
so2medianLag+pm10medianLag+
s(o3medianLag,tmpdLag,k=40),
data=chicago,family=poisson)
vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07)
vis.gam(fitLagBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07)
points(fitLagBiv$model$o3medianLag,fitLagBiv$model$tmpdLag)
vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),theta=45,too.far=0.07)
vis.gam(fitLagYdayBiv,c("o3medianLag","tmpdLag"),plot.type="contour",too.far=0.07)
points(fitLagYdayBiv$model$o3medianLag,fitLagBiv$model$tmpdLag)
## Questions
which model is best? Perhaps via Cross Validation.
Additional covariates: calendar year (for the model with
yday
)