Load the required libraries
library(tidyverse)
library(broom)
library(gtsummary)
library(MatchIt)
library(geepack)
library(boot)
Let's simulate the data. First, set the parameters of the data-generating mechanism:
set.seed(0)
n.obs = 10000 #set sample size
#---- True parameters in outcome model
b0 = 60
b1 = 5
b2 = -0.3
b3 = -0.1
b4 = 8
b5 = 3
b6 = 2
#---- True parameters in exposure odds model
g0 = log(0.20/(1-0.20))
g1 = log(1.01)
g2 = log(1.005)
g3 = log(0.6)
g4 = log(0.5)
g5 = log(0.8)
# Function to compute outcome values
## Use the parameters specified above
mean_out <- function(C1, C2, C3, exposure){
  b0 + b1*exposure + b2*C1 + b3*I(C1^2) + b4*C2 + b5*C3 + b6*exposure*C2 +
    rnorm(n = n.obs, mean = 0, sd = 5)
}
# Function to compute exposure probabilities
## Use the parameters specified above
prob_exp <- function(C1, C2, C3){
  exp(g0 + g1*C1 + g2*I(C1^2) + g3*C2 + g4*C3 + g5*C2*C3) /
    (1 + exp(g0 + g1*C1 + g2*I(C1^2) + g3*C2 + g4*C3 + g5*C2*C3))
}
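Equivalently (a small aside, not in the original code), the expression above is the inverse-logit, so plogis() applied to the linear predictor gives the same probabilities:
# Same exposure probabilities via the built-in inverse-logit
prob_exp_alt <- function(C1, C2, C3){
  plogis(g0 + g1*C1 + g2*I(C1^2) + g3*C2 + g4*C3 + g5*C2*C3)
}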
Now simulate the dataset:
df.sim <- tibble("ID" = seq(from = 1, to = n.obs, by = 1),
                 "C1" = rnorm(n = n.obs, mean = 0, sd = 5),
                 "C2" = rbinom(n = n.obs, size = 1, prob = 0.4),
                 "C3" = rbinom(n = n.obs, size = 1, prob = 0.3),
                 "Pexposure" = prob_exp(C1, C2, C3),
                 "Exposure" = rbinom(n = n.obs, size = 1, prob = Pexposure),
                 "Outcome" = as.numeric(mean_out(C1, C2, C3, Exposure)))
head(df.sim)
## # A tibble: 6 × 7
## ID C1 C2 C3 Pexposure Exposure Outcome
## <dbl> <dbl> <int> <int> <I<dbl>> <int> <dbl>
## 1 1 6.31 0 0 0.245 0 55.7
## 2 2 -1.63 0 0 0.200 0 62.9
## 3 3 6.65 0 0 0.250 0 47.7
## 4 4 6.36 0 0 0.246 0 45.9
## 5 5 2.07 0 1 0.115 0 61.5
## 6 6 -7.70 0 0 0.237 1 65.9
In this simulated dataset, Exposure (denoted A below) is the binary exposure, C1 is a continuous covariate, and C2 and C3 are binary covariates.
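Since gtsummary is already loaded, one optional way (not part of the original code) to inspect the simulated covariates by exposure group is a quick descriptive table:
# Descriptive table of covariates by exposure group (purely for inspection)
df.sim %>%
  select(Exposure, C1, C2, C3) %>%
  tbl_summary(by = Exposure)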
The true outcome model is specified as follows:
$$Y = b_0 + b_1 A + b_2 C_1 + b_3 C_1^2 + b_4 C_2 + b_5 C_3 + b_6 A \, C_2 + \varepsilon$$
where $\varepsilon$ is a normally distributed error term with mean 0 and variance $5^2$.
The true causal effect of exposure A among those with C2 = 0 is b1 = 5, and the true causal effect among those with C2 = 1 is b1 + b6 = 7.
The marginal effect of A is therefore 5 × 0.6 + 7 × 0.4 = 5.8, because 40% of the population has C2 = 1 (and 60% has C2 = 0).
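As a quick arithmetic check (not part of the original code), the true marginal effect can be computed directly from the true parameters and the C2 prevalence used in the simulation:
# Conditional effects implied by the true parameters, weighted by P(C2 = 1) = 0.4
effect.c2.0 <- b1        # true effect when C2 = 0 (5)
effect.c2.1 <- b1 + b6   # true effect when C2 = 1 (7)
0.6*effect.c2.0 + 0.4*effect.c2.1  # true marginal effect: 5.8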
# Correctly specified model
df.sim %>%
lm(Outcome ~ Exposure*C2 + C1 + I(C1^2) + C3, data = .) %>%
tidy(conf.int = TRUE)
## # A tibble: 7 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 60.0 0.0875 686. 0 59.8 60.2
## 2 Exposure 5.02 0.166 30.2 4.61e-192 4.69 5.34
## 3 C2 8.10 0.112 72.4 0 7.88 8.32
## 4 C1 -0.305 0.0101 -30.1 2.49e-190 -0.324 -0.285
## 5 I(C1^2) -0.100 0.00148 -67.8 0 -0.103 -0.0972
## 6 C3 2.94 0.111 26.5 4.31e-150 2.72 3.16
## 7 Exposure:C2 1.93 0.294 6.56 5.77e- 11 1.35 2.51
The estimate for Exposure is 5.02. Note that this is an estimate of the conditional effect for C2 = 0, and it is close to the true value of 5 because the model is correctly specified. The conditional effect for C2 = 1 is estimated as 5.02 + 1.93 = 6.95, which again is close to the true value b1 + b6 = 7.
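As a quick follow-up (a sketch, not part of the original code), the conditional effect for C2 = 1 and a Wald confidence interval for it can be recovered from the fitted coefficients and their covariance matrix:
# Combine the Exposure and Exposure:C2 coefficients; the variance of the sum
# follows from the coefficient variances and their covariance.
fit <- lm(Outcome ~ Exposure*C2 + C1 + I(C1^2) + C3, data = df.sim)
b <- coef(fit)
V <- vcov(fit)
est <- unname(b["Exposure"] + b["Exposure:C2"])
se  <- sqrt(V["Exposure", "Exposure"] + V["Exposure:C2", "Exposure:C2"] +
              2 * V["Exposure", "Exposure:C2"])
c(estimate = est, conf.low = est - qnorm(0.975)*se, conf.high = est + qnorm(0.975)*se)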
One can now standardize the conditional effect estimates from the correctly specified multivariable regression model to get an estimate of a marginal effect:
# Make copies of original data
df.sim.a1 <- df.sim %>% mutate(Outcome = NA, Exposure = 1) #Assign Exposure = 1 to everyone
df.sim.a0 <- df.sim %>% mutate(Outcome = NA, Exposure = 0) #Assign Exposure = 0 to everyone
df.sim.combined <- bind_rows(df.sim.a1,df.sim.a0)
# Fit an outcome model to the original data
## Correctly specified model
gcomp.fit <- df.sim %>% lm(Outcome ~ Exposure*C2 + C1 + I(C1^2) + C3, data = .)
# Predict outcome values using the copied datasets
df.sim.combined$pred <- predict(gcomp.fit, newdata = df.sim.combined)
# ATE estimate: difference between the mean predicted value under Exposure = 1
# and the mean predicted value under Exposure = 0
df.sim.combined %>%
  group_by(Exposure) %>%
  summarise(mean.Y = mean(pred)) %>%
  pivot_wider(names_from = Exposure, names_glue = "mean.Y.{Exposure}", values_from = mean.Y) %>%
  mutate(ATE = mean.Y.1 - mean.Y.0)
## # A tibble: 1 × 3
## mean.Y.0 mean.Y.1 ATE
## <dbl> <dbl> <dbl>
## 1 61.6 67.4 5.78
The resulting estimate of the marginal effect is 5.78, which is close to the true marginal effect of 5.8; standardization based on a correctly specified outcome model yields a consistent estimator of this marginal effect.
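As a sanity check (a small sketch, not part of the original code), the same ATE can be obtained by predicting both potential outcomes for every row of the original data and averaging the individual-level differences:
# Predicted outcome under Exposure = 1 and Exposure = 0 for each individual;
# the mean of the differences equals the standardized ATE above.
pred1 <- predict(gcomp.fit, newdata = mutate(df.sim, Exposure = 1))
pred0 <- predict(gcomp.fit, newdata = mutate(df.sim, Exposure = 0))
mean(pred1 - pred0)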
Confidence intervals for the standardized estimate can be obtained via bootstrapping:
standardization.boot <- function(data, indices){
  # Resample rows according to the bootstrap indices
  df <- data[indices, ]
  # Copies of the resampled data with exposure set to 1 and to 0 for everyone
  df.a1 <- df %>% mutate(Outcome = NA, Exposure = 1)
  df.a0 <- df %>% mutate(Outcome = NA, Exposure = 0)
  df.combined <- bind_rows(df.a1, df.a0)
  # Fit the correctly specified outcome model to the resampled data
  gcomp.fit <- df %>%
    lm(Outcome ~ Exposure*C2 + C1 + I(C1^2) + C3, data = .)
  # Predict outcomes in the copied data and standardize
  df.combined$pred <- predict(gcomp.fit, newdata = df.combined)
  output <- df.combined %>%
    group_by(Exposure) %>%
    summarise(mean.Y = mean(pred)) %>%
    pivot_wider(names_from = Exposure,
                names_glue = "mean.Y.{Exposure}",
                values_from = mean.Y) %>%
    mutate(ATE = mean.Y.1 - mean.Y.0)
  return(output$ATE)
}
# Run the bootstrap with R = 100 resamples
standardization.results <- boot(data = df.sim, statistic = standardization.boot, R = 100)
# generating confidence intervals
empirical.se <- sd(standardization.results$t) # get empirical standard error estimate
estimate <- standardization.results$t0
ll <- estimate - qnorm(0.975)*empirical.se # normal approximation
ul <- estimate + qnorm(0.975)*empirical.se
data.frame(cbind(estimate, empirical.se, ll, ul))
## estimate empirical.se ll ul
## 1 5.782279 0.1425789 5.50283 6.061729
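As an alternative to the hand-rolled normal approximation above, the boot package's boot.ci() function can also be used (shown here as an optional check, not as part of the original workflow); percentile intervals in particular benefit from more replicates than the 100 used here, e.g. R = 1000.
# Normal-approximation and percentile intervals from the boot object
boot.ci(standardization.results, conf = 0.95, type = c("norm", "perc"))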