Lab 4
Modello gerarchico binomiale
library(dplyr) # optional
library(kableExtra)
library(ggplot2)
theme_set(theme_minimal())
library(gridExtra)
library(tidyr)
library(latex2exp)Esempio
Applicazione di un modello gerarchico coniugato allo studio dei tumori nei topi
Riferimento
Gelman: p. 108, Sec 5.3, Bayesian analysis of conjugate hierarchical models
Linguaggio
R, B-B.r
Soggetto
- Medicine testing
- Type F344 female rats in control group given placebo
- count how many get endometrial stromal polyps
- Experiment has been repeated 71 times
- \(y/n\) proporzione osservata di cavie con tumore - esperimento corrente (\(71^o\)): \(y/n=4/14\)
- \(\theta\) probabilià di tumore nei topi (femmina) di laboratorio che ricevono dose \(0\) del farmaco
Dati
y <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 2, 2, 2, 1, 5, 2, 5, 3, 2, 7, 7, 3, 3, 2, 9, 10, 4, 4, 4, 4, 4, 4, 4, 10, 4, 4,
4, 5, 11, 12, 5, 5, 6, 5, 6, 6, 6, 6, 16, 15, 15, 9, 4)
n <- c(20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 17, 20, 20, 20, 20, 19, 19,
18, 18, 25, 24, 23, 20, 20, 20, 20, 20, 20, 10, 49, 19, 46, 27, 17, 49, 47, 20, 20,
13, 48, 50, 20, 20, 20, 20, 20, 20, 20, 48, 19, 19, 19, 22, 46, 49, 20, 20, 23, 19,
22, 20, 20, 20, 52, 46, 47, 24, 14)Valori di \(\theta\) su cui valutare le densità
x <- seq(1e-04, 0.9999, length.out = 1000)Helper function per valutare la densità per ogni
valore
bdens <- function(n, y, x) dbeta(x, y + 1, n - y + 1)Di che densità si tratta?
Quale a priori?
Analisi separate
df_sep <- mapply(bdens, n, y, MoreArgs = list(x = x)) %>%
as.data.frame() %>%
cbind(x)
df_sep %>%
dim
[1] 1000 72
df_sep %>%
names()
[1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11" "V12" "V13"
[14] "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24" "V25" "V26"
[27] "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39"
[40] "V40" "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49" "V50" "V51" "V52"
[53] "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60" "V61" "V62" "V63" "V64" "V65"
[66] "V66" "V67" "V68" "V69" "V70" "V71" "x"
df_sep <- df_sep %>%
gather(ind, p, -x)
df_sep %>%
dim
[1] 71000 3
df_sep %>%
names()
[1] "x" "ind" "p" Grafico delle a posteriori per \(t_j\)
labs1 <- paste("posterior of", c("theta_j", "theta_71"))
plot_sep <- ggplot(data = df_sep) + geom_line(aes(x = x, y = p, color = (ind == "V71"),
group = ind)) + labs(x = expression(theta), y = "", title = "Separate model", color = "") +
scale_y_continuous(breaks = NULL) + scale_color_manual(values = c("blue", "magenta"),
labels = labs1) + theme(legend.background = element_blank(), legend.position = c(0.8,
0.9))
plot_sepModello pooled
df_pool <- data.frame(x = x, p = dbeta(x, sum(y) + 1, sum(n) - sum(y) + 1))Grafico per il modello che mette insieme tutti i dati
plot_pool <- ggplot(data = df_pool) + geom_line(aes(x = x, y = p, color = "1")) + labs(x = expression(theta),
y = "", title = "Pooled model", color = "") + scale_y_continuous(breaks = NULL) +
scale_color_manual(values = "brown", labels = "Posterior of common theta") + theme(legend.background = element_blank(),
legend.position = c(0.7, 0.9))Mettiamo a confronto l’analisi separata con quella pooled
grid.arrange(plot_sep, plot_pool)Modello gerarchico
Valutazione della densità a posteriori degli iperparametri
Calcolo della marginale a posteriori di alpha and beta nel modello gerarchico.
Uso della griglia
A <- seq(0.5, 6, length.out = 100)
B <- seq(3, 33, length.out = 100)Vettori che contengono tutte le combinazioni a due a due di A e B
cA <- rep(A, each = length(B)) # change 'slowly'
cB <- rep(B, length(A)) # change 'quickly'Logaritmo per accuratezza numerica
lpfun <- function(a, b, y, n) log(a + b) * (-5/2) + sum(lgamma(a + b) - lgamma(a) - lgamma(b) +
lgamma(a + y) + lgamma(b + n - y) - lgamma(a + b + n))
lp <- mapply(lpfun, cA, cB, MoreArgs = list(y, n))Sottrazione del massimo per evitare over/underflow nell’esponenziazione
df_marg <- data.frame(x = cA, y = cB, p = exp(lp - max(lp)))Grafico della densità marginale a posteriori
title1 <- TeX("The marginal of $\\alpha$ and $\\beta$")
ggplot(data = df_marg, aes(x = x, y = y)) + geom_raster(aes(fill = p, alpha = p), interpolate = T) +
geom_contour(aes(z = p), colour = "black", size = 0.2) + coord_cartesian(xlim = c(1,
5), ylim = c(4, 26)) + labs(x = TeX("$\\alpha$"), y = TeX("$\\beta$"), title = title1) +
scale_fill_gradient(low = "yellow", high = "red", guide = "none") + scale_alpha(range = c(0,
1), guide = "none")Campionamento degli iperparametri
Campionamento dalla griglia (con rimpiazzo)
nsamp <- 100
# sample an integer from 1 to marg prob length (10,000), 100 times
samp_indices <- sample(length(df_marg$p), size = nsamp, replace = T, prob = df_marg$p/sum(df_marg$p))
samp_indices
[1] 8081 3539 4239 2625 2325 4654 5155 1617 4953 3641 7074 3843 1414 5756 2737 4857
[17] 3234 2423 2834 3936 2428 1420 3031 3433 8372 3640 3029 2834 1822 3941 2833 2629
[33] 2329 4039 3432 4751 1115 3536 4656 3742 1419 4957 1315 1922 2024 6060 4343 4242
[49] 2028 2523 2731 3741 3143 4456 4649 4952 1008 2325 2836 1314 3839 2835 3131 3742
[65] 3134 1006 2226 2229 4854 2836 1914 3739 3540 3135 611 2426 3939 1824 4249 1416
[81] 2122 2526 2934 2922 3846 4051 3033 911 3341 3134 2427 1822 2123 3343 2428 5559
[97] 3742 5148 4550 2833
samp_A <- cA[samp_indices]
samp_B <- cB[samp_indices]La superpopolazione dei theta
Grafico di (\(20\)) estrazioni dalla distribuzione delle distribuzioni Beta(alpha,beta), ie, grafico di Beta(alpha,beta) usando i campioni a posteriori di alpha e beta
df_psamp <- mapply(function(a, b, x) dbeta(x, a, b), samp_A, samp_B, MoreArgs = list(x = x)) %>%
as.data.frame() %>%
cbind(x) %>%
gather(ind, p, -x)
df_psamp$ind %>%
head
[1] "V1" "V1" "V1" "V1" "V1" "V1"
# helper function to convert ind to numeric for subsetting
indtonum <- function(x) strtoi(substring(x, 2))
title2 <- TeX("Beta($\\alpha,\\beta$) given posterior draws of $\\alpha$ and $\\beta$")
plot_psamp <- ggplot(data = subset(df_psamp, indtonum(ind) <= 20)) + geom_line(aes(x = x,
y = p, group = ind), color = "forestgreen") + labs(x = expression(theta), y = "",
title = title2) + scale_y_continuous(breaks = NULL)La media delle distribuzioni sopra è la distribuzione predittiva per un nuovo theta, e anche la distribuzione della popolazione di cui i theta_j sono estrazioni
Note the use of spread (tidyr pkg)
df_psampmean <- spread(df_psamp, ind, p) %>%
subset(select = -x) %>%
rowMeans() %>%
data.frame(x = x, p = .)
title3 <- TeX("Population distribution for each $\\theta_j$ and new $\\theta$s")
plot_psampmean <- ggplot(data = df_psampmean) + geom_line(aes(x = x, y = p), color = "forestgreen") +
labs(x = expression(theta), y = "", title = title3) + scale_y_continuous(breaks = NULL)Combiniamo i grafici
grid.arrange(plot_psamp, plot_psampmean)Distribuzioni a posteriori dei theta_j
Infine, confrontiamo i modelli separati con il modello gerarchico (usando un campione ogni 7 per pulizia d’immagine)
Modelli separati
plot_sep7 <- ggplot(data = subset(df_sep, indtonum(ind)%%7 == 0)) + geom_line(aes(x = x,
y = p, color = (ind == "V49"), group = ind)) + labs(x = expression(theta), y = "",
title = "Separate model", color = "") + scale_y_continuous(breaks = NULL) + scale_color_manual(values = c("blue",
"red"), guide = "none") + theme(legend.background = element_blank(), legend.position = c(0.8,
0.9))Modello gerarchico
Si noti che le distr marginali a posteriori per theta_j sono più ristrette che nel caso del modello separato, grazie all’informazione presa a prestito dagli altri theta_j’s
Densità media sui campioni (di a e b) per ogni coppia (n,y) su ogni valore x (di theta)
bdens2 <- function(n, y, a, b, x) rowMeans(mapply(dbeta, a + y, n - y + b, MoreArgs = list(x = x)))
df_hier <- mapply(bdens2, n, y, MoreArgs = list(samp_A, samp_B, x)) %>%
as.data.frame() %>%
cbind(x) %>%
gather(ind, p, -x)
plot_hier7 <- ggplot(data = subset(df_hier, indtonum(ind)%%7 == 0)) + geom_line(aes(x = x,
y = p, color = (ind == "V49"), group = ind)) + labs(x = expression(theta), y = "",
title = "Hierarchical model", color = "") + scale_color_manual(values = c("blue",
"red"), guide = "none") + scale_y_continuous(breaks = NULL) + theme(legend.background = element_blank(),
legend.position = c(0.8, 0.9))I due grafici insieme
grid.arrange(plot_sep7, plot_hier7)Intervalli per theta_j
- Modello pooled: Come vi aspettate la stima e il CI per la media comune?
- Modelli separati: stime dei \(\theta_j\) prossime alla stima di ML (\(y/n\)): perchè? Come vi aspettate la lunghezza degli CI rispetto al modello gerarchico?
- Modello EB: stime dei \(\theta_j\) vicine a quelle del modello gerarchico: perchè? Come vi aspettate la lunghezza degli CI rispetto al modello gerarchico?
Intervallo 90% per il modello pooled
m_pool <- (sum(y) + 1)/(sum(n) + 2)
m_pool
[1] 0.1539345
qq_pool <- qbeta(c(0.05, 0.95), sum(y) + 1, sum(n) - sum(y) + 1)
diff(qq_pool)
[1] 0.02843885Intervalli 90% per i modelli separati
qq_separate <- data.frame(id = 1:length(n), n = n, y = y, q5 = qbeta(0.05, y + 1, n -
y + 1), q95 = qbeta(0.95, y + 1, n - y + 1))per il modello gerarchico
qh <- function(q, n, y) colMeans(mapply(function(q, n, y, a, b) mapply(qbeta, q, a + y,
n - y + b), q, n, y, MoreArgs = list(samp_A, samp_B)))
qq_hier <- data.frame(id = 1:length(n), n = n, y = y, q5 = qh(0.05, n, y), q95 = qh(0.95,
n, y))
m_post <- mean(samp_A/(samp_A + samp_B))
m_post
[1] 0.1459348
mean(samp_A)
[1] 2.298889
mean(samp_B)
[1] 13.47576
v_post <- mean(samp_A * samp_B/((samp_A + samp_B)^2 + (samp_A + samp_B + 1)))
v_post
[1] 0.115636
qq_m_hier <- quantile(samp_A/(samp_A + samp_B), c(0.05, 0.95))
diff(qq_m_hier)
95%
0.03512298 per il modello EB
a_eb <- 1.4
b_eb <- 8.6
qq_eb <- data.frame(id = 1:length(n), n = n, y = y, q5 = qbeta(0.05, y + a_eb, n - y +
b_eb), q95 = qbeta(0.95, y + a_eb, n - y + b_eb))
m_eb <- a_eb/(a_eb + b_eb)
m_eb
[1] 0.14
v_eb <- a_eb * b_eb/((a_eb + b_eb)^2 + (a_eb + b_eb + 1))
v_eb
[1] 0.1084685Grafici
plot_q_s <- ggplot(data = qq_separate[seq(1, length(n), by = 3), ], aes(x = jitter(n,
amount = 1), ymin = q5, ymax = q95)) + geom_linerange() + geom_hline(yintercept = 0.5,
linetype = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2, col = c(4)) +
xlim(c(0, 60)) + ylim(c(0, 0.6))plot_q_h <- ggplot(data = qq_hier[seq(1, length(n), by = 3), ], aes(x = jitter(n, amount = 1),
ymin = q5, ymax = q95)) + geom_linerange(color = "red") + geom_hline(yintercept = c(m_post,
qq_m_hier), linetype = 2, col = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2,
col = 4) + xlim(c(0, 60)) + ylim(c(0, 0.6))plot_q_eb <- ggplot(data = qq_eb[seq(1, length(n), by = 3), ], aes(x = jitter(n, amount = 1),
ymin = q5, ymax = q95)) + geom_linerange(color = "orange") + geom_hline(yintercept = c(m_eb),
linetype = 2, col = c("orange")) + xlim(c(0, 60)) + ylim(c(0, 0.6))grid.arrange(plot_q_s, plot_q_h, plot_q_eb)ggplot(data = qq_separate[seq(1, length(n), by = 1), ], aes(x = jitter(n, amount = 0.5),
ymin = q5, ymax = q95)) + geom_linerange() + geom_hline(yintercept = c(m_post, qq_m_hier),
linetype = 2, col = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2,
col = c(4)) + geom_linerange(data = qq_hier[seq(1, length(n), by = 1), ], aes(x = jitter(n,
amount = 0.5), ymin = q5, ymax = q95), col = 2) + xlim(c(0, 60)) + ylim(c(0, 0.6))ggplot(data = qq_eb[seq(1, length(n), by = 1), ], aes(x = jitter(n, amount = 0.5), ymin = q5,
ymax = q95)) + geom_linerange(col = "orange") + geom_hline(yintercept = c(m_post,
qq_m_hier), linetype = 2, col = 2) + geom_hline(yintercept = c(m_eb), linetype = 2,
col = c("orange")) + geom_linerange(data = qq_hier[seq(1, length(n), by = 1), ], aes(x = jitter(n,
amount = 0.5), ymin = q5, ymax = q95), col = 2) + xlim(c(0, 60)) + ylim(c(0, 0.6))Gelman’s way
\((\gamma,\delta)\)=\(\left(\log(\frac{\alpha}{\beta}),\log(\alpha+\beta)\right)\)
\(\log(\frac{\alpha}{\beta})={\rm logit}(\frac{\alpha}{\alpha+\beta})\)
log_post_funct
log_post <- function(gamma, delta) {
a <- exp(gamma + delta)/(1 + exp(gamma))
b <- exp(delta)/(1 + exp(gamma))
ldens <- 0
for (i in 1:length(y)) {
ldens <- ldens + lgamma(a + b) - lgamma(a) - lgamma(b) + lgamma(a + y[i]) + lgamma(b +
n[i] - y[i]) - lgamma(a + b + n[i])
}
ldens - 5/2 * log(a + b) + log(a) + log(b)
}plot_join_post_funct
plot_join_post <- function(ll = 200, sim) {
gamma <- seq(-2.3, -1.3, le = ll)
delta <- seq(1, 5, le = ll)
contours <- seq(0.05, 0.95, 0.1)
logdens <- outer(gamma, delta, log_post)
dens <- exp(logdens - max(logdens))
contour(gamma, delta, dens, levels = contours, xlab = "log(alpha/beta)", ylab = "log(alpha+beta)")
points(log(sim$aaa/sim$bbb), log(sim$aaa + sim$bbb), cex = 0.25, col = 4)
mtext("Posterior density, 3, line=1, cex=1.5")
}grid_value
grid_value <- function(ll = 200) {
gamma <- seq(-2.3, -1.3, le = ll)
delta <- seq(1, 5, le = ll)
PP <- matrix(NA, ll, ll)
for (i in 1:ll) {
for (j in 1:ll) {
PP[i, j] <- log_post(gamma[i], delta[j])
}
}
PP <- exp(PP - max(PP))
PP <- PP/sum(PP)
return(list(PP = PP, gamma = gamma, delta = delta))
}Draw posterior sample of alpha, beta, and theta (using the inverse-cdf method)
sampling
sampling <- function(mm = 1000, JJ, gamma, delta) {
ggg <- aaa <- ddd <- bbb <- NULL
gamma_mar <- apply(JJ, 1, sum)
delta_mar <- apply(JJ, 2, sum)
ttt <- matrix(NA, length(y), mm)
for (l in 1:mm) {
uuu <- runif(1, 0, 1)
ggg[l] <- max(gamma[cumsum(gamma_mar) < uuu])
junk <- (1:length(gamma))[gamma == ggg[l]]
conditional <- JJ[junk, ]/sum(JJ[junk, ])
uuu <- runif(1, 0, 1)
ddd[l] <- max(delta[cumsum(conditional) < uuu])
aaa[l] <- exp(ggg[l] + ddd[l])/(1 + exp(ggg[l]))
bbb[l] <- exp(ddd[l])/(1 + exp(ggg[l]))
for (i in 1:length(y)) {
ttt[i, l] <- rbeta(1, aaa[l] + y[i], bbb[l] + n[i] - y[i])
}
}
return(list(aaa = aaa, bbb = bbb, ttt = ttt))
}o_grid <- grid_value()
names(o_grid)
[1] "PP" "gamma" "delta"
o_sampling <- sampling(JJ = o_grid$PP, gamma = o_grid$gamma, delta = o_grid$delta)
names(o_sampling)
[1] "aaa" "bbb" "ttt"
range(o_sampling$aaa)
[1] 0.7647167 7.5250194
range(o_sampling$bbb)
[1] 3.465322 46.799456
range(o_sampling$ttt)
[1] 3.951663e-05 5.774023e-01
o_sampling$aaa %>%
mean
[1] 2.314092
o_sampling$bbb %>%
mean
[1] 13.91046
mean_post <- o_sampling$aaa/(o_sampling$aaa + o_sampling$bbb)
mean_post %>%
mean()
[1] 0.1434209
mean_post %>%
quantile(c(0.05, 0.9))
5% 90%
0.1225401 0.1601942 plot_join_post(sim = o_sampling)