Lab 4

Hierarchical binomial model

library(dplyr)  # optional
library(kableExtra)
library(ggplot2)
theme_set(theme_minimal())
library(gridExtra)
library(tidyr)
library(latex2exp)

Example

Application of a conjugate hierarchical model to the study of tumors in rats

Reference

Gelman: p. 108, Sec 5.3, Bayesian analysis of conjugate hierarchical models

Language

R, B-B.r

Subject

Data

y <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 1, 5, 2, 5, 3, 2, 7, 7, 3, 3, 2, 9, 10, 4, 4, 4, 4, 4, 4, 4, 10, 4, 4,
    4, 5, 11, 12, 5, 5, 6, 5, 6, 6, 6, 6, 16, 15, 15, 9, 4)
n <- c(20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 17, 20, 20, 20, 20, 19, 19,
    18, 18, 25, 24, 23, 20, 20, 20, 20, 20, 20, 10, 49, 19, 46, 27, 17, 49, 47, 20, 20,
    13, 48, 50, 20, 20, 20, 20, 20, 20, 20, 48, 19, 19, 19, 22, 46, 49, 20, 20, 23, 19,
    22, 20, 20, 20, 52, 46, 47, 24, 14)

Values of \(\theta\) at which to evaluate the densities

x <- seq(1e-04, 0.9999, length.out = 1000)

Helper function to evaluate the density at each value

bdens <- function(n, y, x) dbeta(x, y + 1, n - y + 1)

What density is this?

Which prior does it correspond to?

Separate analyses

df_sep <- mapply(bdens, n, y, MoreArgs = list(x = x)) %>%
    as.data.frame() %>%
    cbind(x)
df_sep %>%
    dim
[1] 1000   72
df_sep %>%
    names()
 [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11" "V12" "V13"
[14] "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24" "V25" "V26"
[27] "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39"
[40] "V40" "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49" "V50" "V51" "V52"
[53] "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60" "V61" "V62" "V63" "V64" "V65"
[66] "V66" "V67" "V68" "V69" "V70" "V71" "x"  

df_sep <- df_sep %>%
    gather(ind, p, -x)
df_sep %>%
    dim
[1] 71000     3
df_sep %>%
    names()
[1] "x"   "ind" "p"  

Plot of the posterior distributions of \(\theta_j\)

labs1 <- paste("posterior of", c("theta_j", "theta_71"))
plot_sep <- ggplot(data = df_sep) + geom_line(aes(x = x, y = p, color = (ind == "V71"),
    group = ind)) + labs(x = expression(theta), y = "", title = "Separate model", color = "") +
    scale_y_continuous(breaks = NULL) + scale_color_manual(values = c("blue", "magenta"),
    labels = labs1) + theme(legend.background = element_blank(), legend.position = c(0.8,
    0.9))

plot_sep

Pooled model

df_pool <- data.frame(x = x, p = dbeta(x, sum(y) + 1, sum(n) - sum(y) + 1))
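
For reference, df_pool evaluates the pooled posterior for a single common \(\theta\) given all the data,

\[
p(\theta \mid y) = \mathrm{Beta}\!\left(\theta \;\Big|\; \sum\nolimits_j y_j + 1,\; \sum\nolimits_j n_j - \sum\nolimits_j y_j + 1\right).
\]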

Plot for the model that pools all the data

plot_pool <- ggplot(data = df_pool) + geom_line(aes(x = x, y = p, color = "1")) + labs(x = expression(theta),
    y = "", title = "Pooled model", color = "") + scale_y_continuous(breaks = NULL) +
    scale_color_manual(values = "brown", labels = "Posterior of common theta") + theme(legend.background = element_blank(),
    legend.position = c(0.7, 0.9))

We compare the separate analysis with the pooled one

grid.arrange(plot_sep, plot_pool)

Hierarchical model

Evaluating the posterior density of the hyperparameters

Computation of the marginal posterior of alpha and beta in the hierarchical model.

Using a grid

A <- seq(0.5, 6, length.out = 100)
B <- seq(3, 33, length.out = 100)

Vectors containing all pairwise combinations of A and B

cA <- rep(A, each = length(B))  # change 'slowly'
cB <- rep(B, length(A))  # change 'quickly'

Work on the log scale for numerical accuracy

lpfun <- function(a, b, y, n) log(a + b) * (-5/2) + sum(lgamma(a + b) - lgamma(a) - lgamma(b) +
    lgamma(a + y) + lgamma(b + n - y) - lgamma(a + b + n))
lp <- mapply(lpfun, cA, cB, MoreArgs = list(y, n))
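
On the log scale, lpfun evaluates the marginal posterior of the hyperparameters under the hyperprior \(p(\alpha,\beta)\propto(\alpha+\beta)^{-5/2}\) (Gelman, Sec. 5.3):

\[
p(\alpha,\beta \mid y) \;\propto\; (\alpha+\beta)^{-5/2} \prod_{j=1}^{J} \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\,\Gamma(\beta)} \, \frac{\Gamma(\alpha+y_j)\,\Gamma(\beta+n_j-y_j)}{\Gamma(\alpha+\beta+n_j)}.
\]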

Subtract the maximum to avoid over/underflow when exponentiating

df_marg <- data.frame(x = cA, y = cB, p = exp(lp - max(lp)))

Plot of the marginal posterior density

title1 <- TeX("The marginal of $\\alpha$ and $\\beta$")
ggplot(data = df_marg, aes(x = x, y = y)) + geom_raster(aes(fill = p, alpha = p), interpolate = T) +
    geom_contour(aes(z = p), colour = "black", size = 0.2) + coord_cartesian(xlim = c(1,
    5), ylim = c(4, 26)) + labs(x = TeX("$\\alpha$"), y = TeX("$\\beta$"), title = title1) +
    scale_fill_gradient(low = "yellow", high = "red", guide = "none") + scale_alpha(range = c(0,
    1), guide = "none")

Sampling the hyperparameters

Sampling from the grid (with replacement)

nsamp <- 100
# sample an integer from 1 to marg prob length (10,000), 100 times
samp_indices <- sample(length(df_marg$p), size = nsamp, replace = T, prob = df_marg$p/sum(df_marg$p))
samp_indices
  [1] 8081 3539 4239 2625 2325 4654 5155 1617 4953 3641 7074 3843 1414 5756 2737 4857
 [17] 3234 2423 2834 3936 2428 1420 3031 3433 8372 3640 3029 2834 1822 3941 2833 2629
 [33] 2329 4039 3432 4751 1115 3536 4656 3742 1419 4957 1315 1922 2024 6060 4343 4242
 [49] 2028 2523 2731 3741 3143 4456 4649 4952 1008 2325 2836 1314 3839 2835 3131 3742
 [65] 3134 1006 2226 2229 4854 2836 1914 3739 3540 3135  611 2426 3939 1824 4249 1416
 [81] 2122 2526 2934 2922 3846 4051 3033  911 3341 3134 2427 1822 2123 3343 2428 5559
 [97] 3742 5148 4550 2833
samp_A <- cA[samp_indices]
samp_B <- cB[samp_indices]

The superpopulation of the thetas

Plot of \(20\) draws from the distribution over Beta(alpha, beta) distributions, i.e., plots of Beta(alpha, beta) using the posterior samples of alpha and beta

df_psamp <- mapply(function(a, b, x) dbeta(x, a, b), samp_A, samp_B, MoreArgs = list(x = x)) %>%
    as.data.frame() %>%
    cbind(x) %>%
    gather(ind, p, -x)

df_psamp$ind %>%
    head
[1] "V1" "V1" "V1" "V1" "V1" "V1"

# helper function to convert ind to numeric for subsetting
indtonum <- function(x) strtoi(substring(x, 2))

title2 <- TeX("Beta($\\alpha,\\beta$) given posterior draws of $\\alpha$ and $\\beta$")
plot_psamp <- ggplot(data = subset(df_psamp, indtonum(ind) <= 20)) + geom_line(aes(x = x,
    y = p, group = ind), color = "forestgreen") + labs(x = expression(theta), y = "",
    title = title2) + scale_y_continuous(breaks = NULL)

The average of the distributions above is the predictive distribution for a new theta, and also the distribution of the population from which the theta_j are drawn
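
In formulas, this average approximates

\[
p(\tilde\theta \mid y) = \int \mathrm{Beta}(\tilde\theta \mid \alpha,\beta)\, p(\alpha,\beta \mid y)\, d\alpha\, d\beta \;\approx\; \frac{1}{S}\sum_{s=1}^{S} \mathrm{Beta}\!\left(\tilde\theta \mid \alpha^{(s)}, \beta^{(s)}\right),
\]

which is what the rowMeans below computes.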

Note the use of spread (tidyr pkg)

df_psampmean <- spread(df_psamp, ind, p) %>%
    subset(select = -x) %>%
    rowMeans() %>%
    data.frame(x = x, p = .)
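
Note: spread() is superseded in recent versions of tidyr; assuming tidyr >= 1.0 is installed, an equivalent reshaping (a sketch, not used below) would be:

df_wide <- tidyr::pivot_wider(df_psamp, names_from = ind, values_from = p)  # one row per x, one column per draw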


title3 <- TeX("Population distribution for each $\\theta_j$ and new $\\theta$s")
plot_psampmean <- ggplot(data = df_psampmean) + geom_line(aes(x = x, y = p), color = "forestgreen") +
    labs(x = expression(theta), y = "", title = title3) + scale_y_continuous(breaks = NULL)

Combining the plots

grid.arrange(plot_psamp, plot_psampmean)

Posterior distributions of the theta_j

Finally, we compare the separate models with the hierarchical model (plotting one curve out of every 7 to keep the figure readable)

Separate models

plot_sep7 <- ggplot(data = subset(df_sep, indtonum(ind)%%7 == 0)) + geom_line(aes(x = x,
    y = p, color = (ind == "V49"), group = ind)) + labs(x = expression(theta), y = "",
    title = "Separate model", color = "") + scale_y_continuous(breaks = NULL) + scale_color_manual(values = c("blue",
    "red"), guide = "none") + theme(legend.background = element_blank(), legend.position = c(0.8,
    0.9))

Hierarchical model

Note that the marginal posterior distributions of the theta_j are narrower than under the separate model, thanks to the information borrowed from the other theta_j's
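
The shrinkage can be read off the conditional posterior mean,

\[
E\left[\theta_j \mid \alpha,\beta,y\right] = \frac{\alpha+y_j}{\alpha+\beta+n_j} = \frac{n_j}{\alpha+\beta+n_j}\,\frac{y_j}{n_j} + \frac{\alpha+\beta}{\alpha+\beta+n_j}\,\frac{\alpha}{\alpha+\beta},
\]

a weighted average of the raw rate \(y_j/n_j\) and the population mean \(\alpha/(\alpha+\beta)\): the smaller \(n_j\), the stronger the pull toward the population mean.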

Average density over the samples of (a, b), for each pair (n, y), at each value x of theta

bdens2 <- function(n, y, a, b, x) rowMeans(mapply(dbeta, a + y, n - y + b, MoreArgs = list(x = x)))
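
bdens2 implements the mixture approximation

\[
p(\theta_j \mid y) \;\approx\; \frac{1}{S}\sum_{s=1}^{S} \mathrm{Beta}\!\left(\theta_j \mid \alpha^{(s)}+y_j,\; \beta^{(s)}+n_j-y_j\right),
\]

averaging the conditional posterior densities over the posterior draws of \((\alpha,\beta)\).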

df_hier <- mapply(bdens2, n, y, MoreArgs = list(samp_A, samp_B, x)) %>%
    as.data.frame() %>%
    cbind(x) %>%
    gather(ind, p, -x)

plot_hier7 <- ggplot(data = subset(df_hier, indtonum(ind)%%7 == 0)) + geom_line(aes(x = x,
    y = p, color = (ind == "V49"), group = ind)) + labs(x = expression(theta), y = "",
    title = "Hierarchical model", color = "") + scale_color_manual(values = c("blue",
    "red"), guide = "none") + scale_y_continuous(breaks = NULL) + theme(legend.background = element_blank(),
    legend.position = c(0.8, 0.9))

The two plots together

grid.arrange(plot_sep7, plot_hier7)

Intervals for theta_j
  • Pooled model: what do you expect for the estimate and the CI of the common mean?
  • Separate models: the estimates of \(\theta_j\) are close to the ML estimates (\(y/n\)): why? How do you expect the lengths of the CIs to compare with the hierarchical model?
  • EB model: the estimates of \(\theta_j\) are close to those of the hierarchical model: why? How do you expect the lengths of the CIs to compare with the hierarchical model? (A quick numerical check of the interval widths is sketched after the EB model below.)

90% interval for the pooled model

m_pool <- (sum(y) + 1)/(sum(n) + 2)
m_pool
[1] 0.1539345
qq_pool <- qbeta(c(0.05, 0.95), sum(y) + 1, sum(n) - sum(y) + 1)
diff(qq_pool)
[1] 0.02843885

90% intervals for the separate models

qq_separate <- data.frame(id = 1:length(n), n = n, y = y, q5 = qbeta(0.05, y + 1, n -
    y + 1), q95 = qbeta(0.95, y + 1, n - y + 1))

For the hierarchical model

qh <- function(q, n, y) colMeans(mapply(function(q, n, y, a, b) mapply(qbeta, q, a + y,
    n - y + b), q, n, y, MoreArgs = list(samp_A, samp_B)))

qq_hier <- data.frame(id = 1:length(n), n = n, y = y, q5 = qh(0.05, n, y), q95 = qh(0.95,
    n, y))

m_post <- mean(samp_A/(samp_A + samp_B))
m_post
[1] 0.1459348
mean(samp_A)
[1] 2.298889
mean(samp_B)
[1] 13.47576
v_post <- mean(samp_A * samp_B/((samp_A + samp_B)^2 * (samp_A + samp_B + 1)))
qq_m_hier <- quantile(samp_A/(samp_A + samp_B), c(0.05, 0.95))
diff(qq_m_hier)
       95% 
0.03512298 

For the EB model

a_eb <- 1.4
b_eb <- 8.6

qq_eb <- data.frame(id = 1:length(n), n = n, y = y, q5 = qbeta(0.05, y + a_eb, n - y +
    b_eb), q95 = qbeta(0.95, y + a_eb, n - y + b_eb))

m_eb <- a_eb/(a_eb + b_eb)
m_eb
[1] 0.14
v_eb <- a_eb * b_eb/((a_eb + b_eb)^2 * (a_eb + b_eb + 1))
v_eb
[1] 0.01094545
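
As a quick numerical check of the questions above (a sketch; output not shown), the average widths of the 90% intervals under the three models can be compared directly:

# average 90% interval width under each model
mean(qq_separate$q95 - qq_separate$q5)
mean(qq_hier$q95 - qq_hier$q5)
mean(qq_eb$q95 - qq_eb$q5)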

Plots

plot_q_s <- ggplot(data = qq_separate[seq(1, length(n), by = 3), ], aes(x = jitter(n,
    amount = 1), ymin = q5, ymax = q95)) + geom_linerange() + geom_hline(yintercept = 0.5,
    linetype = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2, col = c(4)) +
    xlim(c(0, 60)) + ylim(c(0, 0.6))
plot_q_h <- ggplot(data = qq_hier[seq(1, length(n), by = 3), ], aes(x = jitter(n, amount = 1),
    ymin = q5, ymax = q95)) + geom_linerange(color = "red") + geom_hline(yintercept = c(m_post,
    qq_m_hier), linetype = 2, col = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2,
    col = 4) + xlim(c(0, 60)) + ylim(c(0, 0.6))
plot_q_eb <- ggplot(data = qq_eb[seq(1, length(n), by = 3), ], aes(x = jitter(n, amount = 1),
    ymin = q5, ymax = q95)) + geom_linerange(color = "orange") + geom_hline(yintercept = c(m_eb),
    linetype = 2, col = c("orange")) + xlim(c(0, 60)) + ylim(c(0, 0.6))
grid.arrange(plot_q_s, plot_q_h, plot_q_eb)

ggplot(data = qq_separate[seq(1, length(n), by = 1), ], aes(x = jitter(n, amount = 0.5),
    ymin = q5, ymax = q95)) + geom_linerange() + geom_hline(yintercept = c(m_post, qq_m_hier),
    linetype = 2, col = 2) + geom_hline(yintercept = c(m_pool, qq_pool), linetype = 2,
    col = c(4)) + geom_linerange(data = qq_hier[seq(1, length(n), by = 1), ], aes(x = jitter(n,
    amount = 0.5), ymin = q5, ymax = q95), col = 2) + xlim(c(0, 60)) + ylim(c(0, 0.6))

ggplot(data = qq_eb[seq(1, length(n), by = 1), ], aes(x = jitter(n, amount = 0.5), ymin = q5,
    ymax = q95)) + geom_linerange(col = "orange") + geom_hline(yintercept = c(m_post,
    qq_m_hier), linetype = 2, col = 2) + geom_hline(yintercept = c(m_eb), linetype = 2,
    col = c("orange")) + geom_linerange(data = qq_hier[seq(1, length(n), by = 1), ], aes(x = jitter(n,
    amount = 0.5), ymin = q5, ymax = q95), col = 2) + xlim(c(0, 60)) + ylim(c(0, 0.6))

Gelman’s way

\((\gamma,\delta)=\left(\log\frac{\alpha}{\beta},\,\log(\alpha+\beta)\right)\)

\(\log(\frac{\alpha}{\beta})={\rm logit}(\frac{\alpha}{\alpha+\beta})\)
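
Inverting the transformation (as done inside log_post below) gives

\[
\alpha = \frac{e^{\gamma+\delta}}{1+e^{\gamma}}, \qquad \beta = \frac{e^{\delta}}{1+e^{\gamma}},
\]

and the change of variables contributes a Jacobian factor proportional to \(\alpha\beta\), which is why log_post adds \(\log\alpha + \log\beta\) to the log density.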

log_post_funct

log_post <- function(gamma, delta) {
    # back-transform from (gamma, delta) = (log(a/b), log(a+b)) to (a, b)
    a <- exp(gamma + delta)/(1 + exp(gamma))
    b <- exp(delta)/(1 + exp(gamma))
    # log marginal likelihood of each group (beta-binomial, up to a constant in a and b)
    ldens <- 0
    for (i in 1:length(y)) {
        ldens <- ldens + lgamma(a + b) - lgamma(a) - lgamma(b) + lgamma(a + y[i]) + lgamma(b +
            n[i] - y[i]) - lgamma(a + b + n[i])
    }
    # add the log hyperprior (a+b)^(-5/2) and the log Jacobian a*b of the transformation
    ldens - 5/2 * log(a + b) + log(a) + log(b)
}

plot_join_post_funct

plot_join_post <- function(ll = 200, sim) {
    gamma <- seq(-2.3, -1.3, le = ll)
    delta <- seq(1, 5, le = ll)
    contours <- seq(0.05, 0.95, 0.1)
    logdens <- outer(gamma, delta, log_post)
    dens <- exp(logdens - max(logdens))
    contour(gamma, delta, dens, levels = contours, xlab = "log(alpha/beta)", ylab = "log(alpha+beta)")
    points(log(sim$aaa/sim$bbb), log(sim$aaa + sim$bbb), cex = 0.25, col = 4)
    mtext("Posterior density, 3, line=1, cex=1.5")
}

grid_value

grid_value <- function(ll = 200) {
    gamma <- seq(-2.3, -1.3, le = ll)
    delta <- seq(1, 5, le = ll)
    PP <- matrix(NA, ll, ll)
    for (i in 1:ll) {
        for (j in 1:ll) {
            PP[i, j] <- log_post(gamma[i], delta[j])
        }
    }
    PP <- exp(PP - max(PP))
    PP <- PP/sum(PP)
    return(list(PP = PP, gamma = gamma, delta = delta))
}

Draw posterior sample of alpha, beta, and theta (using the inverse-cdf method)

sampling

sampling <- function(mm = 1000, JJ, gamma, delta) {
    ggg <- aaa <- ddd <- bbb <- NULL
    # marginals on the grid: gamma (row sums) and delta (column sums; not used below)
    gamma_mar <- apply(JJ, 1, sum)
    delta_mar <- apply(JJ, 2, sum)
    ttt <- matrix(NA, length(y), mm)
    for (l in 1:mm) {
        # draw gamma from its marginal via the inverse-cdf method on the grid
        uuu <- runif(1, 0, 1)
        ggg[l] <- max(gamma[cumsum(gamma_mar) < uuu])
        # draw delta from its conditional given the sampled gamma
        junk <- (1:length(gamma))[gamma == ggg[l]]
        conditional <- JJ[junk, ]/sum(JJ[junk, ])
        uuu <- runif(1, 0, 1)
        ddd[l] <- max(delta[cumsum(conditional) < uuu])
        # back-transform to (alpha, beta)
        aaa[l] <- exp(ggg[l] + ddd[l])/(1 + exp(ggg[l]))
        bbb[l] <- exp(ddd[l])/(1 + exp(ggg[l]))
        # draw each theta_j from its conditional Beta posterior
        for (i in 1:length(y)) {
            ttt[i, l] <- rbeta(1, aaa[l] + y[i], bbb[l] + n[i] - y[i])
        }
    }
    return(list(aaa = aaa, bbb = bbb, ttt = ttt))
}
o_grid <- grid_value()
names(o_grid)
[1] "PP"    "gamma" "delta"

o_sampling <- sampling(JJ = o_grid$PP, gamma = o_grid$gamma, delta = o_grid$delta)
names(o_sampling)
[1] "aaa" "bbb" "ttt"
range(o_sampling$aaa)
[1] 0.7647167 7.5250194
range(o_sampling$bbb)
[1]  3.465322 46.799456
range(o_sampling$ttt)
[1] 3.951663e-05 5.774023e-01

o_sampling$aaa %>%
    mean
[1] 2.314092
o_sampling$bbb %>%
    mean
[1] 13.91046

mean_post <- o_sampling$aaa/(o_sampling$aaa + o_sampling$bbb)

mean_post %>%
    mean()
[1] 0.1434209
mean_post %>%
    quantile(c(0.05, 0.9))
       5%       90% 
0.1225401 0.1601942 
plot_join_post(sim = o_sampling)