Lab on computation
Gibbs sampling
library(dplyr)
library(ggplot2)
theme_set(theme_minimal())
library(tidyr)
library(gganimate)
library(MASS)
library(rstan)
Example from BDA3, p. 277
Bivariate normal distribution
Consider a single observation \(y=(y_1, y_2)\) from a bivariate normally distributed population with unknown mean \(\theta = (\theta_1, \theta_2)\) and known covariance matrix \[\left.\begin{pmatrix} y_1\\ y_2\end{pmatrix}\right|\theta \sim N\left(\begin{pmatrix} \theta_1\\ \theta_2\end{pmatrix},\begin{pmatrix} 1&\rho\\\rho&1\end{pmatrix}\right)\]
With a uniform prior distribution on \(\theta\), the posterior distribution is \[\left.\begin{pmatrix} \theta_1\\ \theta_2\end{pmatrix} \right|y \sim N\left(\begin{pmatrix} y_1\\ y_2\end{pmatrix},\begin{pmatrix} 1&\rho\\\rho&1\end{pmatrix}\right)\]
Parameters of a normal distribution used as a toy target distribution
y1 <- 0
y2 <- 0
r <- 0.8
S <- diag(2)
S[1, 2] <- r
S[2, 1] <- r
S
[,1] [,2]
[1,] 1.0 0.8
[2,] 0.8 1.0
Although it is simple to draw directly from the joint posterior distribution of \((\theta_1, \theta_2)\), for the purpose of exposition we demonstrate the Gibbs sampler here.
Sample from the toy distribution to visualize the 90% HPD region with ggplot’s stat_ellipse()
dft <- data.frame(mvrnorm(1e+05, c(0, 0), S))
Starting value of the chain (an overdispersed starting point)
t1 <- -2.5
t2 <- 2.5
Number of iterations
M <- 2 * 2500
N.B. In this implementation one basic iteration updates only one parameter, so a complete iteration that updates both parameters takes two basic iterations. This makes it easy to plot the zig-zagging path of the Gibbs sampler. You could instead save only the final state of each complete iteration that updates all the parameters.
Gibbs sampling
We need the conditional posterior distributions, which, from the properties of the multivariate normal distribution are \[\begin{aligned} \theta_1|\theta_2,y\sim N(y_1+\rho(\theta_2-y_2), 1-\rho^2)\\ \theta_2|\theta_1,y\sim N(y_2+\rho(\theta_1-y_1), 1-\rho^2) \end{aligned}\]
# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
dim(tt)
[1] 5000 2
tt[1, ] <- c(t1, t2) # Save starting point
# For demonstration load pre-computed values
load("demo11_1.RData")
# tt is a M x 2 array, with M draws of both theta_1 and theta_2
dim(tt)
[1] 2001 2
head(tt)
[,1] [,2]
[1,] -2.5000000 2.500000
[2,] 2.5186384 2.500000
[3,] 2.5186384 2.071432
[4,] 1.1460003 2.071432
[5,] 1.1460003 1.440903
[6,] 0.8898989 1.440903
# Replace this with your algorithm!
# etc etc etc
Insert your own Gibbs sampling here; a minimal sketch of one possible implementation is shown below. The rest is for illustration.
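A minimal sketch, assuming the allocation chunk above has been run and replacing the load() of pre-computed values; this is one illustrative implementation of the componentwise updates given by the conditional distributions above, not necessarily the code that produced demo11_1.RData.
# One possible componentwise Gibbs sampler: each basic iteration updates a
# single parameter, so two consecutive rows of tt form one complete iteration.
t1_cur <- t1
t2_cur <- t2
for (i in 2:M) {
  if (i %% 2 == 0) {
    # draw theta_1 | theta_2, y ~ N(y1 + r*(theta_2 - y2), 1 - r^2)
    t1_cur <- rnorm(1, mean = y1 + r * (t2_cur - y2), sd = sqrt(1 - r^2))
  } else {
    # draw theta_2 | theta_1, y ~ N(y2 + r*(theta_1 - y1), 1 - r^2)
    t2_cur <- rnorm(1, mean = y2 + r * (t1_cur - y1), sd = sqrt(1 - r^2))
  }
  tt[i, ] <- c(t1_cur, t2_cur)  # save the state after every basic iteration
}
Saving the state after every basic iteration is what makes the zig-zag path visible in the plots below.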
Take the first 50 draws (100 componentwise updates) to illustrate how the sampler works, showing the componentwise updating of the Gibbs iterations.
df100 <- data.frame(id = rep(1, 100), iter = 1:100,
                    th1 = tt[1:100, 1], th2 = tt[1:100, 2],
                    th1l = c(tt[1, 1], tt[1:(100 - 1), 1]),
                    th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
df100 %>%
head()
id iter th1 th2 th1l th2l
1 1 1 -2.5000000 2.500000 -2.500000 2.500000
2 1 2 2.5186384 2.500000 -2.500000 2.500000
3 1 3 2.5186384 2.071432 2.518638 2.500000
4 1 4 1.1460003 2.071432 2.518638 2.071432
5 1 5 1.1460003 1.440903 1.146000 2.071432
6 1 6 0.8898989 1.440903 1.146000 1.440903
df100 %>%
tail
id iter th1 th2 th1l th2l
95 1 95 2.7705204 1.5521852 2.7705204 2.3629610
96 1 96 0.9651865 1.5521852 2.7705204 1.5521852
97 1 97 0.9651865 0.7599717 0.9651865 1.5521852
98 1 98 0.5803784 0.7599717 0.9651865 0.7599717
99 1 99 0.5803784 0.1376108 0.5803784 0.7599717
100 1 100 0.6603094 0.1376108 0.5803784 0.1376108
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
ind1 <- (1:50) * 2 - 1
ind1
[1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53
[28] 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99
df100s <- df100
df100s[ind1 + 1, 3:4] <- df100s[ind1, 3:4]
df100s %>%
head
id iter th1 th2 th1l th2l
1 1 1 -2.500000 2.500000 -2.500000 2.500000
2 1 2 -2.500000 2.500000 -2.500000 2.500000
3 1 3 2.518638 2.071432 2.518638 2.500000
4 1 4 2.518638 2.071432 2.518638 2.071432
5 1 5 1.146000 1.440903 1.146000 2.071432
6 1 6 1.146000 1.440903 1.146000 1.440903
j <- 100
p1 <- ggplot() +
  geom_point(data = df100s[1:j, ], aes(th1, th2, group = id, color = "1")) +
  geom_segment(data = df100[1:j, ], aes(x = th1, xend = th1l, y = th2, yend = th2l, color = "2")) +
  stat_ellipse(data = dft, aes(x = X1, y = X2, color = "3"), level = 0.9) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  labs(x = "theta1", y = "theta2") +
  scale_color_manual(values = c("red", "forestgreen", "blue"), labels = labs1) +
  guides(color = guide_legend(override.aes = list(shape = c(16, NA, NA), linetype = c(0, 1, 1)))) +
  theme(legend.position = "bottom", legend.title = element_blank())
The following generates a gif animation of the steps of the sampler (might take 10 seconds).
animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))
Show only the end result as a static figure
p1
Highlight the warm-up period of the first 15 draws in purple
p1 + geom_point(data = df100[ind1[1:15], ], aes(th1, th2), color = "magenta")
Show 950 draws after a warm-up period of 50 draws is removed
# Take the first 1000 observations
s <- 1000
dfs <- data.frame(th1 = tt[1:s, 1], th2 = tt[1:s, 2])
# Remove warm-up period of 50 first draws later
warm <- 50
labs2 <- c("Draws", "90% HPD")
ggplot() +
  geom_point(data = dfs[-(1:warm), ], aes(th1, th2, color = "1"), alpha = 0.5, size = 0.75) +
  stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  labs(x = "theta1", y = "theta2") +
  scale_color_manual(values = c("steelblue", "blue"), labels = labs2) +
  guides(color = guide_legend(override.aes = list(shape = c(16, NA), linetype = c(0, 1), alpha = c(1, 1)))) +
  theme(legend.position = "bottom", legend.title = element_blank())
Convergence diagnostics
samp <- tt
dim(samp)
[1] 2001 2
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
samp %>%
head
, , 1
[,1]
[1,] -2.5000000
[2,] 2.5186384
[3,] 2.5186384
[4,] 1.1460003
[5,] 1.1460003
[6,] 0.8898989
, , 2
[,1]
[1,] 2.500000
[2,] 2.500000
[3,] 2.071432
[4,] 2.071432
[5,] 1.440903
[6,] 1.440903
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 2001; warmup = 1000):
Q5 Q50 Q95 Mean SD Rhat Bulk_ESS Tail_ESS
V1 -1.6 0.0 1.6 0 1 1.03 124 204
V2 -1.5 0.1 1.5 0 1 1.02 127 204
For each parameter, Bulk_ESS and Tail_ESS are crude measures of
effective sample size for bulk and tail quantities respectively (an ESS > 100
per chain is considered good), and Rhat is the potential scale reduction
factor on rank normalized split chains (at convergence, Rhat <= 1.05).
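To make the R-hat diagnostic more concrete, here is a minimal sketch of the classical split-R-hat computed by hand for a single chain; split_rhat is a helper defined here only for illustration, and rstan’s monitor reports the rank-normalized version, so its values differ slightly.
# Classical split-Rhat: split the chain into two halves and compare
# between-half and within-half variances.
split_rhat <- function(x) {
  n <- floor(length(x) / 2)
  halves <- cbind(x[1:n], x[(n + 1):(2 * n)])  # two half-chains
  B <- n * var(colMeans(halves))               # between-half variance
  W <- mean(apply(halves, 2, var))             # within-half variance
  var_plus <- (n - 1) / n * W + B / n          # overestimate of the posterior variance
  sqrt(var_plus / W)                           # potential scale reduction factor
}
# applied to the post-warmup draws of theta_1 (monitor treats the first 1000 as warmup)
split_rhat(tt[1001:2001, 1])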
res
mean se_mean sd 25% 50% 75% n_eff Rhat
V1 0.02844871 0.08655818 0.9640420 -0.5952721 0.03076163 0.6780889 124 1.027028
V2 0.02589387 0.08474349 0.9564332 -0.6184635 0.06950219 0.6851340 128 1.017276
valid Q5 Q50 Q95 MCSE_Q25 MCSE_Q50 MCSE_Q75 MCSE_SD
V1 1 -1.552452 0.03076163 1.561233 0.11667750 0.07573500 0.09066867 0.06135037
V2 1 -1.522365 0.06950219 1.544432 0.07010534 0.09184938 0.09238738 0.06006045
Bulk_ESS Tail_ESS
V1 124 204
V2 127 204
neff <- res[, "n_eff"]
neff
V1 V2
124 128
res[, "sd"]
V1 V2
0.9640420 0.9564332
res[, "sd"]/sqrt(neff)
V1 V2
0.08657353 0.08453755
# each theta has its own n_eff, but they are so close to each other that a
# single relative efficiency value is used for plotting
reff <- mean(neff)/(dim(samp)[1]/2)
With the values above, reff is roughly 126/1000, i.e. about 0.13: the correlated Gibbs draws carry roughly 13% of the information of the same number of independent draws, so the Monte Carlo error bands are wider by a factor of about 1/sqrt(0.13), roughly 2.8.
Visual convergence diagnostics
Augment the data frame with row numbers and collapse it into key-value pairs for visualizing the chains
dfb <- dfs[-(1:warm), ]
dfb %>%
head
th1 th2
51 -0.9156186 -0.2871039
52 -0.3155023 -0.2871039
53 -0.3155023 -1.5495679
54 -1.6261900 -1.5495679
55 -1.6261900 -0.4371982
56 -0.8579086 -0.4371982
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
gather(grp, value, -iter)
dfch %>%
head
iter grp value
1 1 th1 -0.9156186
2 2 th1 -0.3155023
3 3 th1 -0.3155023
4 4 th1 -1.6261900
5 5 th1 -1.6261900
6 6 th1 -0.8579086
Another data frame for visualizing the estimate of the autocorrelation function
nlags <- 20
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
data.frame(iter = 0:(nlags)) %>%
gather(grp, value, -iter)
dfa %>%
head
iter grp value
1 0 th1 1.0000000
2 1 th1 0.8078408
3 2 th1 0.6161891
4 3 th1 0.4747894
5 4 th1 0.3321819
6 5 th1 0.2497841
A third data frame to visualize the cumulative averages and their 95% intervals: 1.96/sqrt(n) corresponds to independent draws, while 1.96/sqrt(n*reff) accounts for the autocorrelation of the Gibbs draws via the relative efficiency.
dfca <- (cumsum(dfb)/(1:sb)) %>%
within({
iter <- 1:sb
uppi <- 1.96/sqrt(1:sb)
upp <- 1.96/(sqrt(1:sb * reff))
}) %>%
gather(grp, value, -iter)
Visualize the chains
ggplot(data = dfch) +
  geom_line(aes(iter, value, color = grp)) +
  labs(title = "Trends") +
  scale_color_discrete(labels = c("theta1", "theta2")) +
  theme(legend.position = "bottom", legend.title = element_blank())
Visualize the estimate of the autocorrelation function
ggplot(data = dfa) +
  geom_line(aes(iter, value, color = grp)) +
  geom_hline(aes(yintercept = 0)) +
  labs(title = "Autocorrelation function") +
  scale_color_discrete(labels = c("theta1", "theta2")) +
  theme(legend.position = "bottom", legend.title = element_blank())
Visualize the Monte Carlo error estimates
# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() +
  geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) +
  geom_line(aes(1:sb, -1.96/sqrt(1:sb * reff)), linetype = 2) +
  geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
  geom_hline(aes(yintercept = 0)) +
  coord_cartesian(ylim = c(-1.5, 1.5)) +
  labs(title = "Cumulative averages") +
  scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) +
  scale_linetype_manual(values = c(1, 1, 2, 3), labels = labs3) +
  theme(legend.position = "bottom", legend.title = element_blank())