Lab on computation

Gibbs sampling

library(dplyr)
library(ggplot2)
theme_set(theme_minimal())
library(tidyr)
library(gganimate)
library(MASS)
library(rstan)

Example from BDA3, p. 277

Bivariate normal distribution

Consider a single observation \(y=(y_1, y_2)\) from a bivariate normally distributed population with unknown mean \(\theta = (\theta_1, \theta_2)\) and known covariance matrix \[\left.\begin{pmatrix} y_1\\ y_2\end{pmatrix}\right|\theta \sim N\left(\begin{pmatrix} \theta_1\\ \theta_2\end{pmatrix},\begin{pmatrix} 1&\rho\\\rho&1\end{pmatrix}\right)\]

With a uniform prior distribution on \(\theta\), the posterior distribution is \[\left.\begin{pmatrix} \theta_1\\ \theta_2\end{pmatrix} \right|y \sim N\left(\begin{pmatrix} y_1\\ y_2\end{pmatrix},\begin{pmatrix} 1&\rho\\\rho&1\end{pmatrix}\right)\]

Parameters of a normal distribution used as a toy target distribution

y1 <- 0
y2 <- 0
r <- 0.8
S <- diag(2)
S[1, 2] <- r
S[2, 1] <- r
S
     [,1] [,2]
[1,]  1.0  0.8
[2,]  0.8  1.0

Although it is simple to draw directly from the joint posterior distribution of \((\theta_1, \theta_2)\), for the purpose of exposition we demonstrate the Gibbs sampler here.

Sample from the toy distribution to visualize the 90% HPD region with ggplot's stat_ellipse()

dft <- data.frame(mvrnorm(1e+05, c(0, 0), S))

Starting point of the chain (chosen to be overdispersed relative to the target)

t1 <- -2.5
t2 <- 2.5

Number of iterations

M <- 2 * 2500

N.B. In this implementation a basic iteration updates only one parameter, so a complete iteration that updates both parameters takes two basic iterations. This bookkeeping makes it easy to plot the zig-zag path of the Gibbs sampler's component-wise updates. Alternatively, you could save only the final state of each complete iteration that updates all the parameters.

Gibbs sampling

We need the conditional posterior distributions, which, from the properties of the multivariate normal distribution are \[\begin{aligned} \theta_1|\theta_2,y\sim N(y_1+\rho(\theta_2-y_2), 1-\rho^2)\\ \theta_2|\theta_1,y\sim N(y_2+\rho(\theta_1-y_1), 1-\rho^2) \end{aligned}\]
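
These are instances of the standard conditional-distribution formula for a bivariate normal: with means \((\mu_1, \mu_2)\), unit variances and correlation \(\rho\), \[\theta_1|\theta_2 \sim N\left(\mu_1+\rho(\theta_2-\mu_2),\, 1-\rho^2\right),\] and here the posterior means are \(\mu_1=y_1\) and \(\mu_2=y_2\).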

# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
dim(tt)
[1] 5000    2
tt[1, ] <- c(t1, t2)  # Save starting point
# For demonstration load pre-computed values
load("demo11_1.RData")
# tt is an M x 2 array, with M draws of both theta_1 and theta_2
dim(tt)
[1] 2001    2
head(tt)
           [,1]     [,2]
[1,] -2.5000000 2.500000
[2,]  2.5186384 2.500000
[3,]  2.5186384 2.071432
[4,]  1.1460003 2.071432
[5,]  1.1460003 1.440903
[6,]  0.8898989 1.440903
# Replace this with your algorithm!

Insert your own Gibbs sampling here

# etc etc etc
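
One possible sketch of the component-wise Gibbs updates described above, drawing each parameter in turn from its conditional distribution. Note that running it overwrites the pre-computed tt, so the outputs shown in the rest of this notebook (which come from the loaded file) will not match a fresh run exactly.

# Re-allocate the sample matrix and set the starting point
tt <- matrix(0, nrow = M, ncol = 2)
tt[1, ] <- c(t1, t2)
# Each basic iteration updates only one component: even rows update theta_1,
# odd rows update theta_2, so the zig-zag path can be plotted below
for (i in 2:M) {
    tt[i, ] <- tt[i - 1, ]  # carry over the previous state
    if (i %% 2 == 0) {
        # theta_1 | theta_2, y ~ N(y1 + rho * (theta_2 - y2), 1 - rho^2)
        tt[i, 1] <- rnorm(1, y1 + r * (tt[i, 2] - y2), sqrt(1 - r^2))
    } else {
        # theta_2 | theta_1, y ~ N(y2 + rho * (theta_1 - y1), 1 - rho^2)
        tt[i, 2] <- rnorm(1, y2 + r * (tt[i, 1] - y1), sqrt(1 - r^2))
    }
}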

The rest is for illustration.

Take the first 50 complete draws (100 basic iterations) to illustrate how the sampler works, showing the component-wise updating of the Gibbs iterations.

df100 <- data.frame(id = rep(1, 100), iter = 1:100, th1 = tt[1:100, 1], th2 = tt[1:100,
    2], th1l = c(tt[1, 1], tt[1:(100 - 1), 1]), th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
df100 %>%
    head()
  id iter        th1      th2      th1l     th2l
1  1    1 -2.5000000 2.500000 -2.500000 2.500000
2  1    2  2.5186384 2.500000 -2.500000 2.500000
3  1    3  2.5186384 2.071432  2.518638 2.500000
4  1    4  1.1460003 2.071432  2.518638 2.071432
5  1    5  1.1460003 1.440903  1.146000 2.071432
6  1    6  0.8898989 1.440903  1.146000 1.440903
df100 %>%
    tail
    id iter       th1       th2      th1l      th2l
95   1   95 2.7705204 1.5521852 2.7705204 2.3629610
96   1   96 0.9651865 1.5521852 2.7705204 1.5521852
97   1   97 0.9651865 0.7599717 0.9651865 1.5521852
98   1   98 0.5803784 0.7599717 0.9651865 0.7599717
99   1   99 0.5803784 0.1376108 0.5803784 0.7599717
100  1  100 0.6603094 0.1376108 0.5803784 0.1376108
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
ind1 <- (1:50) * 2 - 1
ind1
 [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53
[28] 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99
df100s <- df100
df100s[ind1 + 1, 3:4] <- df100s[ind1, 3:4]
df100s %>%
    head
  id iter       th1      th2      th1l     th2l
1  1    1 -2.500000 2.500000 -2.500000 2.500000
2  1    2 -2.500000 2.500000 -2.500000 2.500000
3  1    3  2.518638 2.071432  2.518638 2.500000
4  1    4  2.518638 2.071432  2.518638 2.071432
5  1    5  1.146000 1.440903  1.146000 2.071432
6  1    6  1.146000 1.440903  1.146000 1.440903

j <- 100
p1 <- ggplot() + geom_point(data = df100s[1:j, ], aes(th1, th2, group = id, color = "1")) +
    geom_segment(data = df100[1:j, ], aes(x = th1, xend = th1l, color = "2", y = th2,
        yend = th2l)) + stat_ellipse(data = dft, aes(x = X1, y = X2, color = "3"), level = 0.9) +
    coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") +
    scale_color_manual(values = c("red", "forestgreen", "blue"), labels = labs1) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA, NA), linetype = c(0, 1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

The following generates a gif animation of the steps of the sampler (might take 10 seconds).

animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))

Show only the end result as a static figure

p1

Highlight the warm-up period of the first 15 draws in magenta

p1 + geom_point(data = df100[ind1[1:15], ], aes(th1, th2), color = "magenta")

Show 950 draws after a warm-up period of 50 draws is removed

# Take the first 1000 observations
s <- 1000
dfs <- data.frame(th1 = tt[1:s, 1], th2 = tt[1:s, 2])

# Define the warm-up period of the first 50 draws; it is removed below when plotting
warm <- 50
labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs[-(1:warm), ], aes(th1, th2, color = "1"), alpha = 0.5,
    size = 0.75) + stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) +
    coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") +
    scale_color_manual(values = c("steelblue", "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Convergence diagnostics

samp <- tt
dim(samp)
[1] 2001    2
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
samp %>%
    head
, , 1

           [,1]
[1,] -2.5000000
[2,]  2.5186384
[3,]  2.5186384
[4,]  1.1460003
[5,]  1.1460003
[6,]  0.8898989

, , 2

         [,1]
[1,] 2.500000
[2,] 2.500000
[3,] 2.071432
[4,] 2.071432
[5,] 1.440903
[6,] 1.440903
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 2001; warmup = 1000):

     Q5 Q50 Q95 Mean SD  Rhat Bulk_ESS Tail_ESS
V1 -1.6 0.0 1.6    0  1  1.03      124      204
V2 -1.5 0.1 1.5    0  1  1.02      127      204

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).

res
         mean    se_mean        sd        25%        50%       75% n_eff     Rhat
V1 0.02844871 0.08655818 0.9640420 -0.5952721 0.03076163 0.6780889   124 1.027028
V2 0.02589387 0.08474349 0.9564332 -0.6184635 0.06950219 0.6851340   128 1.017276
   valid        Q5        Q50      Q95   MCSE_Q25   MCSE_Q50   MCSE_Q75    MCSE_SD
V1     1 -1.552452 0.03076163 1.561233 0.11667750 0.07573500 0.09066867 0.06135037
V2     1 -1.522365 0.06950219 1.544432 0.07010534 0.09184938 0.09238738 0.06006045
   Bulk_ESS Tail_ESS
V1      124      204
V2      127      204
neff <- res[, "n_eff"]
neff
 V1  V2 
124 128 
res[, "sd"]
       V1        V2 
0.9640420 0.9564332 
res[, "sd"]/sqrt(neff)
        V1         V2 
0.08657353 0.08453755 
# Each theta has its own n_eff, but for plotting they are so close to each
# other that a single relative efficiency value is used
reff <- mean(neff)/(dim(samp)[1]/2)

Visual convergence diagnostics

Augment the data frame with iteration numbers and collapse it into long (key-value) format for visualizing the chains

dfb <- dfs[-(1:warm), ]
dfb %>%
    head
          th1        th2
51 -0.9156186 -0.2871039
52 -0.3155023 -0.2871039
53 -0.3155023 -1.5495679
54 -1.6261900 -1.5495679
55 -1.6261900 -0.4371982
56 -0.8579086 -0.4371982
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
    gather(grp, value, -iter)
dfch %>%
    head
  iter grp      value
1    1 th1 -0.9156186
2    2 th1 -0.3155023
3    3 th1 -0.3155023
4    4 th1 -1.6261900
5    5 th1 -1.6261900
6    6 th1 -0.8579086

Another data frame for visualizing the estimate of the autocorrelation function

nlags <- 20
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
    data.frame(iter = 0:(nlags)) %>%
    gather(grp, value, -iter)
dfa %>%
    head
  iter grp     value
1    0 th1 1.0000000
2    1 th1 0.8078408
3    2 th1 0.6161891
4    3 th1 0.4747894
5    4 th1 0.3321819
6    5 th1 0.2497841

A third data frame to visualize the cumulative averages and the 95% intervals

dfca <- (cumsum(dfb)/(1:sb)) %>%
    within({
        iter <- 1:sb
        uppi <- 1.96/sqrt(1:sb)
        upp <- 1.96/(sqrt(1:sb * reff))
    }) %>%
    gather(grp, value, -iter)
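
Since the marginal posterior standard deviations are approximately 1, the Monte Carlo standard error of the running mean after \(n\) draws is roughly \[\mathrm{MCSE}(\bar{\theta}_n) \approx \frac{\sigma}{\sqrt{n_{\mathrm{eff}}}} \approx \frac{1}{\sqrt{n\, r_{\mathrm{eff}}}},\] so \(\pm 1.96/\sqrt{n\, r_{\mathrm{eff}}}\) gives an approximate 95% interval for the MCMC error, while \(\pm 1.96/\sqrt{n}\) gives the corresponding interval for independent draws.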

Visualize the chains

ggplot(data = dfch) + geom_line(aes(iter, value, color = grp)) + labs(title = "Trends") +
    scale_color_discrete(labels = c("theta1", "theta2")) + theme(legend.position = "bottom",
    legend.title = element_blank())

Visualize the estimate of the autocorrelation function

ggplot(data = dfa) + geom_line(aes(iter, value, color = grp)) + geom_hline(aes(yintercept = 0)) +
    labs(title = "Autocorrelation function") + scale_color_discrete(labels = c("theta1",
    "theta2")) + theme(legend.position = "bottom", legend.title = element_blank())

Visualize the cumulative averages with the Monte Carlo error estimates

# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() + geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) + geom_line(aes(1:sb,
    -1.96/sqrt(1:sb * reff)), linetype = 2) + geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
    geom_hline(aes(yintercept = 0)) + coord_cartesian(ylim = c(-1.5, 1.5)) + labs(title = "Cumulative averages") +
    scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) + scale_linetype_manual(values = c(1,
    1, 2, 3), labels = labs3) + theme(legend.position = "bottom", legend.title = element_blank())

Highly correlated parameters

Same again with \(r=0.99\).

Parameters of a normal distribution used as a toy target distribution

y1 <- 0
y2 <- 0
r <- 0.99
S <- diag(2)
S[1, 2] <- r
S[2, 1] <- r

Sample from the toy distribution to visualize the 90% HPD region

dft <- data.frame(mvrnorm(1e+05, c(0, 0), S))

Starting value of the chain and number of iterations.

t1 <- -2.5
t2 <- 2.5

M <- 2 * 2500
# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
tt[1, ] <- c(t1, t2)  # Save starting point
dim(tt)
[1] 5000    2
# For demonstration load pre-computed values
load("demo11_1b.RData")
dim(tt)
[1] 2001    2
# tt is an M x 2 array, with M draws of both theta_1 and theta_2.
# Replace this with your algorithm!

Take the first 50 complete draws (100 basic iterations) to illustrate how the sampler works

df100 <- data.frame(id = rep(1, 100), iter = 1:100, th1 = tt[1:100, 1], th2 = tt[1:100,
    2], th1l = c(tt[1, 1], tt[1:(100 - 1), 1]), th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
ind1 <- (1:50) * 2 - 1
df100s <- df100
df100s[ind1 + 1, 3:4] <- df100s[ind1, 3:4]
p1 <- ggplot() + geom_point(data = df100s, aes(th1, th2, group = id, color = "1")) + geom_segment(data = df100,
    aes(x = th1, xend = th1l, color = "2", y = th2, yend = th2l)) + stat_ellipse(data = dft,
    aes(x = X1, y = X2, color = "3"), level = 0.9) + coord_cartesian(xlim = c(-4, 4),
    ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("red",
    "forestgreen", "blue"), labels = labs1) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA, NA), linetype = c(0, 1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

The following generates a gif animation of the steps of the sampler

animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))

Show only the end result as a static figure

p1

Highlight the warm-up period of the first 15 draws in magenta

p1 + geom_point(data = df100[ind1[1:15], ], aes(th1, th2), color = "magenta")

Show 950 draws after a warm-up period of 50 draws is removed, representing a set of correlated draws from the target distribution.

# Take the first 1000 observations

s <- 1000
dfs <- data.frame(th1 = tt[1:s, 1], th2 = tt[1:s, 2])

# Remove warm-up period of 50 first draws later

warm <- 50
labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs[-(1:warm), ], aes(th1, th2, color = "1"), alpha = 0.5,
    size = 0.75) + stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) +
    coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") +
    scale_color_manual(values = c("steelblue", "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Convergence diagnostics

samp <- tt
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 2001; warmup = 1000):

     Q5 Q50 Q95 Mean  SD  Rhat Bulk_ESS Tail_ESS
V1 -0.9 0.2 1.5  0.2 0.8  1.14        5       15
V2 -0.9 0.2 1.5  0.2 0.8  1.15        5       18

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).
neff <- res[, "n_eff"]
neff
V1 V2 
 6  6 
# As before, a single relative efficiency value is used
reff <- mean(neff)/(dim(samp)[1]/2)

Visual convergence diagnostics

Augment the data frame with iteration numbers and collapse it into long (key-value) format for visualizing the chains

dfb <- dfs[-(1:warm), ]
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
    gather(grp, value, -iter)

Another data frame for visualizing the estimate of the autocorrelation function

nlags <- 75
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
    data.frame(iter = 0:(nlags)) %>%
    gather(grp, value, -iter)

A third data frame to visualize the cumulative averages and the 95% intervals

dfca <- (cumsum(dfb)/(1:sb)) %>%
    within({
        iter <- 1:sb
        uppi <- 1.96/sqrt(1:sb)
        upp <- 1.96/(sqrt(1:sb * reff))
    }) %>%
    gather(grp, value, -iter)

Visualize the chains

ggplot(data = dfch) + geom_line(aes(iter, value, color = grp)) + labs(title = "Trends") +
    scale_color_discrete(labels = c("theta1", "theta2")) + theme(legend.position = "bottom",
    legend.title = element_blank())

Visualize the estimate of the autocorrelation function

ggplot(data = dfa) + geom_line(aes(iter, value, color = grp)) + geom_hline(aes(yintercept = 0)) +
    labs(title = "Autocorrelation function") + scale_color_discrete(labels = c("theta1",
    "theta2")) + theme(legend.position = "bottom", legend.title = element_blank())

Visualize the cumulative averages with the Monte Carlo error estimates

# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() + geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) + geom_line(aes(1:sb,
    -1.96/sqrt(1:sb * reff)), linetype = 2) + geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
    geom_hline(aes(yintercept = 0)) + coord_cartesian(ylim = c(-1.5, 1.5)) + labs(title = "Cumulative averages") +
    scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) + scale_linetype_manual(values = c(1,
    1, 2, 3), labels = labs3) + theme(legend.position = "bottom", legend.title = element_blank())