Lab on computation

Metropolis sampling

library(dplyr)
library(ggplot2)
theme_set(theme_minimal())
library(tidyr)
library(gganimate)
library(ggforce)
library(MASS)
library(rstan)

Example from BDA3, p. 277 (we continue with the example used to illustrate the Gibbs sampler)

Bivariate normal distribution

Parameters of a normal distribution used as a toy target distribution

y1 <- 0
y2 <- 0
r <- 0.8
S <- diag(2)
S[1, 2] <- r
S[2, 1] <- r

Metropolis proposal distribution scale

sp <- 0.3  # standard deviation of the random-walk proposal in each coordinate; 0.3 matches the later examples, but experiment with other values

Sample from the toy distribution to visualize the 90% HPD region with ggplot’s stat_ellipse()

dft <- data.frame(mvrnorm(1e+05, c(0, 0), S))

Starting value of the chain

t1 <- -2.5
t2 <- 2.5

Number of iterations

M <- 5000

Metropolis sampling

# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
tt[1, ] <- c(t1, t2)  # Save starting point
# For demonstration load pre-computed values
load("demo11_2a.RData")
# tt is an M x 2 matrix, with M draws of both theta_1 and theta_2
dim(tt)
[1] 5000    2
head(tt)
          [,1]     [,2]
[1,] -2.500000 2.500000
[2,] -1.808482 2.575362
[3,] -1.808482 2.575362
[4,] -2.158913 2.231633
[5,] -2.158913 2.231633
[6,] -2.158913 2.231633
# Replace the loaded values with draws from your own algorithm!

Insert your own Metropolis sampling here

# etc etc etc
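
For reference, here is one minimal random-walk Metropolis sketch (a possible solution, not the official one; the helper name log_target is ours). Proposals add independent normal noise with standard deviation sp to each coordinate; because the proposal is symmetric, the acceptance ratio reduces to the ratio of target densities, coded below on the log scale by hand so no extra packages are needed.

# Unnormalized log-density of the bivariate normal target with mean (y1, y2),
# unit variances and correlation r (additive constants dropped)
log_target <- function(th) {
    d1 <- th[1] - y1
    d2 <- th[2] - y2
    -(d1^2 - 2 * r * d1 * d2 + d2^2) / (2 * (1 - r^2))
}
for (i in 2:M) {
    # random-walk proposal centered at the current state
    prop <- tt[i - 1, ] + rnorm(2, 0, sp)
    # accept with probability min(1, target(prop) / target(current))
    if (log(runif(1)) < log_target(prop) - log_target(tt[i - 1, ])) {
        tt[i, ] <- prop
    } else {
        tt[i, ] <- tt[i - 1, ]  # rejected: repeat the previous draw
    }
}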

The rest is for illustration.

Take the first 100 draws to illustrate how the sampler works

df100 <- data.frame(id = rep(1, 100), iter = 1:100, th1 = tt[1:100, 1], th2 = tt[1:100,
    2], th1l = c(tt[1, 1], tt[1:(100 - 1), 1]), th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
df100 %>%
    head()
  id iter       th1      th2      th1l     th2l
1  1    1 -2.500000 2.500000 -2.500000 2.500000
2  1    2 -1.808482 2.575362 -2.500000 2.500000
3  1    3 -1.808482 2.575362 -1.808482 2.575362
4  1    4 -2.158913 2.231633 -1.808482 2.575362
5  1    5 -2.158913 2.231633 -2.158913 2.231633
6  1    6 -2.158913 2.231633 -2.158913 2.231633
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
p1 <- ggplot() + geom_jitter(data = df100, width = 0.05, height = 0.05, aes(th1, th2,
    group = id, color = "1"), alpha = 0.3) + geom_segment(data = df100, aes(x = th1, xend = th1l,
    color = "2", y = th2, yend = th2l)) + stat_ellipse(data = dft, aes(x = X1, y = X2,
    color = "3"), level = 0.9) + coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1",
    y = "theta2") + scale_color_manual(values = c("red", "forestgreen", "blue"), labels = labs1) +
    guides(color = guide_legend(override.aes = list(shape = c(16, NA, NA), linetype = c(0,
        1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

The following generates a gif animation of the steps of the sampler (might take 10 seconds).

animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))

Plot the final frame

p1

Discard the first 500 draws as warm-up and keep the remaining 4500

s <- 5000
warm <- 500
dfs <- data.frame(th1 = tt[(warm + 1):s, 1], th2 = tt[(warm + 1):s, 2])

Show the first 1000 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs[1:1000, ], aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Show 4500 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs, aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Convergence diagnostics

rstan’s monitor() expects the draws as an iterations x chains x parameters array, so below we add a singleton chain dimension and reorder the axes with aperm().

samp <- tt
dim(samp)
[1] 5000    2
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
samp %>%
    head
, , 1

          [,1]
[1,] -2.500000
[2,] -1.808482
[3,] -1.808482
[4,] -2.158913
[5,] -2.158913
[6,] -2.158913

, , 2

         [,1]
[1,] 2.500000
[2,] 2.575362
[3,] 2.575362
[4,] 2.231633
[5,] 2.231633
[6,] 2.231633
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 5000; warmup = 2500):

     Q5 Q50 Q95 Mean SD  Rhat Bulk_ESS Tail_ESS
V1 -1.8   0 1.6 -0.1  1     1      101      152
V2 -1.7   0 1.7  0.0  1     1       92      105

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).

res
          mean   se_mean       sd        25%         50%       75% n_eff     Rhat
V1 -0.07869941 0.1025303 1.034916 -0.7483862 -0.04865786 0.5869848   103 1.002561
V2 -0.03807652 0.1074524 1.022875 -0.7411121  0.01686879 0.6261406    92 1.000491
   valid        Q5         Q50      Q95  MCSE_Q25   MCSE_Q50  MCSE_Q75    MCSE_SD
V1     1 -1.821258 -0.04865786 1.578810 0.1205514 0.09048805 0.1138745 0.07270846
V2     1 -1.745641  0.01686879 1.730768 0.1256292 0.09507786 0.1094501 0.07622624
   Bulk_ESS Tail_ESS
V1      101      152
V2       92      105
neff <- res[, "n_eff"]
neff
 V1  V2 
103  92 
res[, "sd"]
      V1       V2 
1.034916 1.022875 
res[, "sd"]/sqrt(neff)
       V1        V2 
0.1019733 0.1066421 
# Each theta has its own n_eff, but they are so close that a single relative
# efficiency value is used for plotting; monitor() treated the first half of
# the draws as warmup, hence the division by s/2
reff <- mean(neff/(s/2))

Visual convergence diagnostics

Add an iteration index to the draws and gather the data frame into long key-value format for visualizing the chains

dfb <- dfs
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
    gather(grp, value, -iter)

Another data frame for visualizing the estimate of the autocorrelation function

nlags <- 50
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
    data.frame(iter = 0:(nlags)) %>%
    gather(grp, value, -iter)
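
The ACF is what drives the effective sample size: for a stationary chain, \(n_{\mathrm{eff}} \approx N / (1 + 2 \sum_{t=1}^{\infty} \rho_t)\), where \(\rho_t\) is the lag-\(t\) autocorrelation, so a slowly decaying ACF translates directly into the small Bulk_ESS values reported by monitor() above.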

A third data frame to visualize the cumulative averages and the 95% intervals

dfca <- (cumsum(dfb)/(1:sb)) %>%
    within({
        iter <- 1:sb
        uppi <- 1.96/sqrt(1:sb)
        upp <- 1.96/(sqrt(1:sb * reff))
    }) %>%
    gather(grp, value, -iter)
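
Here uppi is the familiar 95% interval for the error of the mean of \(n\) independent draws with unit standard deviation, \(\pm 1.96/\sqrt{n}\), while upp accounts for autocorrelation by treating the \(n\) MCMC draws as worth only \(n \, r_{\mathrm{eff}}\) independent ones, giving the wider \(\pm 1.96/\sqrt{n \, r_{\mathrm{eff}}}\).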

Visualize the chains

ggplot(data = dfch) + geom_line(aes(iter, value, color = grp)) + labs(title = "Trends") +
    scale_color_discrete(labels = c("theta1", "theta2")) + theme(legend.position = "bottom",
    legend.title = element_blank())

Visualize the estimate of the autocorrelation function

ggplot(data = dfa) + geom_line(aes(iter, value, color = grp)) + geom_hline(aes(yintercept = 0)) +
    labs(title = "Autocorrelation function") + scale_color_discrete(labels = c("theta1",
    "theta2")) + theme(legend.position = "bottom", legend.title = element_blank())

Visualize the cumulative averages and their Monte Carlo error estimates

# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() + geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) + geom_line(aes(1:sb,
    -1.96/sqrt(1:sb * reff)), linetype = 2) + geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
    geom_hline(aes(yintercept = 0)) + coord_cartesian(ylim = c(-1.5, 1.5)) + labs(title = "Cumulative averages") +
    scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) + scale_linetype_manual(values = c(1,
    1, 2, 3), labels = labs3) + theme(legend.position = "bottom", legend.title = element_blank())

Highly correlated parameters

Same again with \(r=0.99\).

Parameters of a normal distribution used as a toy target distribution

y1 <- 0
y2 <- 0
r <- 0.99
S <- diag(2)
S[1, 2] <- r
S[2, 1] <- r

Metropolis proposal distribution scale

sp <- 0.3

Sample from the toy distribution to visualize the 90% HPD region with ggplot’s stat_ellipse()

dft <- data.frame(mvrnorm(1e+05, c(0, 0), S))

Starting value of the chain

t1 <- -2.5
t2 <- 2.5

Number of iterations

M <- 5000

Insert your own Metropolis sampling here

# etc etc etc
# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
tt[1, ] <- c(t1, t2)  # Save starting point
# For demonstration load pre-computed values
load("demo11_2b.RData")
# tt is an M x 2 matrix, with M draws of both theta_1 and theta_2.
# Replace the loaded values with draws from your own algorithm!
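
The random-walk Metropolis sketch from the first example can be reused here unchanged: only the target correlation r differs, and with \(r=0.99\) the same proposal scale sp = 0.3 must take many small steps along the narrow ridge of the target, which is why the effective sample sizes below drop so sharply.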

The rest is for illustration.

Take the first 100 draws to illustrate how the sampler works

df100 <- data.frame(id = rep(1, 100), iter = 1:100, th1 = tt[1:100, 1], th2 = tt[1:100,
    2], th1l = c(tt[1, 1], tt[1:(100 - 1), 1]), th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
p1 <- ggplot() + geom_jitter(data = df100, width = 0.05, height = 0.05, aes(th1, th2,
    group = id, color = "1"), alpha = 0.3) + geom_segment(data = df100, aes(x = th1, xend = th1l,
    color = "2", y = th2, yend = th2l)) + stat_ellipse(data = dft, aes(x = X1, y = X2,
    color = "3"), level = 0.9) + coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1",
    y = "theta2") + scale_color_manual(values = c("red", "forestgreen", "blue"), labels = labs1) +
    guides(color = guide_legend(override.aes = list(shape = c(16, NA, NA), linetype = c(0,
        1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

The following generates a gif animation of the steps of the sampler.

animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))

Plot the final frame

p1

Discard the first 500 draws as warm-up and keep the remaining 4500

s <- 5000
warm <- 500
dfs <- data.frame(th1 = tt[(warm + 1):s, 1], th2 = tt[(warm + 1):s, 2])

Show the first 1000 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs[1:1000, ], aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Show 4500 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs, aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Convergence diagnostics

samp <- tt
dim(samp)
[1] 5000    2
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 5000; warmup = 2500):

     Q5  Q50 Q95 Mean  SD  Rhat Bulk_ESS Tail_ESS
V1 -1.7 -0.4 1.5 -0.2 1.1  1.02       24       50
V2 -1.6 -0.4 1.5 -0.2 1.1  1.03       24       51

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).
neff <- res[, "n_eff"]
neff
V1 V2 
24 24 
# As before, a single relative efficiency value is used
reff <- mean(neff/(s/2))

Visual convergence diagnostics

Add an iteration index to the draws and gather the data frame into long key-value format for visualizing the chains

dfb <- dfs
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
    gather(grp, value, -iter)

Another data frame for visualizing the estimate of the autocorrelation function

nlags <- 100
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
    data.frame(iter = 0:(nlags)) %>%
    gather(grp, value, -iter)

A third data frame to visualize the cumulative averages and the 95% intervals

dfca <- (cumsum(dfb)/(1:sb)) %>%
    within({
        iter <- 1:sb
        uppi <- 1.96/sqrt(1:sb)
        upp <- 1.96/(sqrt(1:sb * reff))
    }) %>%
    gather(grp, value, -iter)

Visualize the chains

ggplot(data = dfch) + geom_line(aes(iter, value, color = grp)) + labs(title = "Trends") +
    scale_color_discrete(labels = c("theta1", "theta2")) + theme(legend.position = "bottom",
    legend.title = element_blank())

Visualize the estimate of the autocorrelation function

ggplot(data = dfa) + geom_line(aes(iter, value, color = grp)) + geom_hline(aes(yintercept = 0)) +
    labs(title = "Autocorrelation function") + scale_color_discrete(labels = c("theta1",
    "theta2")) + theme(legend.position = "bottom", legend.title = element_blank())

Visualize the cumulative averages and their Monte Carlo error estimates

# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() + geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) + geom_line(aes(1:sb,
    -1.96/sqrt(1:sb * reff)), linetype = 2) + geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
    geom_hline(aes(yintercept = 0)) + coord_cartesian(ylim = c(-1.5, 1.5)) + labs(title = "Cumulative averages") +
    scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) + scale_linetype_manual(values = c(1,
    1, 2, 3), labels = labs3) + theme(legend.position = "bottom", legend.title = element_blank())

Tuning the Metropolis proposal scale

Same again with \(r=0.99\), but a larger proposal scale \(sp = 1.5\).

sp <- 1.5

Insert your own Metropolis sampling here

# etc etc etc
# Allocate memory for the sample
tt <- matrix(rep(0, 2 * M), ncol = 2)
tt[1, ] <- c(t1, t2)  # Save starting point
# For demonstration load pre-computed values
load("demo11_2c.RData")
# tt is an M x 2 matrix, with M draws of both theta_1 and theta_2.
# Replace the loaded values with draws from your own algorithm!
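
The same sketch applies once more, now with the larger proposal scale. When tuning sp, a quick empirical check (assuming tt holds the finished chain) is the acceptance rate; for a continuous target, a repeated row almost surely means a rejected proposal:

# fraction of iterations in which the chain actually moved
mean(rowSums(tt[-1, , drop = FALSE] != tt[-M, , drop = FALSE]) > 0)

For random-walk Metropolis, acceptance rates roughly in the 0.2–0.4 range are commonly recommended: a very high rate usually means the proposal scale is too small, a very low rate that it is too large.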

The rest is for illustration.

Take the first 100 draws to illustrate how the sampler works

df100 <- data.frame(id = rep(1, 100), iter = 1:100, th1 = tt[1:100, 1], th2 = tt[1:100,
    2], th1l = c(tt[1, 1], tt[1:(100 - 1), 1]), th2l = c(tt[1, 2], tt[1:(100 - 1), 2]))
# labels and frame indices for the plot
labs1 <- c("Draws", "Steps of the sampler", "90% HPD")
p1 <- ggplot() + geom_jitter(data = df100, width = 0.05, height = 0.05, aes(th1, th2,
    group = id, color = "1"), alpha = 0.3) + geom_segment(data = df100, aes(x = th1, xend = th1l,
    color = "2", y = th2, yend = th2l)) + stat_ellipse(data = dft, aes(x = X1, y = X2,
    color = "3"), level = 0.9) + coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) + labs(x = "theta1",
    y = "theta2") + scale_color_manual(values = c("red", "forestgreen", "blue"), labels = labs1) +
    guides(color = guide_legend(override.aes = list(shape = c(16, NA, NA), linetype = c(0,
        1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

The following generates a gif animation of the steps of the sampler.

animate(p1 + transition_reveal(along = iter) + shadow_trail(0.01))

Plot the final frame

p1

Discard the first 500 draws as warm-up and keep the remaining 4500

s <- 5000
warm <- 500
dfs <- data.frame(th1 = tt[(warm + 1):s, 1], th2 = tt[(warm + 1):s, 2])

Show the first 1000 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs[1:1000, ], aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Show 4500 draws after the warm-up

labs2 <- c("Draws", "90% HPD")
ggplot() + geom_point(data = dfs, aes(th1, th2, color = "1"), alpha = 0.3, size = 0.75) +
    stat_ellipse(data = dft, aes(x = X1, y = X2, color = "2"), level = 0.9) + coord_cartesian(xlim = c(-4,
    4), ylim = c(-4, 4)) + labs(x = "theta1", y = "theta2") + scale_color_manual(values = c("steelblue",
    "blue"), labels = labs2) + guides(color = guide_legend(override.aes = list(shape = c(16,
    NA), linetype = c(0, 1), alpha = c(1, 1)))) + theme(legend.position = "bottom", legend.title = element_blank())

Convergence diagnostics

samp <- tt
dim(samp) <- c(dim(tt), 1)
samp <- aperm(samp, c(1, 3, 2))
res <- monitor(samp, probs = c(0.25, 0.5, 0.75), digits_summary = 2)
Inference for the input samples (1 chains: each with iter = 5000; warmup = 2500):

     Q5  Q50 Q95 Mean  SD  Rhat Bulk_ESS Tail_ESS
V1 -1.8 -0.5 0.8 -0.5 0.8  1.00       45       34
V2 -1.9 -0.4 0.8 -0.5 0.8  1.01       45       33

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).
neff <- res[, "n_eff"]
# As before, a single relative efficiency value is used
reff <- mean(neff/(s/2))

Visual convergence diagnostics

Add an iteration index to the draws and gather the data frame into long key-value format for visualizing the chains

dfb <- dfs
sb <- s - warm
dfch <- within(dfb, iter <- 1:sb) %>%
    gather(grp, value, -iter)

Another data frame for visualizing the estimate of the autocorrelation function

nlags <- 100
dfa <- sapply(dfb, function(x) acf(x, lag.max = nlags, plot = F)$acf) %>%
    data.frame(iter = 0:(nlags)) %>%
    gather(grp, value, -iter)

A third data frame to visualize the cumulative averages and the 95% intervals

dfca <- (cumsum(dfb)/(1:sb)) %>%
    within({
        iter <- 1:sb
        uppi <- 1.96/sqrt(1:sb)
        upp <- 1.96/(sqrt(1:sb * reff))
    }) %>%
    gather(grp, value, -iter)

Visualize the chains

ggplot(data = dfch) + geom_line(aes(iter, value, color = grp)) + labs(title = "Trends") +
    scale_color_discrete(labels = c("theta1", "theta2")) + theme(legend.position = "bottom",
    legend.title = element_blank())

Visualize the estimate of the autocorrelation function

ggplot(data = dfa) + geom_line(aes(iter, value, color = grp)) + geom_hline(aes(yintercept = 0)) +
    labs(title = "Autocorrelation function") + scale_color_discrete(labels = c("theta1",
    "theta2")) + theme(legend.position = "bottom", legend.title = element_blank())

Visualize the cumulative averages and their Monte Carlo error estimates

# labels
labs3 <- c("theta1", "theta2", "95% interval for MCMC error", "95% interval for independent MC")
ggplot() + geom_line(data = dfca, aes(iter, value, color = grp, linetype = grp)) + geom_line(aes(1:sb,
    -1.96/sqrt(1:sb * reff)), linetype = 2) + geom_line(aes(1:sb, -1.96/sqrt(1:sb)), linetype = 3) +
    geom_hline(aes(yintercept = 0)) + coord_cartesian(ylim = c(-1.5, 1.5)) + labs(title = "Cumulative averages") +
    scale_color_manual(values = c("red", "blue", rep("black", 2)), labels = labs3) + scale_linetype_manual(values = c(1,
    1, 2, 3), labels = labs3) + theme(legend.position = "bottom", legend.title = element_blank())