#####################################
# Bayesian Statistics: Laboratory 3 #
# 28/04/2023                        #
#####################################

##############################################################
## Load the following packages                               #
##############################################################
library(mvtnorm)        # For the multivariate normal distribution
library(truncnorm)      # For the truncated normal distribution
library(microbenchmark) # For computational time comparisons
library(gclus)          # For the dataset bank

set.seed(123)

#############################################################
## Load the bank dataset and explore it
data(bank)
help(bank)

# Extract the outcome
y <- bank$Status
# equivalently
# y <- bank[,1]

# Build the model matrix
X <- model.matrix(~ Length + Left + Right + Bottom, data = bank)
# equivalently
# X <- cbind(1, bank[,2:5])

# Sample size
n <- dim(bank)[1]

###########################
# Probit regression model #
###########################

## Obtain the MLE of beta:
## -) Use `glm()` (with the suitable arguments)
## -) extract $\hat\beta$, as well as the estimated covariance matrix of $\hat\beta$ ($\hat\Sigma_{\beta}$)

# Fit the model
fit_glm <- glm(y ~ Length + Left + Right + Bottom,
               family = binomial(link = "probit"), data = bank)
# Alternatively
# fit_glm <- glm(Status ~ X - 1, family = binomial(link = "probit"), data = bank)

# MLE
beta_mle <- coef(fit_glm)
round(beta_mle, 2)

# Obtain the estimated covariance matrix of beta
sigma.asymp <- summary(fit_glm)$cov.unscaled
sigma.asymp

# Alternatively, sigma.asymp can be obtained as
eta_hat <- predict(fit_glm)
p_hat <- predict(fit_glm, type = "response")
Wii <- dnorm(eta_hat)^2 / (p_hat * (1 - p_hat))
sigma.asymp2 <- solve(t(X) %*% diag(Wii) %*% X)
round(sigma.asymp2, 2)

## A computational note: the previous implementation of the matrix product $X^\top W X$ is not efficient.
## Consider the following implementations, which
## -) first explicitly take into account the diagonal structure of $W$
## -) and then make use of the efficient matrix product via `crossprod()`
## We then compare the computational performance by means of the `microbenchmark()` function of the **microbenchmark** package
PREC1 <- t(X) %*% diag(Wii) %*% X
PREC2 <- t(X) %*% (X * Wii)
PREC3 <- crossprod(X * sqrt(Wii))
c(max(abs(PREC1 - PREC2)), max(abs(PREC1 - PREC3)))

microbenchmark(
  PREC1 = t(X) %*% diag(Wii) %*% X,
  PREC2 = t(X) %*% (X * Wii),
  PREC3 = crossprod(X * sqrt(Wii)))

####################################
# Bayesian Probit regression model #
####################################

# With a flat (uniform) prior, write the posterior function
post <- function(b, y, X){
  pi <- pnorm(X %*% as.vector(b))
  return(prod(pi ^ y * (1 - pi) ^ (1 - y)))
}

## RW Metropolis-Hastings
# Implement the following random walk Metropolis-Hastings algorithm:
# -) Initialize $\beta^{(0)} = \hat{\beta}$
# -) For S-1 iterations repeat the following steps:
#    -) Generate $\beta^* \sim N_{p}(\beta^{(s-1)}, \tau^2 \hat\Sigma)$
#    -) Compute the acceptance probability as
#       $$\rho(\beta^{(s-1)}, \beta^*) = \min\left(1, \frac{\pi(\beta^*|y)}{\pi(\beta^{(s-1)}|y)}\right)$$
#    -) Generate $U \sim U(0,1)$: if $U < \rho(\beta^{(s-1)}, \beta^*)$, then $\beta^{(s)} = \beta^*$,
#       otherwise $\beta^{(s)} = \beta^{(s-1)}$.
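# A numerical side note (optional, not part of the lab implementation below): post()
# multiplies n Bernoulli terms, so for larger data sets or poorly located proposals it
# can underflow to 0 and make the acceptance ratio ill-defined. A minimal sketch of a
# log-scale variant (the name log_post is only illustrative; the sampler below keeps
# using post()):
log_post <- function(b, y, X){
  eta <- X %*% as.vector(b)
  # log-likelihood under a flat prior: sum of y*log(Phi(eta)) + (1-y)*log(1-Phi(eta))
  sum(y * pnorm(eta, log.p = TRUE) +
        (1 - y) * pnorm(eta, lower.tail = FALSE, log.p = TRUE))
}
# On the log scale one would accept when
# log(runif(1)) < log_post(proposal, y, X) - log_post(current, y, X)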
# Note: beta_mle is the MLE, while sigma_asymp is the estimated covariance matrix of $\hat\beta$
mh <- function(Nsim, tau, y, X, beta_mle, sigma_asymp){
  b <- matrix(0, nrow = Nsim, ncol = ncol(X))
  b[1, ] <- beta_mle
  for(s in 2:Nsim){
    set.seed(s)
    proposal <- rmvnorm(1, b[s - 1, ], tau ^ 2 * sigma_asymp)
    rho <- min(1, post(proposal, y, X) / post(b[s - 1, ], y, X))
    if (runif(1) < rho) {
      b[s, ] <- proposal
    } else {
      b[s, ] <- b[s - 1, ]
    }
  }
  return(b)
}

# Simulate 10000 iterations using different values of $\tau = 0.1, 1, 4$
tau <- c(0.1, 1, 4)
K <- length(tau)
nsim <- 1e4
b_sim <- lapply(1:K, function(.x) mh(nsim, tau[.x], y, X, beta_mle, sigma.asymp))

# Plot the trace of the parameter $\beta_2$, the autocorrelation and
# the histogram, adding a vertical line for the posterior estimate of the mean.
par(mfrow = c(1, 3))
for(j in 1:K) plot(b_sim[[j]][, 2], type = "l", ylab = "Trace", xlab = "Iteration", main = "")
for(j in 1:K) acf(b_sim[[j]][, 2], main = "")
for(j in 1:K) {
  hist(b_sim[[j]][, 2], breaks = 50, border = "gray40", freq = FALSE, main = "")
  abline(v = mean(b_sim[[j]][, 2]), col = "firebrick3", lty = 2)
}

# Which value of $\tau$ seems the best?

# Using the selected simulations, give a Monte Carlo estimate
# of the posterior mean and variance of $\beta$ (burn-in = 1000 and thin = 10).
post_sample <- b_sim[[2]][seq(1e3, 1e4, by = 10), ]

# Posterior mean
beta_post_mean <- round(colMeans(post_sample), 2)
beta_post_mean

# Posterior variance
beta_post_var <- round(apply(post_sample, 2, var), 2)
beta_post_var

# Comparison between posterior mean and MLE
rbind(beta_post_mean, round(beta_mle, 2))

# Comparison between posterior variance and estimated variance of the MLE of beta
rbind(beta_post_var, round(diag(sigma.asymp), 2))

##################
# GIBBS SAMPLING #
##################

# Implement a Gibbs sampler to generate $\beta$ from the posterior distribution:
# - Initialize $\beta^{(1)} = \hat\beta$ (to be precise, we should also initialise $y^{*(1)}$)
# - for s in $2, \ldots, S$
#   - generate $y^{*(s)}|\beta^{(s-1)}, X, y \sim \begin{cases}
#       \mathcal{TN}(x_i\beta^{(s-1)}, 1, 0, +\infty) \quad \text{if } y_i = 1\\
#       \mathcal{TN}(x_i\beta^{(s-1)}, 1, -\infty, 0) \quad \text{if } y_i = 0
#     \end{cases}$
#   - generate $\beta^{(s)}|X, y^{*(s)} \sim N_{p}((X^\top X)^{-1}X^\top y^{*(s)}, (X^\top X)^{-1})$
gs <- function(Nsim, y, X, beta_mle){
  b <- matrix(0, nrow = Nsim, ncol = ncol(X))
  y_star <- rep(NA, nrow(X))
  b[1, ] <- beta_mle
  S <- solve(crossprod(X))
  SX <- S %*% t(X)
  for(i in 2:Nsim){
    mu <- X %*% b[i - 1, ]
    y_star[y == 1] <- rtruncnorm(sum(y == 1), a = 0, b = Inf, mean = mu[y == 1], sd = 1)
    y_star[y == 0] <- rtruncnorm(sum(y == 0), a = -Inf, b = 0, mean = mu[y == 0], sd = 1)
    b[i, ] <- rmvnorm(1, SX %*% y_star, S)
  }
  return(b)
}

# Simulate a sample of 10000 draws of $\beta$
bgs <- gs(1e4, y, X, beta_mle)
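# Optional efficiency note, in the same spirit as the crossprod() comparison above:
# rmvnorm() factorises its covariance matrix at every call, while gs() always uses
# the same S = (X'X)^{-1}. A sketch (the objects S_gs, R_gs and mu0 are introduced
# here only for this illustration) comparing a single rmvnorm() draw with a draw
# built from a Cholesky factor computed once in advance:
S_gs <- solve(crossprod(X))
R_gs <- chol(S_gs)            # S_gs = t(R_gs) %*% R_gs
mu0  <- rep(0, ncol(X))
microbenchmark(
  rmvnorm = rmvnorm(1, mu0, S_gs),
  chol    = mu0 + drop(t(R_gs) %*% rnorm(ncol(X))))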
# Visualise the results for $\beta_2$ (traceplot, acf, histogram).
par(mfrow = c(1, 3))
plot(bgs[, 2], type = "l", ylab = "Trace", xlab = "Iteration")
acf(bgs[, 2], main = "")
hist(bgs[, 2], breaks = 50, border = "gray40", freq = FALSE, main = "")
abline(v = mean(bgs[, 2]), col = "firebrick3", lty = 2)

# Then extract the posterior mean and variance of $\beta$
# (considering a burn-in of 1000 and a thinning of 10, as above)
post_sample <- bgs[seq(1e3, 1e4, by = 10), ]
beta_post_mean <- round(colMeans(post_sample), 2)
beta_post_mean
beta_post_var <- round(apply(post_sample, 2, var), 2)
beta_post_var

# Compare the posterior mean and variance estimates with the MLE
# and the estimated asymptotic variance
rbind(round(colMeans(post_sample), 2), round(beta_mle, 2))
rbind(round(apply(post_sample, 2, var), 2), round(diag(sigma.asymp), 2))

# Compare the MH and GS algorithms via microbenchmark().
# Consider `times = 10` as argument
microbenchmark(
  MH = mh(nsim, tau = 1, y, X, beta_mle, sigma.asymp),
  GS = gs(nsim, y, X, beta_mle),
  times = 10L)
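# microbenchmark() only compares run time. A fuller comparison would also consider
# how informative the draws are, e.g. via the effective sample size; a minimal
# sketch, assuming the coda package is installed (it is not loaded at the top of
# this script):
library(coda)                              # provides as.mcmc() and effectiveSize()
effectiveSize(as.mcmc(b_sim[[2]][, 2]))    # RW-MH chain (tau = 1), beta_2
effectiveSize(as.mcmc(bgs[, 2]))           # Gibbs chain, beta_2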