##################################### # Bayesian Statistics: Laboratory 4 # # 05/05/2023 # ##################################### # Some useful package library(rstudioapi) setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) library(rstan) library(rstanarm) library(ggplot2) library(bayesplot) theme_set(bayesplot::theme_default()) set.seed(1) ############################################## # Load the dataset and ecplore its structure # ############################################## data <- readRDS('pest_data.RDS') str(data) # Select a subset of variables Svar <- c("complaints", "building_id", "traps", "date", "live_in_super","age_of_building", "month", "total_sq_foot", "average_tenant_age", "monthly_average_rent", "floors") pest_data <- data[, Svar] ############################################## # number of buildings N_buildings <- length(unique(pest_data$building_id)) N_buildings # Some Exploratory analysis ggplot(pest_data, aes(x = complaints)) + geom_bar() ggplot(pest_data, aes(x = traps, y = complaints)) + geom_jitter() ## arrange data into a list stan_dat <- list( N = nrow(pest_data), complaints = pest_data$complaints, traps = pest_data$traps ) str(stan_dat) ##################################### # (simple) Poisson Regression Model # ##################################### ## compile the model saved in simple_poisson_regression.stan comp_model_P <- stan_model('simple_poisson_regression.stan') # Sampling (refresh = 0 avoid printing the trace) fit_P1 <- sampling(comp_model_P, data = stan_dat) # Posterior summary print(fit_P1, pars = c('alpha','beta1'), probs = c(0.1, 0.5, 0.9)) # Alternatively: by using the stan_glm() function of rstanarm package fit2 <- stan_glm(complaints ~ traps, data = pest_data, family = poisson, prior_intercept = normal(log(4), 1), prior = normal(-0.25,1), refresh = 0, warmup = 1000, seed = 13, chains = 4, iter = 2000) round(summary(fit2)[1:2,1:8],2) # MLE fit3 <- glm(complaints ~ traps, data = pest_data, family = poisson) round(summary(fit3)$coefficients[,1:2],2) # Extract the draws res_matrix <- as.matrix(fit_P1) # in matrix form res_matrix <- as.matrix(fit_P1, pars = c('alpha','beta1')) # in matrix form res_array <- as.array(fit_P1, pars = c('alpha','beta1')) # in array form str(res_matrix) str(res_array) apply(res_matrix,2,mean) apply(res_array[,,1],2,mean) apply(res_array[,,2],2,mean) # Alternatively (parmuted indicates whether the draws after the warmup period # in each chain should be permuted and merged) e1 <- extract(fit_P1, permuted = TRUE) str(e1) mean(e1$alpha) e2 <- extract(fit_P1, permuted = FALSE) str(e2) apply(e2[,,1],2,mean) # Different possible call for analysing traceplot, acf, histograms and scatterplot # according to the previous extraction # Trace mcmc_trace(res_matrix) + labs(title = "Traceplot")# all together mcmc_trace(res_array) mcmc_trace(fit_P1, pars = c('alpha','beta1')) # Autocorrelation mcmc_acf(res_matrix) # all together mcmc_acf(res_array) mcmc_acf(fit_P1, pars = c('alpha','beta1')) # Histograms: all equivalent mcmc_hist(res_matrix) mcmc_hist(res_array) mcmc_hist(fit_P1, pars=c('alpha','beta1')) # Scatter plot: all equivalent mcmc_scatter(res_matrix, alpha = 0.2) mcmc_scatter(res_array, alpha = 0.2) mcmc_scatter(fit_P1, pars=c('alpha','beta1'), alpha = 0.2) ## Posterior predictive checks # Extract the simulated datasets and arrange into a matrix y_rep <- as.matrix(fit_P1, pars = "y_rep") # comparing the distribution of y and the distributions of the first 200 rows in the y_rep matrix ppc_dens_overlay(y = stan_dat$complaints, y_rep[1:200,]) # Plot of the observed proportion of zeros and a histogram of the # proportion of zeros in each of the simulated datasets prop_zero <- function(x) mean(x == 0) ppc_stat(pest_data$complaints, y_rep, stat = "prop_zero", binwidth = 0.005) #Plot of the standardised residuals of the observed vs predicted number of complaints mean_y_rep <- colMeans(y_rep) std_resid <- (stan_dat$complaints - mean_y_rep) / sqrt(mean_y_rep) ggplot() + geom_point(mapping = aes(x = mean_y_rep, y = std_resid)) + geom_hline(yintercept = c(-2,2)) # Plot uncertainty intervals for the predicted number of complaints for different numbers of bait stations with(stan_dat, ppc_intervals( y = complaints, yrep = y_rep, x = traps ) + labs(x = "Number of traps", y = "Number of complaints")) ####################################### # (multiple) Poisson Regression Model # ####################################### # Exploratory analysis ggplot(pest_data, aes(x = traps, y = complaints, color = factor(live_in_super, label=c(FALSE, TRUE)))) + scale_color_discrete(name = "Live-in super") + geom_jitter() + theme(legend.position = "bottom") ggplot(pest_data, aes(x = log(total_sq_foot/1e4), y = log1p(complaints))) + geom_point() + geom_smooth(method = "lm", se = FALSE) # Add the explanatory variable and the offset to the stan_dat list # To complete stan_dat$super <- pest_data$live_in_super stan_dat$sqfoot <- pest_data$total_sq_foot/1e4 # See the structure str(stan_dat) # Compile the model (after implementing it) comp_model_P2 <- stan_model('multiple_poisson_regression.stan') #Sampling from the posterior fit_P2 <- sampling(comp_model_P2, data = stan_dat, refresh=0) # Ppsterior summaries print(fit_P2, pars = c('alpha','beta1','beta2')) # PPCs y_rep2 <- as.matrix(fit_P2, pars = "y_rep") ppc_dens_overlay(stan_dat$complaints, y_rep2[1 : 200, ]) ppc_stat(pest_data$complaints, y_rep2, stat = "prop_zero", binwidth = 0.005) mean_y_rep2 <- colMeans(y_rep2) std_resid2 <- (stan_dat$complaints - mean_y_rep2) / sqrt(mean_y_rep2) ggplot() + geom_point(mapping = aes(x = mean_y_rep2, y = std_resid2)) + geom_hline(yintercept = c(-2,2)) with(stan_dat, ppc_intervals( y = complaints, yrep = y_rep2, x = traps ) + labs(x = "Number of traps", y = "Number of complaints")) ################################ # Negative Binomial Regression # ################################ # You don't need modify the stan_dat list # Compile the model comp_model_NB <- stan_model('multiple_NB_regression.stan') # Sampling fit_NB <- sampling(comp_model_NB, data = stan_dat, refresh=0) # Print a posterior summary of the regression parameters print(fit_NB, pars = c('alpha','beta1','beta2')) # PPCs y_rep3 <- as.matrix(fit_NB, pars = "y_rep") ppc_dens_overlay(stan_dat$complaints, y_rep3[1 : 200,]) ppc_stat(stan_dat$complaints, y_rep3, stat = "prop_zero", binwidth = 0.005) inv_phi <- as.matrix(fit_NB, pars = "inv_phi") mean_inv_phi <- mean(inv_phi) mean_y_rep3 <- colMeans(y_rep3) std_resid3 <- (stan_dat$complaints - mean_y_rep3) / sqrt(mean_y_rep3 + mean_y_rep3 ^ 2 * mean_inv_phi) ggplot() + geom_point(mapping = aes(x = mean_y_rep3, y = std_resid3)) + geom_hline(yintercept = c(-2,2)) with(stan_dat, ppc_intervals( y = complaints, yrep = y_rep3, x = traps ) + labs(x = "Number of traps", y = "Number of complaints")) # PPC accounting for the building id with(pest_data, ppc_stat_grouped( y = complaints, yrep = y_rep3, group = building_id, stat = 'mean', binwidth = 0.2 )) # Time series plot of the traps and the complaints for each building ggplot(pest_data, aes(x = date, y = complaints, color = live_in_super == TRUE)) + geom_line(aes(linetype = "Number of complaints")) + geom_point(color = "black") + facet_wrap(~ building_id, scales = "free", ncol = 5, labeller = label_both) + geom_line(aes(y = traps, linetype = "Number of traps"), color = "black", size = 0.25) + scale_x_date(name = "Month", date_labels = "%b") + scale_y_continuous(name = "", limits = range(pest_data$complaints)) + scale_linetype_discrete(name = "") + scale_color_discrete(name = "Live-in super") + theme(legend.position="bottom", legend.box = "horizontal", legend.title = element_text(size = 7), legend.text = element_text(size = 7), strip.text.x = element_text(size = 7), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))