## ----eval=TRUE,echo=TRUE, warning=FALSE, results='hide',message=FALSE-------------------------------------------- library(rstan) library(dplyr) library(lubridate) library(ggplot2) library(bayesplot) theme_set(bayesplot::theme_default()) # seed for R's pseudo-RNGs, not Stan's set.seed(1123) ## ----eval=TRUE,echo=TRUE----------------------------------------------------------------------------------------- pest_data <- readRDS('pest_data.RDS') str(pest_data) summary(pest_data) ##number of buildings N_buildings <- length(unique(pest_data$building_id)) N_buildings ## ----eval=TRUE,echo=FALSE---------------------------------------------------------------------------------------- ggplot(pest_data, aes(x = complaints)) + geom_bar() ggplot(pest_data, aes(x = traps, y = complaints, color = live_in_super == TRUE)) + geom_jitter() ## ---- eval=TRUE,echo=FALSE, data-plots-ts, fig.height = 16,fig.width = 6, fig.height = 8------------------------- ggplot(pest_data, aes(x = date, y = complaints, color = live_in_super == TRUE)) + geom_line(aes(linetype = "Number of complaints")) + geom_point(color = "black") + geom_line(aes(y = traps, linetype = "Number of traps"), color = "black", size = 0.25) + facet_wrap(~building_id, scales = "free", ncol = 2, labeller = label_both) + scale_x_date(name = "Month", date_labels = "%b") + scale_y_continuous(name = "", limits = range(pest_data$complaints)) + scale_linetype_discrete(name = "") + scale_color_discrete(name = "Live-in super") ## ---------------------------------------------------------------------------------------------------------------- ## compile the model comp_model_P <- stan_model('simple_poisson_regression.stan') ## ----stan-data--------------------------------------------------------------------------------------------------- ## arrange data into a list stan_dat_simple <- list( N = nrow(pest_data), complaints = pest_data$complaints, traps = pest_data$traps ) str(stan_dat_simple) ## ----fit_P_real_data, cache=TRUE--------------------------------------------------------------------------------- ## fit the model fit_P_real_data <- sampling(comp_model_P, data = stan_dat_simple) ## ----results_simple_P-------------------------------------------------------------------------------------------- ## print the parameters print(fit_P_real_data, pars = c('alpha','beta')) ## ----hist_simple_P----------------------------------------------------------------------------------------------- mcmc_hist(as.matrix(fit_P_real_data, pars = c('alpha','beta'))) mcmc_scatter(as.matrix(fit_P_real_data, pars = c('alpha','beta')), alpha = 0.2) ## ---------------------------------------------------------------------------------------------------------------- ## posterior predictive checking y_rep <- as.matrix(fit_P_real_data, pars = "y_rep") ppc_dens_overlay(y = stan_dat_simple$complaints, y_rep[1:200,]) ## ---------------------------------------------------------------------------------------------------------------- ## standardised residuals of the observed vs predicted number of complaints mean_y_rep <- colMeans(y_rep) std_resid <- (stan_dat_simple$complaints - mean_y_rep) / sqrt(mean_y_rep) qplot(mean_y_rep, std_resid) + hline_at(2) + hline_at(-2) ## ---------------------------------------------------------------------------------------------------------------- ggplot(pest_data, aes(x = log(total_sq_foot), y = log1p(complaints))) + geom_point() + geom_smooth(method = "lm", se = FALSE) ## ---------------------------------------------------------------------------------------------------------------- ## add the two variables to the list of the data stan_dat_simple$log_sq_foot <- log(pest_data$total_sq_foot/1e4) stan_dat_simple$live_in_super <- pest_data$live_in_super str(stan_dat_simple) ## ----compmultPDGP------------------------------------------------------------------------------------------------ ## compile the model comp_model_P_mult <- stan_model('multiple_poisson_regression.stan') ## ----fit_mult_P_real_dat----------------------------------------------------------------------------------------- fit_model_P_mult_real <- sampling(comp_model_P_mult, data = stan_dat_simple) y_rep <- as.matrix(fit_model_P_mult_real, pars = "y_rep") ppc_dens_overlay(stan_dat_simple$complaints, y_rep[1:200,]) ## ---------------------------------------------------------------------------------------------------------------- ppc_intervals( y = stan_dat_simple$complaints, yrep = y_rep, x = stan_dat_simple$traps ) + labs(x = "Number of traps", y = "Number of complaints") ## ----message=FALSE, cache=TRUE, results="hide"------------------------------------------------------------------- comp_model_NB <- stan_model('multiple_NB_regression.stan') ## ----runNB------------------------------------------------------------------------------------------------------- fitted_model_NB <- sampling(comp_model_NB, data = stan_dat_simple) #fitted_model_NB <- stan(file="multiple_NB_regression.stan", data=stan_dat_simple) samps_NB <- rstan::extract(fitted_model_NB) ## ----ppc-full---------------------------------------------------------------------------------------------------- ## predictions vs. the data y_rep <- samps_NB$y_rep ppc_dens_overlay(stan_dat_simple$complaints, y_rep[1:200,]) ## ---------------------------------------------------------------------------------------------------------------- ## standardised residuals mean_inv_phi <- mean(samps_NB$inv_phi) mean_y_rep <- colMeans(y_rep) std_resid <- (stan_dat_simple$complaints - mean_y_rep) / sqrt(mean_y_rep + mean_y_rep^2*mean_inv_phi) qplot(mean_y_rep, std_resid) + hline_at(2) + hline_at(-2) ## ---------------------------------------------------------------------------------------------------------------- ## prediction by number of traps ppc_intervals( y = stan_dat_simple$complaints, yrep = y_rep, x = stan_dat_simple$traps ) + labs(x = "Number of traps", y = "Number of complaints") ## ----ppc-group_means--------------------------------------------------------------------------------------------- ppc_stat_grouped( y = stan_dat_simple$complaints, yrep = y_rep, group = pest_data$building_id, stat = 'mean', binwidth = 0.2 )