Lab 3 (cont)

Analisi di dati di un biotest

library(ggplot2)
theme_set(theme_minimal())
library(gridExtra)
library(tidyr)
library(dplyr)
library(purrr)

library(latex2exp)  # optional

library(rstanarm)  # optional
options(mc.cores = parallel::detectCores())  # optional

Regressione Binomiale e campionamento da griglia per dati di un biotest (BDA3, p.74)

Dati

df1 <- data.frame(x = c(-0.86, -0.3, -0.05, 0.73), n = c(5, 5, 5, 5), y = c(0, 1, 3, 5))

Disegniamo i dati

ggplot(df1, aes(x = x, y = y)) + geom_point(size = 2, color = "red") + scale_x_continuous(breaks = df1$x,
    minor_breaks = NULL, limits = c(-1.5, 1.5)) + scale_y_continuous(breaks = 0:5, minor_breaks = NULL) +
    labs(title = "Bioassay", x = "Dose (log g/ml)", y = "Number of deaths") + theme(panel.grid.major = element_blank())

Adattiamo un modello lineare

library(rstanarm)

#' Fit Gaussian regression (a wrong model)
fit0 <- stan_glm(y ~ x, data = df1)

Note: The standard deviations reported (labeled MAD_SD) are proportional to the median absolute deviation (mad) from the median. Compared to the raw posterior standard deviation, the MAD_SD will be more robust for long-tailed distributions. T

fit0
stan_glm
 family:       gaussian [identity]
 formula:      y ~ x
 observations: 4
 predictors:   2
------
            Median MAD_SD
(Intercept) 2.6    0.4   
x           3.3    0.7   

Auxiliary parameter(s):
      Median MAD_SD
sigma 0.9    0.5   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

fit0_b <- glm(y ~ x, data = df1)
fit0_b

Call:  glm(formula = y ~ x, data = df1)

Coefficients:
(Intercept)            x  
      2.643        3.274  

Degrees of Freedom: 3 Total (i.e. Null);  2 Residual
Null Deviance:      14.75 
Residual Deviance: 0.7387   AIC: 10.59

Disegniamo il modello di regressione lineare

pb1 <- ggplot(df1, aes(x = x, y = y)) + geom_point(size = 2, color = "red") + scale_x_continuous(minor_breaks = NULL,
    limits = c(-1.5, 1.5)) + scale_y_continuous(breaks = 0:5, minor_breaks = NULL) + labs(title = "Data",
    x = "Dose (log g/ml)", y = "Number of deaths") + theme(panel.grid.major = element_blank())
#' Plot Gaussian regression
pb2 <- pb1 + geom_abline(intercept = coef(fit0)[1], slope = coef(fit0)[2], linetype = "dashed") +
    labs(title = "Linear fit", x = "Dose (log g/ml)", y = "Number of deaths")
pb2

# ggsave('figs/bioassay_fitlin.pdf', width=6, height=4)

#' Plot Gaussian regression with extended range
pb3 <- pb2 + scale_x_continuous(minor_breaks = NULL, limits = c(-1.5, 1.5)) + scale_y_continuous(breaks = -1:6,
    minor_breaks = NULL, limits = c(-0.5, 5.5)) + geom_hline(yintercept = c(0, 5), color = "lightgray")
Scale for 'x' is already present. Adding another scale for 'x', which will replace
the existing scale.
Scale for 'y' is already present. Adding another scale for 'y', which will replace
the existing scale.
pb3

# ggsave('figs/bioassay_fitlin2.pdf', width=6, height=4)

Trasformiamo i dati in proporzioni

library(latex2exp)
#' Plot data as proportions
pb4 <- ggplot(df1, aes(x = x, y = y/n)) + geom_point(size = 2, color = "red") + scale_x_continuous(minor_breaks = NULL,
    limits = c(-1.5, 1.5)) + scale_y_continuous(breaks = seq(0, 1, length.out = 6), minor_breaks = NULL) +
    labs(title = "Data", x = "Dose (log g/ml)", y = TeX("Proportion of deaths / $\\theta$")) +
    geom_hline(yintercept = c(0, 1), color = "lightgray") + theme(panel.grid.major = element_blank())
pb4

# ggsave('figs/bioassay_data2.pdf', width=6, height=4)

Adattiamo un modello logistico

#' Fit Binomial model with logit link function
fit1 <- stan_glm(y/n ~ x, family = binomial(), weights = n, data = df1)

fit1
stan_glm
 family:       binomial [logit]
 formula:      y/n ~ x
 observations: 4
 predictors:   2
------
            Median MAD_SD
(Intercept) 0.4    0.7   
x           5.1    2.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

fit1_b <- glm(y/n ~ x, family = binomial(), weights = n, data = df1)
fit1_b

Call:  glm(formula = y/n ~ x, family = binomial(), data = df1, weights = n)

Coefficients:
(Intercept)            x  
     0.8466       7.7488  

Degrees of Freedom: 3 Total (i.e. Null);  2 Residual
Null Deviance:      15.79 
Residual Deviance: 0.05474  AIC: 7.965

Disegniamo il modello Binomiale nello spazio logit

#' Plot Binomial model fit in logit space
pb5l <- ggplot(df1) + scale_x_continuous(minor_breaks = NULL, limits = c(-1.5, 1.5)) +
    scale_y_continuous(minor_breaks = NULL) + labs(title = "Logistic regression in latent space",
    x = "Dose (log g/ml)", y = TeX("logit$(\\theta)$")) + stat_function(fun = function(x) (coef(fit1)[1] +
    coef(fit1)[2] * x), linetype = "dashed", color = "darkgreen") + theme(panel.grid.major = element_blank())
pb5l

# ggsave('figs/bioassay_fitlogitspace.pdf', width=6, height=4)

e nello spazio \(\theta\)

#' Plot Binomial model fit in theta space
pb5 <- pb4 + stat_function(fun = function(x) invlogit(coef(fit1)[1] + coef(fit1)[2] *
    x), linetype = "dashed", color = "blue") + labs(title = "Logistic regression fit",
    x = "Dose (log g/ml)", y = TeX("Proportion of deaths / $\\theta$"))
pb5

# ggsave('figs/bioassay_fitbinom.pdf', width=6, height=4)

Dimostrazione della valutazione su griglia e campionamento da griglia

#' Demonstrate grid evaluation and sampling
A = seq(-4, 8, length.out = 50)
B = seq(-10, 40, length.out = 50)
#' Make vectors that contain all pairwise combinations of A and B
cA <- rep(A, each = length(B))
cB <- rep(B, length(A))

Una funzione d’aiuto

#' Make a helper function to calculate the log likelihood
#' given a dataframe with x, y, and n and evaluation
#' points a and b. For the likelihood see BDA p. 75
logl <- function(df, a, b) df["y"] * (a + b * df["x"]) - df["n"] * log1p(exp(a + b * df["x"]))

Calcolo della verosimiglianza

#' Calculate likelihoods: apply logl function for each observation
#' ie. each row of data frame of x, n and y
p <- apply(df1, 1, logl, cA, cB) %>%
    # sum the log likelihoods of observations and exponentiate to get the joint
    # likelihood
rowSums() %>%
    exp()

Campionamento dalla griglia

#' Sample from the grid (with replacement)
nsamp <- 1000
samp_indices <- sample(length(p), size = nsamp, replace = T, prob = p/sum(p))
samp_A <- cA[samp_indices[1:nsamp]]
samp_B <- cB[samp_indices[1:nsamp]]
#' Add random jitter, see BDA3 p. 76
samp_Aj <- samp_A + runif(nsamp, (A[1] - A[2])/2, (A[2] - A[1])/2)
samp_Bj <- samp_B + runif(nsamp, (B[1] - B[2])/2, (B[2] - B[1])/2)

Creiamo un dataframe

#'  Create data frame
samps <- data_frame(ind = 1:nsamp, alpha = samp_A, beta = samp_B) %>%
    mutate(ld50 = -alpha/beta)
Warning: `data_frame()` was deprecated in tibble 1.1.0.
ℹ Please use `tibble()` instead.
sampsj <- data_frame(ind = 1:nsamp, alpha = samp_Aj, beta = samp_Bj) %>%
    mutate(ld50 = -alpha/beta)

Grafico della densità a posteriori

#' Create a plot posterior density evaluated in a grid
xl <- c(-1, 5)
yl <- c(-2, 30)
pos <- ggplot(data = data.frame(cA, cB, p), aes(cA, cB)) + geom_raster(aes(fill = p),
    interpolate = T) + coord_cartesian(xlim = xl, ylim = yl) + labs(title = "Posterior density evaluated in a grid",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$")) + scale_fill_gradient(low = "white", high = "red")

#' Plot posterior density evaluated in a grid with interpolation and contours
pos + geom_contour(aes(z = p), colour = "black", size = 0.2)

# ggsave('figs/bioassay_grid1.pdf', width=6, height=4)

#' Plot posterior density evaluated in a grid with interpolation
pos

# ggsave('figs/bioassay_grid2.pdf', width=6, height=4)

#' Plot posterior density evaluated in a grid without interpolation
pos <- ggplot(data = data.frame(cA, cB, p), aes(cA, cB)) + geom_raster(aes(fill = p),
    interpolate = F) + coord_cartesian(xlim = xl, ylim = yl) + labs(title = "Posterior density evaluated in a grid",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$")) + scale_fill_gradient(low = "white", high = "red")
pos

# ggsave('figs/bioassay_grid3.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid3.png', width=6, height=4)

#' Plot posterior density evaluated in a grid and a posterior draw
pos + geom_point(data = samps[1, ], aes(alpha, beta), color = "blue", size = 1) + labs(title = "Posterior density and draws in a grid",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$"))

# ggsave('figs/bioassay_grid4.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid4.png', width=6, height=4)

#' Plot posterior density evaluated in a grid and 100 posterior draws
pos + geom_point(data = samps[1:100, ], aes(alpha, beta), color = "blue", size = 1) +
    labs(title = "Posterior density and draws in a grid", x = TeX("$\\alpha$"), y = TeX("$\\beta$"))

# ggsave('figs/bioassay_grid5.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid5.png', width=6, height=4) + transition_reveal(id=id,
# along=ind) + shadow_trail(0.01)

#' Plot posterior density evaluated in a grid and 1000 posterior draws
pos + geom_point(data = samps, aes(alpha, beta), color = "blue", size = 1) + labs(title = "Posterior density and draws in a grid",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$"))

# ggsave('figs/bioassay_grid6.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid6.png', width=6, height=4)

pos + geom_point(data = sampsj, aes(alpha, beta), color = "blue", size = 1) + labs(title = "Posterior density in a grid and jittered draws",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$"))

# ggsave('figs/bioassay_grid7.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid7.png', width=6, height=4)

#' For illustration use coarser a grid
A = seq(-4, 8, length.out = 25)
B = seq(-10, 40, length.out = 25)

#' Make vectors that contain all pairwise combinations of A and B
cA <- rep(A, each = length(B))
cB <- rep(B, length(A))

#' Make a helper function to calculate the log likelihood
#' given a dataframe with x, y, and n and evaluation
#' points a and b. For the likelihood see BDA p. 75
logl <- function(df, a, b) df["y"] * (a + b * df["x"]) - df["n"] * log1p(exp(a + b * df["x"]))

#' Calculate likelihoods: apply logl function for each observation
#' ie. each row of data frame of x, n and y
pc <- apply(df1, 1, logl, cA, cB) %>%
    # sum the log likelihoods of observations and exponentiate to get the joint
    # likelihood
rowSums() %>%
    exp()

#' Plot posterior density evaluated in a coarser grid
xl <- c(-1, 5)
yl <- c(-2, 30)
posc <- ggplot(data = data.frame(cA, cB, pc), aes(cA, cB)) + geom_raster(aes(fill = pc),
    interpolate = F) + coord_cartesian(xlim = xl, ylim = yl) + labs(title = "Posterior density evaluated in a grid",
    x = TeX("$\\alpha$"), y = TeX("$\\beta$")) + scale_fill_gradient(low = "white", high = "red")
posc

# ggsave('figs/bioassay_grid3_1.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid3_1.png', width=6, height=4)

#' Plot posterior density evaluated in a coarser grid and cell mid points
posc + geom_point(size = 0.5)

# ggsave('figs/bioassay_grid3_2.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid3_2.png', width=6, height=4)

#' Plot posterior density evaluated in a coarser grid and annotate three cells
posc + annotate("text", x = cA[which(p == max(p))], y = cB[which(p == max(p))], label = "1",
    size = 5) + annotate("text", x = cA[which(p == max(p)) + 50], y = cB[which(p == max(p)) +
    50], label = "2", size = 5) + annotate("text", x = cA[which(p == max(p)) + 100], y = cB[which(p ==
    max(p)) + 100], label = "3", size = 5)
Warning: Removed 1 rows containing missing values (geom_text).
Removed 1 rows containing missing values (geom_text).
Removed 1 rows containing missing values (geom_text).

# ggsave('figs/bioassay_grid3_3.pdf', width=6, height=4)
# ggsave('figs/bioassay_grid3_3.png', width=6, height=4)

#' Densities and probabilities in three annotated cells
round(c(p[which(p == max(p))], p[which(p == max(p)) + 50], p[which(p == max(p)) + 100]),
    4)
[1] 0.0027 0.0026 0.0023
pn <- p/sum(p)
round(c(pn[which(p == max(p))], pn[which(p == max(p)) + 50], pn[which(p == max(p)) + 100]),
    4)
[1] 0.0105 0.0101 0.0086

Estrazioni a posteriori delle curve logistiche

library(purrr)
#' Compute posterior draws of logistic curves
draws <- as.data.frame(fit1)
colnames(draws) <- c("alpha", "beta")
draws <- mutate(draws, id = 1:4000, ld50 = -alpha/beta)
xr <- seq(-1.5, 1.5, length.out = 100)
draws100 <- draws[seq(1, 4000, length.out = 100), ]
dff <- pmap_df(draws100, ~data_frame(x = xr, id = ..3, f = invlogit(..1 + ..2 * x)))

Alternativamente

invlogit <- plogis
xr <- seq(-1.5, 1.5, length.out = 100)
dff <- pmap_df(samps[1:100, ], ~data_frame(x = xr, id = ..1, f = invlogit(..2 + ..3 *
    x)))

Disegniamo le estrazioni a posteriori delle curve logistiche

#' Plot posterior draws of logistic curves
pb6 <- ggplot(df1, aes(x = x, y = y/n)) + geom_line(data = dff, aes(x = x, y = f, group = id),
    linetype = 1, color = "blue", alpha = 0.2) + geom_point(size = 2, color = "red") +
    scale_x_continuous(minor_breaks = NULL, limits = c(-1.5, 1.5)) + scale_y_continuous(breaks = seq(0,
    1, length.out = 6), minor_breaks = NULL) + labs(title = "Posterior draws", x = "Dose (log g/ml)",
    y = TeX("Proportion of deaths / $\\theta$")) + geom_hline(yintercept = c(0, 1), color = "lightgray") +
    theme(panel.grid.major = element_blank())
pb6

# ggsave('figs/bioassay_post.pdf', width=6, height=4)

Aggiungiamo le estrazioni a posteriori di LD50

#' Plot posterior draws of logistic curves + posterior draws of LD50
pb6 + geom_hline(yintercept = c(0.5), color = "lightgray", linetype = "dashed") + geom_point(data = draws100,
    aes(x = ld50, y = 0.5), color = "blue", alpha = 0.2)

# ggsave('figs/bioassay_postld50.pdf', width=6, height=4)

Istogramma delle estrazioni a posteriori di LD50

#' Plot histogram of posterior draws of LD50
ggplot(data = draws, aes(x = ld50)) + geom_histogram(fill = "steelblue", color = "black",
    bins = 60) + scale_x_continuous(minor_breaks = NULL, limits = c(-1.5, 1.5)) + labs(title = "Bioassay LD50",
    x = "Dose (log g/ml)", y = "") + theme(panel.grid.major = element_blank())
Warning: Removed 1 rows containing non-finite values (stat_bin).

# ggsave('figs/bioassay_histld50.pdf', width=6, height=4)