######################
# Lab 2 - 26/10/2021 #
######################

########################################################
# Comparison of four estimators of the population mean #
########################################################

# Initial settings:
# Number of replications B = 1000
# Sample size n = 10
# Population mean mu = 5
# Population variance sigma^2 = 4
B <- 1000
n <- 10
mu <- 5
sigma <- 2

# Save the results in a matrix:
# Column 1: sample mean
# Column 2: sample median
# Column 3: semi-sum of minimum and maximum (midrange)
# Column 4: trimmed sample mean
est <- matrix(0, B, 4)

set.seed(1234)

# For each replication we draw a sample of size n = 10 from
# a normal r.v. with mean mu and variance sigma^2
# and compute the four sample estimates
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  est[i,1] <- mean(x)
  est[i,2] <- median(x)
  est[i,3] <- (min(x) + max(x))/2
  est[i,4] <- sum(sort(x)[-c(1, n)])/(n - 2)
}

# Boxplots of the sampling distributions of the four estimators
# The blue horizontal line is the true population mean
par(mfrow = c(1, 1), xaxt = "n")
boxplot(est, main = "Comparison between four estimators")
par(xaxt = "s")
axis(1, 1:4, c(expression(hat(mu)[1]), expression(hat(mu)[2]),
               expression(hat(mu)[3]), expression(hat(mu)[4])))
abline(h = mu, lwd = 2, col = "blue")

# Bias
bias <- apply(est, 2, mean) - mu
bias

# Variances
variance <- apply(est, 2, var)
variance

# We repeat the previous steps with n = 200
est_200 <- matrix(0, B, 4)
n <- 200
set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  est_200[i, 1] <- mean(x)
  est_200[i, 2] <- median(x)
  est_200[i, 3] <- (max(x) + min(x))/2
  est_200[i, 4] <- sum(sort(x)[-c(1, n)])/(n - 2)
}

# Histograms of the sample estimates for n = 10 and n = 200
# The blue vertical line is the true population mean
library(MASS)

# n = 10
par(mfrow = c(2, 4))
for (j in 1:4){
  hist.scott(est[, j], prob = TRUE,
             main = substitute(hat(mu)[j], list(j = j)),
             xlab = "", xlim = c(0, 10), cex.main = 1.5)
  abline(v = mu, col = "blue", lwd = 2)
}

# n = 200
for (j in 1:4){
  hist.scott(est_200[, j], prob = TRUE,
             main = substitute(hat(mu)[j], list(j = j)),
             xlab = "", xlim = c(0, 10), cex.main = 1.5)
  abline(v = mu, col = "blue", lwd = 2)
}

# Try with n = 1000

########################
# Confidence Intervals #
########################

####################################################
# Normal case with sigma^2 known                   #
# (we take mu = 5 and sigma = 2 initialized above) #
####################################################
B <- 1000
n <- 10

# 1 - alpha is the confidence level
# CI is the matrix where we save the confidence interval of each replication:
#  - first column: lower bound
#  - second column: upper bound
# l is a vector whose elements are TRUE (1) or FALSE (0) depending on
# whether the true parameter value lies within the interval
alpha <- 0.05
CI <- matrix(0, B, 2)
l <- rep(0, B)

set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  CI[i,1] <- mean(x) - qnorm(1 - alpha/2)*sigma/sqrt(n)
  CI[i,2] <- mean(x) + qnorm(1 - alpha/2)*sigma/sqrt(n)
  l[i] <- (mu > CI[i,1] & mu < CI[i,2])
}

# Empirical coverage probability
mean(l)

# Plot the first 100 confidence intervals:
#  - black: intervals that do not include mu
#  - red: intervals that include mu
par(mfrow = c(1, 1))
plot(1, xlim = c(0, 10), ylim = c(0, 11), type = "n",
     xlab = expression(mu), ylab = "",
     main = "100 CIs for the mean (known variance)", cex.main = 1.2)
abline(v = mu)
d <- 0
for (i in 1:100){
  d <- d + 0.1
  lines(seq(CI[i,1], CI[i,2], length = 100), rep(d, 100), col = (l[i] + 1))
}
sum(l[1:100])
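# Aside (not part of the original lab): a minimal sketch that repeats the
# known-variance coverage experiment for several confidence levels; the
# helper coverage_z() is introduced here for illustration only.
coverage_z <- function(alpha, B = 1000, n = 10, mu = 5, sigma = 2){
  z <- qnorm(1 - alpha/2)
  hits <- replicate(B, {
    x <- rnorm(n, mu, sigma)
    abs(mean(x) - mu) < z*sigma/sqrt(n)  # TRUE if the z-interval covers mu
  })
  mean(hits)  # empirical coverage probability
}
set.seed(1234)
sapply(c(0.10, 0.05, 0.01), coverage_z)  # should be close to 0.90, 0.95, 0.99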
####################################
# Normal case with sigma^2 unknown #
####################################
B <- 1000
n <- 10
alpha <- 0.05
CI <- matrix(0, B, 2)
l <- rep(0, B)

set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  CI[i,1] <- mean(x) - qt(1 - alpha/2, n - 1)*sd(x)/sqrt(n)
  # equivalently: CI[i,1] <- mean(x) + qt(alpha/2, n - 1)*sd(x)/sqrt(n)
  CI[i,2] <- mean(x) + qt(1 - alpha/2, n - 1)*sd(x)/sqrt(n)
  l[i] <- (mu > CI[i,1] & mu < CI[i,2])
}

# Empirical coverage probability
mean(l)

plot(1, xlim = c(0, 10), ylim = c(0, 11), type = "n",
     xlab = expression(mu), ylab = "",
     main = "100 CIs for the mean (unknown variance)", cex.main = 1.2)
abline(v = mu)
d <- 0
for (i in 1:100){
  d <- d + 0.1
  lines(seq(CI[i,1], CI[i,2], length = 100), rep(d, 100), col = (l[i] + 1))
}
sum(l[1:100])

###############################
# Application with real data #
###############################
library(DAAG)
data(pair65)

pair_data_frame <- cbind(pair65, pair65[,1] - pair65[,2])
pair_data_frame <- cbind(1:9, pair_data_frame)
dimnames(pair_data_frame) <- list(1:9, c("pair", "heated", "ambient", "difference"))
n <- nrow(pair65)

# Compute the quantiles of the Student's t with n - 1 = 8 degrees of freedom
alpha <- 0.05
q_inf_95 <- qt(alpha/2, df = n - 1)
q_sup_95 <- qt(1 - alpha/2, df = n - 1)
alpha <- 0.01
q_inf_99 <- qt(alpha/2, df = n - 1)
q_sup_99 <- qt(1 - alpha/2, df = n - 1)

# Compute the confidence intervals
d <- mean(pair_data_frame[, 4])
d
s <- sd(pair_data_frame[, 4])
s

# 95% confidence interval
CI_95 <- c(d + q_inf_95*s/sqrt(n), d + q_sup_95*s/sqrt(n))
CI_95

# 99% confidence interval
CI_99 <- c(d + q_inf_99*s/sqrt(n), d + q_sup_99*s/sqrt(n))
CI_99

library(RColorBrewer)
plotclr <- brewer.pal(6, "YlOrRd")
par(mfrow = c(1, 2))

# Density of the pivotal quantity (Student's t with 8 df), shading the
# region between the endpoints that encloses 95% of the probability
curve(dt(x, 8), xlim = c(-6, 6), ylim = c(0, 0.4), main = "",
      col = "blue", lwd = 3, xlab = "t", ylab = expression(t[8]), yaxs = "i")
cord.x <- c(q_inf_95, seq(q_inf_95, q_sup_95, 0.01), q_sup_95)
cord.y <- c(0, dt(seq(q_inf_95, q_sup_95, 0.01), 8), 0)
polygon(cord.x, cord.y, col = plotclr[3], border = NA)

# Density of the pivotal quantity (Student's t with 8 df), shading the
# region between the endpoints that encloses 99% of the probability
curve(dt(x, 8), xlim = c(-6, 6), ylim = c(0, 0.4), main = "",
      col = "blue", lwd = 3, xlab = "t", ylab = expression(t[8]), yaxs = "i")
cord.x2 <- c(q_inf_99, seq(q_inf_99, q_sup_99, 0.01), q_sup_99)
cord.y2 <- c(0, dt(seq(q_inf_99, q_sup_99, 0.01), 8), 0)
polygon(cord.x2, cord.y2, col = plotclr[5], border = NA)

######################
# Hypothesis testing #
######################

#############################################
# Application: test for the mean difference #
#############################################
Followers_M <- c(135, 118, 113, 107, 98, 86)
Followers_O <- c(123, 110, 106, 103, 91, 89, 89, 62, 58)

## By hand
n1 <- length(Followers_M)
n2 <- length(Followers_O)
mean_M <- mean(Followers_M)
mean_O <- mean(Followers_O)

# Pooled variance: see Maindonald and Braun, "Data Analysis and Graphics
# Using R: An Example-Based Approach", pages 104 and 67
v <- (var(Followers_M)*(n1 - 1) + var(Followers_O)*(n2 - 1))/(n1 + n2 - 2)
v
test.stat <- (mean_M - mean_O)/sqrt(v*(1/n1 + 1/n2))
test.stat
pt(test.stat, 13, lower.tail = FALSE)  # df = n1 + n2 - 2 = 13

# By using the t.test() function
t <- t.test(Followers_M, Followers_O, mu = 0,
            alternative = "greater", var.equal = TRUE)
t
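# Aside (not part of the original lab): dropping the equal-variance
# assumption, the same call with var.equal = FALSE gives Welch's t-test,
# which approximates the degrees of freedom (Welch-Satterthwaite).
t_welch <- t.test(Followers_M, Followers_O, mu = 0,
                  alternative = "greater", var.equal = FALSE)
t_welch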
par(mfrow = c(1, 1))

# Plot of the rejection region and of the observed test statistic
curve(dt(x, 13), xlim = c(-5, 5), ylim = c(0, 0.4),
      main = "p-values and rejection region", col = "blue", lwd = 2,
      xlab = "x-y", ylab = expression(t[13]), yaxs = "i")
cord.x <- c(qt(0.95, 13), seq(qt(0.95, 13), 5, 0.01), 5)
cord.y <- c(0, dt(seq(qt(0.95, 13), 5, 0.01), 13), 0)
polygon(cord.x, cord.y, col = plotclr[3], border = NA)
abline(v = t$statistic, lty = 2, lwd = 2, col = "red")
text(0, 0.2, expression(paste("Accept ", H[0])))
text(2.7, 0.08, expression(paste("Reject ", H[0])))
text(as.double(t$statistic) - 0.15, 0.02, "t", col = "red", cex = 1.2)

###################################
# Pearson's chi-squared test: GOF #
###################################
# Initial settings:
# number of attempts n = 50
# number of zones K = 4
# vector of probabilities p = c(7/16, 5/16, 3/16, 1/16)
n <- 50
K <- 4
p <- c(7/16, 5/16, 3/16, 1/16)

# Generate the values: suppose we are sampling from p
set.seed(1234)
x <- sample(1:K, n, replace = TRUE, prob = p)

# Observed (absolute) frequencies
observed <- table(x)
observed

# Expected (absolute) frequencies
expected <- n*p
expected

# Observed test statistic
X2 <- sum((observed - expected)^2/expected)
X2

# Manually compute the p-value
pchisq(X2, df = K - 1, lower.tail = FALSE)

# Same result with the chisq.test() function
chisq.test(observed, p = p)

# Suppose we now generate according to a different vector of probabilities
p2 <- c(5/16, 3/16, 6/16, 2/16)
x_after_energy_drink <- sample(1:K, n, replace = TRUE, prob = p2)
new_observed <- table(x_after_energy_drink)
new_observed

# New test after the energy drink
chisq.test(new_observed, p = p)

###########################################
# Pearson's chi-squared test: homogeneity #
###########################################
# Initial settings:
# number of attempts n = 50
# number of zones K = 4
# number of friends M = 6
n <- 50
K <- 4
M <- 6
x <- matrix(0, M, n)
p <- c(7/16, 5/16, 3/16, 1/16)

# Generate the values
for (m in 1:M){
  x[m, ] <- sample(1:K, n, replace = TRUE, prob = p)
}
# One column of counts per friend (K x M matrix)
observed_matrix <- apply(x, 1, table)
# With a matrix of counts, chisq.test() performs the test of
# homogeneity/independence and the p argument is ignored
chisq.test(observed_matrix, p = p)

# Add a great player to the games
M <- 7
n <- 50
K <- 4
x <- matrix(0, M, n)
p <- c(7/16, 5/16, 3/16, 1/16)
set.seed(123)
# The 6 friends
for (m in 1:(M - 1)){
  x[m, ] <- sample(1:K, n, replace = TRUE, prob = p)
}
# The great player
x[M, ] <- sample(1:K, n, replace = TRUE, prob = c(1/16, 3/16, 5/16, 7/16))
observed_matrix <- apply(x, 1, table)
observed_matrix

# Test of homogeneity
chisq.test(observed_matrix, p = p)

#############################
# Weak law of large numbers #
#############################
law_large_numb <- function(n, p){
  # Relative frequency of successes in n Bernoulli(p) trials
  x <- rbinom(n, 1, p)
  return(mean(x))
}
law_large_numb <- Vectorize(law_large_numb)

# Plot
set.seed(12345)
par(mfrow = c(1, 1))
curve(law_large_numb(x, p = 0.5), from = 1, to = 1000,
      xlab = "n", ylab = "frequency")
abline(a = 0.5, b = 0, lwd = 2, col = "red")
legend(80, 0.9, c("X/n", "p"), lty = 1, col = c("black", "red"), lwd = c(1, 2))

# Another plot, obtained by increasing the n argument of curve()
# (the number of points at which the function is evaluated)
par(mfrow = c(1, 1))
curve(law_large_numb(x, p = 0.5), n = 1000, from = 1, to = 1000,
      xlab = "n", ylab = "frequency")
abline(a = 0.5, b = 0, lwd = 2, col = "red")
legend(80, 0.9, c("X/n", "p"), lty = 1, col = c("black", "red"), lwd = c(1, 2))
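# Aside (not part of the original lab): a sketch that overlays a few
# independent sample paths of the running frequency, so the path-to-path
# variability can be seen shrinking as n grows.
set.seed(12345)
n_max <- 1000
paths <- sapply(1:5, function(k) cumsum(rbinom(n_max, 1, 0.5))/(1:n_max))
matplot(paths, type = "l", lty = 1, xlab = "n", ylab = "running frequency")
abline(h = 0.5, lwd = 2, col = "red")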