######################
# Lab 2 - 26/10/2021 #
######################

########################################################
# Comparison of four estimators of the population mean #
########################################################

# Initial settings:
# Number of replications B = 1000
# Sample size n = 10
# Population mean mu = 5
# Population variance sigma^2 = 4
B <- 1000
n <- 10
mu <- 5
sigma <- 2

# Save the results in a matrix:
# Column 1: sample mean
# Column 2: sample median
# Column 3: semi-sum of minimum and maximum (midrange)
# Column 4: trimmed sample mean
est <- matrix(0, B, 4)

set.seed(1234)

# For each replication we draw a sample of size n = 10 from
# a normal r.v. with mean mu and variance sigma^2
# and compute the four sample estimates
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  est[i,1] <- mean(x)
  est[i,2] <- median(x)
  est[i,3] <- (min(x) + max(x))/2
  est[i,4] <- sum(sort(x)[-c(1, n)])/(n - 2)
}

# Boxplots of the sampling distributions of the four estimators
# The blue horizontal line is the true population mean
par(mfrow = c(1, 1), xaxt = "n")
boxplot(est, main = "Comparison between four estimators")
par(xaxt = "s")
axis(1, 1:4, c(expression(hat(mu)[1]), expression(hat(mu)[2]),
               expression(hat(mu)[3]), expression(hat(mu)[4])))
abline(h = mu, lwd = 2, col = "blue")

# Bias
bias <- apply(est, 2, mean) - mu
bias

# Variances
variance <- apply(est, 2, var)
variance

# We repeat the previous steps with n = 200
est_200 <- matrix(0, B, 4)
n <- 200
set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  est_200[i, 1] <- mean(x)
  est_200[i, 2] <- median(x)
  est_200[i, 3] <- (max(x) + min(x))/2
  est_200[i, 4] <- sum(sort(x)[-c(1, n)])/(n - 2)
}

# Histograms of the sample estimates for n = 10 and n = 200
# The blue vertical line is the true population mean
library(MASS)

# n = 10
par(mfrow = c(2, 4))
for (j in 1:4){
  hist.scott(est[, j], prob = TRUE,
             main = substitute(hat(mu)[j], list(j = j)),
             xlab = "", xlim = c(0, 10), cex.main = 1.5)
  abline(v = mu, col = "blue", lwd = 2)
}

# n = 200
for (j in 1:4){
  hist.scott(est_200[, j], prob = TRUE,
             main = substitute(hat(mu)[j], list(j = j)),
             xlab = "", xlim = c(0, 10), cex.main = 1.5)
  abline(v = mu, col = "blue", lwd = 2)
}

# Try with n = 1000

########################
# Confidence Intervals #
########################

####################################################
# Normal case with sigma^2 known                   #
# (we take mu = 5 and sigma = 2 initialized above) #
####################################################
B <- 1000
n <- 10

# 1 - alpha is the confidence level
# CI is the matrix where we save the confidence interval of each replication:
#  - first column: lower bound
#  - second column: upper bound
# l is a vector whose elements are TRUE (1) or FALSE (0) depending on
# whether the true parameter value lies within the interval
alpha <- 0.05
CI <- matrix(0, B, 2)
l <- rep(0, B)

set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  CI[i,1] <- mean(x) - qnorm(1 - alpha/2)*sigma/sqrt(n)
  CI[i,2] <- mean(x) + qnorm(1 - alpha/2)*sigma/sqrt(n)
  l[i] <- (mu > CI[i,1] & mu < CI[i,2])
}

# Empirical coverage probability
mean(l)

# Plot the first 100 confidence intervals:
#  - black: intervals that do not include mu
#  - red: intervals that include mu
par(mfrow = c(1, 1))
plot(1, xlim = c(0, 10), ylim = c(0, 11), type = "n",
     xlab = expression(mu), ylab = "",
     main = "100 CIs for the mean (known variance)", cex.main = 1.2)
abline(v = mu)
d <- 0
for (i in 1:100){
  d <- d + 0.1
  lines(seq(CI[i,1], CI[i,2], length = 100), rep(d, 100), col = (l[i] + 1))
}
sum(l[1:100])
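# Aside (not part of the original lab): a minimal sketch that repeats the
# known-variance coverage experiment for several confidence levels; the
# helper coverage_z() is introduced here for illustration only.
coverage_z <- function(alpha, B = 1000, n = 10, mu = 5, sigma = 2){
  z <- qnorm(1 - alpha/2)
  hits <- replicate(B, {
    x <- rnorm(n, mu, sigma)
    abs(mean(x) - mu) < z*sigma/sqrt(n)  # TRUE if the z-interval covers mu
  })
  mean(hits)  # empirical coverage probability
}
set.seed(1234)
sapply(c(0.10, 0.05, 0.01), coverage_z)  # should be close to 0.90, 0.95, 0.99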
####################################
# Normal case with sigma^2 unknown #
####################################
B <- 1000
n <- 10
alpha <- 0.05
CI <- matrix(0, B, 2)
l <- rep(0, B)

set.seed(1234)
for (i in 1:B){
  x <- rnorm(n, mu, sigma)
  CI[i,1] <- mean(x) - qt(1 - alpha/2, n - 1)*sd(x)/sqrt(n)
  # equivalently: CI[i,1] <- mean(x) + qt(alpha/2, n - 1)*sd(x)/sqrt(n)
  CI[i,2] <- mean(x) + qt(1 - alpha/2, n - 1)*sd(x)/sqrt(n)
  l[i] <- (mu > CI[i,1] & mu < CI[i,2])
}

# Empirical coverage probability
mean(l)

plot(1, xlim = c(0, 10), ylim = c(0, 11), type = "n",
     xlab = expression(mu), ylab = "",
     main = "100 CIs for the mean (unknown variance)", cex.main = 1.2)
abline(v = mu)
d <- 0
for (i in 1:100){
  d <- d + 0.1
  lines(seq(CI[i,1], CI[i,2], length = 100), rep(d, 100), col = (l[i] + 1))
}
sum(l[1:100])

###############################
# Application with real data #
###############################
library(DAAG)
data(pair65)

pair_data_frame <- cbind(pair65, pair65[,1] - pair65[,2])
pair_data_frame <- cbind(1:9, pair_data_frame)
dimnames(pair_data_frame) <- list(1:9, c("pair", "heated", "ambient", "difference"))
n <- nrow(pair65)

# Compute the quantiles of the Student's t with n - 1 = 8 degrees of freedom
alpha <- 0.05
q_inf_95 <- qt(alpha/2, df = n - 1)
q_sup_95 <- qt(1 - alpha/2, df = n - 1)
alpha <- 0.01
q_inf_99 <- qt(alpha/2, df = n - 1)
q_sup_99 <- qt(1 - alpha/2, df = n - 1)

# Compute the confidence intervals
d <- mean(pair_data_frame[, 4])
d
s <- sd(pair_data_frame[, 4])
s

# 95% confidence interval
CI_95 <- c(d + q_inf_95*s/sqrt(n), d + q_sup_95*s/sqrt(n))
CI_95

# 99% confidence interval
CI_99 <- c(d + q_inf_99*s/sqrt(n), d + q_sup_99*s/sqrt(n))
CI_99

library(RColorBrewer)
plotclr <- brewer.pal(6, "YlOrRd")
par(mfrow = c(1, 2))

# Density of the pivotal quantity (Student's t with 8 df), shading the
# region between the endpoints that encloses 95% of the probability
curve(dt(x, 8), xlim = c(-6, 6), ylim = c(0, 0.4), main = "",
      col = "blue", lwd = 3, xlab = "t", ylab = expression(t[8]), yaxs = "i")
cord.x <- c(q_inf_95, seq(q_inf_95, q_sup_95, 0.01), q_sup_95)
cord.y <- c(0, dt(seq(q_inf_95, q_sup_95, 0.01), 8), 0)
polygon(cord.x, cord.y, col = plotclr[3], border = NA)

# Density of the pivotal quantity (Student's t with 8 df), shading the
# region between the endpoints that encloses 99% of the probability
curve(dt(x, 8), xlim = c(-6, 6), ylim = c(0, 0.4), main = "",
      col = "blue", lwd = 3, xlab = "t", ylab = expression(t[8]), yaxs = "i")
cord.x2 <- c(q_inf_99, seq(q_inf_99, q_sup_99, 0.01), q_sup_99)
cord.y2 <- c(0, dt(seq(q_inf_99, q_sup_99, 0.01), 8), 0)
polygon(cord.x2, cord.y2, col = plotclr[5], border = NA)

######################
# Hypothesis testing #
######################

#############################################
# Application: test for the mean difference #
#############################################
Followers_M <- c(135, 118, 113, 107, 98, 86)
Followers_O <- c(123, 110, 106, 103, 91, 89, 89, 62, 58)

## By hand
n1 <- length(Followers_M)
n2 <- length(Followers_O)
mean_M <- mean(Followers_M)
mean_O <- mean(Followers_O)

# Pooled variance: see Maindonald and Braun, "Data Analysis and Graphics
# Using R: An Example-Based Approach", pages 104 and 67
v <- (var(Followers_M)*(n1 - 1) + var(Followers_O)*(n2 - 1))/(n1 + n2 - 2)
v
test.stat <- (mean_M - mean_O)/sqrt(v*(1/n1 + 1/n2))
test.stat
pt(test.stat, 13, lower.tail = FALSE)  # df = n1 + n2 - 2 = 13

# By using the t.test() function
t <- t.test(Followers_M, Followers_O, mu = 0,
            alternative = "greater", var.equal = TRUE)
t
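# Aside (not part of the original lab): dropping the equal-variance
# assumption, the same call with var.equal = FALSE gives Welch's t-test,
# which approximates the degrees of freedom (Welch-Satterthwaite).
t_welch <- t.test(Followers_M, Followers_O, mu = 0,
                  alternative = "greater", var.equal = FALSE)
t_welch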
par(mfrow = c(1, 1))

# Plot of the rejection region and of the observed test statistic
curve(dt(x, 13), xlim = c(-5, 5), ylim = c(0, 0.4),
      main = "p-values and rejection region", col = "blue", lwd = 2,
      xlab = "x-y", ylab = expression(t[13]), yaxs = "i")
cord.x <- c(qt(0.95, 13), seq(qt(0.95, 13), 5, 0.01), 5)
cord.y <- c(0, dt(seq(qt(0.95, 13), 5, 0.01), 13), 0)
polygon(cord.x, cord.y, col = plotclr[3], border = NA)
abline(v = t$statistic, lty = 2, lwd = 2, col = "red")
text(0, 0.2, expression(paste("Accept ", H[0])))
text(2.7, 0.08, expression(paste("Reject ", H[0])))
text(as.double(t$statistic) - 0.15, 0.02, "t", col = "red", cex = 1.2)

###################################
# Pearson's chi-squared test: GOF #
###################################
# Initial settings:
# number of attempts n = 50
# number of zones K = 4
# vector of probabilities p = c(7/16, 5/16, 3/16, 1/16)
n <- 50
K <- 4
p <- c(7/16, 5/16, 3/16, 1/16)

# Generate the values: suppose we are sampling from p
set.seed(1234)
x <- sample(1:K, n, replace = TRUE, prob = p)

# Observed (absolute) frequencies
observed <- table(x)
observed

# Expected (absolute) frequencies
expected <- n*p
expected

# Observed test statistic
X2 <- sum((observed - expected)^2/expected)
X2

# Manually compute the p-value
pchisq(X2, df = K - 1, lower.tail = FALSE)

# Same result with the chisq.test() function
chisq.test(observed, p = p)

# Suppose we now generate according to a different vector of probabilities
p2 <- c(5/16, 3/16, 6/16, 2/16)
x_after_energy_drink <- sample(1:K, n, replace = TRUE, prob = p2)
new_observed <- table(x_after_energy_drink)
new_observed

# New test after the energy drink
chisq.test(new_observed, p = p)

###########################################
# Pearson's chi-squared test: homogeneity #
###########################################
# Initial settings:
# number of attempts n = 50
# number of zones K = 4
# number of friends M = 6
n <- 50
K <- 4
M <- 6
x <- matrix(0, M, n)
p <- c(7/16, 5/16, 3/16, 1/16)

# Generate the values
for (m in 1:M){
  x[m, ] <- sample(1:K, n, replace = TRUE, prob = p)
}
# One column of counts per friend (K x M matrix)
observed_matrix <- apply(x, 1, table)
# With a matrix of counts, chisq.test() performs the test of
# homogeneity/independence and the p argument is ignored
chisq.test(observed_matrix, p = p)

# Add a great player to the games
M <- 7
n <- 50
K <- 4
x <- matrix(0, M, n)
p <- c(7/16, 5/16, 3/16, 1/16)
set.seed(123)
# The 6 friends
for (m in 1:(M - 1)){
  x[m, ] <- sample(1:K, n, replace = TRUE, prob = p)
}
# The great player
x[M, ] <- sample(1:K, n, replace = TRUE, prob = c(1/16, 3/16, 5/16, 7/16))
observed_matrix <- apply(x, 1, table)
observed_matrix

# Test of homogeneity
chisq.test(observed_matrix, p = p)

#############################
# Weak law of large numbers #
#############################
law_large_numb <- function(n, p){
  # Relative frequency of successes in n Bernoulli(p) trials
  x <- rbinom(n, 1, p)
  return(mean(x))
}
law_large_numb <- Vectorize(law_large_numb)

# Plot
set.seed(12345)
par(mfrow = c(1, 1))
curve(law_large_numb(x, p = 0.5), from = 1, to = 1000,
      xlab = "n", ylab = "frequency")
abline(a = 0.5, b = 0, lwd = 2, col = "red")
legend(80, 0.9, c("X/n", "p"), lty = 1, col = c("black", "red"), lwd = c(1, 2))

# Another plot, obtained by increasing the n argument of curve()
# (the number of points at which the function is evaluated)
par(mfrow = c(1, 1))
curve(law_large_numb(x, p = 0.5), n = 1000, from = 1, to = 1000,
      xlab = "n", ylab = "frequency")
abline(a = 0.5, b = 0, lwd = 2, col = "red")
legend(80, 0.9, c("X/n", "p"), lty = 1, col = c("black", "red"), lwd = c(1, 2))
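# Aside (not part of the original lab): a sketch that overlays a few
# independent sample paths of the running frequency, so the path-to-path
# variability can be seen shrinking as n grows.
set.seed(12345)
n_max <- 1000
paths <- sapply(1:5, function(k) cumsum(rbinom(n_max, 1, 0.5))/(1:n_max))
matplot(paths, type = "l", lty = 1, xlab = "n", ylab = "running frequency")
abline(h = 0.5, lwd = 2, col = "red")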