# On the Normal distribution curve(dnorm(x, mean = 100, sd = 10), xlim = c(70, 130), ylab = "density") abline(v = 100, lty = "dashed", col = "red") abline(v = c(90, 110), lty = "dashed", col = "blue") # Density functions: location shift curve(dnorm(x, mean = -5, sd = 1), xlim = c(-10, 15), ylab = "density") curve(dnorm(x, mean = 0, sd = 1), add = TRUE, col = "darkgreen") curve(dnorm(x, mean = 5, sd = 1), add = TRUE, col = "gold") curve(dnorm(x, mean = 10, sd = 1), add = TRUE, col = "darkred") # Density functions: scaling curve(dnorm(x, mean = 0, sd = 0.5), xlim = c(-10, 10), ylab = "density") curve(dnorm(x, mean = 0, sd = 1), add = TRUE, col = "darkgreen") curve(dnorm(x, mean = 0, sd = 2), add = TRUE, col = "gold") curve(dnorm(x, mean = 0, sd = 5), add = TRUE, col = "darkred") # Density functions: location shift and scaling curve(dnorm(x, mean = 0, sd = 1), xlim = c(-10, 10), ylab = "density") curve(dnorm(x, mean = 0, sd = 2), add = TRUE, col = "darkgreen") curve(dnorm(x, mean = 5, sd = 1), add = TRUE, col = "gold") curve(dnorm(x, mean = 5, sd = 2), add = TRUE, col = "darkred") # Cumulative distribution: location shift and scaling curve(pnorm(x, mean = 0, sd = 1), xlim = c(-10, 10), ylab = "cumulative distribution function") curve(pnorm(x, mean = 0, sd = 2), add = TRUE, col = "darkgreen") curve(pnorm(x, mean = 5, sd = 1), add = TRUE, col = "gold") curve(pnorm(x, mean = 5, sd = 2), add = TRUE, col = "darkred") # The dnorm function: graphical check curve(dnorm(x, mean = 100, sd = 10), xlim = c(80,120), ylab = "density") dnorm(110, mean = 100, sd = 10) y <- dnorm(110, mean = 100, sd = 10) segments(x0 = 110, y0 = 0, x1 = 110, y1 = y, col = "red", lwd = 2) segments(x0 = 80, y0 = y, x1 = 110, y1 = y, col = "red", lwd = 2) points(110, y, col = "red", pch = 19) # The pnorm function: graphical check curve(pnorm(x, mean = 100, sd = 10), xlim = c(80,120), ylab = "cumulative distribution function") pnorm(110, mean = 100, sd = 10) y <- pnorm(110, mean = 100, sd = 10) segments(x0 = 110, y0 = 0, x1 = 110, y1 = y, col = "red", lwd = 2) segments(x0 = 80, y0 = y, x1 = 110, y1 = y, col = "red", lwd = 2) points(110, y, col = "red", pch = 19) # The qnorm function: graphical check curve(pnorm(x, mean = 100, sd = 10), xlim = c(80,120), ylab = "cumulative distribution function") qnorm(c(0.1,0.5,0.9), mean = 100, sd = 10) x <- qnorm(c(0.1,0.5,0.9), mean = 100, sd = 10) segments(x0 = c(0,0,0), y0 = c(0.1, 0.5, 0.9), x1 = x, y1 = c(0.1, 0.5, 0.9), col = "red", lwd = 2) segments(x0 = x, y0 = 0, x1 = x, y1 = c(0.1, 0.5, 0.9), col = "red", lwd = 2) # The rnorm funtion: graphical check set.seed(14) sim <- rnorm(1000, mean = 170, sd = 10) hist(sim, breaks = 40, freq = FALSE) curve(dnorm(x, mean = 170, sd = 10), add = TRUE) # Confidence Intervals mu <- 5; sigma <- 2; B <- 1000; n <- 10; alpha <- 0.05 CI <- matrix(0, B, 2) l <- rep(0, B) set.seed(23) q0975 <- qt(1 - alpha/2, n - 1) for (i in 1 : B){ x <- rnorm(n, mu, sigma) CI[i, ] <- mean(x) + c(-q0975, q0975) * sd(x)/sqrt(n) l[i] <- (mu > CI[i,1] & mu < CI[i,2]) } #Empirical coverage probability mean(l) # Plot of the (first 100) CIs plot(1, xlim = c(0,10), ylim = c(0,11), type = "n", xlab = expression(mu), ylab = "", yaxt = "n", main = paste("100 IC for the mean (unknown variance)"), cex.main = 1.2) abline(v = mu) d <- 0 for (i in 1 : 100){ d <- d + 0.1 lines(seq(CI[i, 1], CI[i, 2], length = 100), rep(d, 100), col = (l[i] + 1)) } # number of intervals (out the 100) including the true parameter value sum(l[1 : 100]) # Example of hypothesis test y <- c(497.0621, 497.7690, 497.2631, 498.6688, 494.6227, 496.2185, 498.2635, 502.7199, 497.4339, 494.2883) t_obs <- (mean(y) - 500)/(sd(y)/sqrt(length(y))) curve(dt(x, 9), xlim = c(-5, 5), ylim = c(0, 0.4), main = "Rejection region", col = "blue", lwd = 2, xlab = "", ylab = expression(t[9]), yaxs="i") cord.x0 <- c(-5, seq(-5, qt(0.025, 9), 0.01), qt(0.025, 9)) cord.y0 <- c(0, dt(seq(-5,qt(0.025, 9), 0.01), 9), 0) cord.x1 <- c(qt(0.975, 9),seq(qt(0.975, 9), 5, 0.01), 5) cord.y1 <- c(0, dt(seq(qt(0.975, 9), 5, 0.01), 9), 0) polygon(cord.x0, cord.y0, col = "darkred", border = NA ) polygon(cord.x1, cord.y1, col = "darkred", border = NA ) abline(v = t_obs, lty = 2, lwd = 2, col = "red") text(0, 0.2, paste("Accept", expression(H0))) text(c(-2.7,2.7), 0.08, paste("Reject", expression(H0)), col = "darkred") text(as.double(t_obs) - 0.15, 0.02, "t", col = "red", cex = 1.2) curve(dt(x, 9), xlim = c(-5, 5), ylim = c(0, 0.4), main = "P-value", col = "blue", lwd = 2, xlab = "", ylab = expression(t[9]), yaxs="i") cord.x0 <- c(-5, seq(-5, t_obs, 0.01), t_obs) cord.y0 <- c(0, dt(seq(-5,t_obs, 0.01), 9), 0) polygon(cord.x0, cord.y0, col = "darkgreen", border = NA ) abline(v = t_obs, lty = 2, lwd = 2, col = "red") text(as.double(t_obs) - 0.15, 0.02, "t", col = "red", cex = 1.2) # Likelihood optimization: normal model nllik <- function(par, data) { mu <- par[1] sigma2 <- par[2] n <- length(data) nll <- 0.5 * n * log(sigma2) + sum((data - mu)^2) / (2 * sigma2) return(nll) } # Generate some data set.seed(13) n <- 50 x <- rnorm(n, mean = 10, sd = 2) # Set the starting values of the algorithm start_vals <- c(mu = 0, sigma = 1) # Fit the model fit <- nlminb(start = start_vals, objective = nllik, data = x, lower = c(-Inf, 1e-6)) # Estimated parameters fit$par # Check mean(x) var(x)*(n-1)/n # Likelihood optimization: linear regression model nllik_lm <- function(par, x, y) { beta0 <- par[1] beta1 <- par[2] sigma2 <- par[3] mu <- beta0 + beta1 * x n <- length(y) nll <- 0.5 * n * log(sigma2) + sum((y - mu)^2) / (2 * sigma2) return(nll) } # Generate data n <- 100 x <- runif(n, 0, 10) beta1 <- 2; beta2 <- 0.8; sigma <- 1.5 y <- beta1 + beta2 * x + rnorm(n, 0, sigma) plot(x,y) # Starting values start_vals <- c(beta1 = 0, beta2 = 0, sigma2 = 1) # Fit fit <- nlminb( start = start_vals, objective = nllik_lm, x = x, y = y, lower = c(-Inf, -Inf, 1e-6) ) # LM estimates fit$par # Check fitLM <- lm(y ~ x) fitLM$coefficients # sigma2 estimate sum(fitLM$residuals^2)/n # Test for the nullity of the single coefficients library(ISLR) data("Credit") lmFit <- lm(Balance ~ Limit + Student, data = Credit) summary(lmFit)$coefficients X <- model.matrix(Balance ~ Limit + Student, data = Credit) n <- nrow(X) p <- ncol(X) hat_s2 <- sum(residuals(lmFit)^2)/(nrow(Credit) - p) # observed test statistics tval <- lmFit$coefficients/(sqrt(hat_s2 * diag(solve(t(X) %*% X)))) tval # p-values 2*pt(-abs(tval), n - p) # Test for the nullity of all the regression coefficients hat_sigma2 <- sum(residuals(lmFit)^2)/n tilde_sigma2 <- var(Credit$Balance)*(n-1)/n # observed test statistics fval <- ((tilde_sigma2 - hat_sigma2)/(p-1))/((hat_sigma2)/(n-p)) fval summary(lmFit)$fstatistic # p-value pf(fval, p-1, n-p, lower = FALSE) summary(lmFit)