########################################### # Laboratory 5: GLM - Logistic Regression # ########################################### # Load the data (dataset Wells.csv) # See the structure # See a summary of all variables in the dataset # See the distribution of distance, arsenic and education by switch setwd("C:/Users/Gioia/Desktop/Lab 5") data <- read.csv2("Wells.csv", header=TRUE) str(data) summary(data) par(mfrow=c(1,2)) boxplot(dist ~ switch, data = data, horizontal = TRUE, xlab = "Distance (in meters)", ylab = "Switch") boxplot(arsenic ~ switch, data = data, horizontal = TRUE, xlab = "Arsenic (in hundreds of mg per liter)", ylab = "Switch") par(mfrow=c(1,1)) barplot(prop.table(table(data$switch, data$educ), margin = 1), beside=TRUE, legend.text = c("no switch", "switch"), xlab="Education (years)") # Fit the logistic regression model with one predictor (distance) fit1 <- glm(switch ~ dist , data = data, family = binomial(link = "logit")) summary(fit1) # Probability of switching well for dist = 0 # response function: invlogit <- function(eta) 1/(1+exp(-eta)) invlogit(c(1,0)%*%coef(fit1)) # Alternatively predict(fit1, newdata=data.frame(dist=0), type="response") # Difference on the probability of switch well due to an increase # of one unit on the predictor from the mean invlogit(c(1,mean(data$dist)+1)%*%coef(fit1))- invlogit(c(1,mean(data$dist))%*%coef(fit1)) # Alternatively predict(fit1, newdata=data.frame(dist=mean(data$dist)+1), type="response")- predict(fit1, newdata=data.frame(dist=mean(data$dist)), type="response") # Difference on the probability of switch well due to an increase # of one unit on the predictor from 99th percentile invlogit(c(1,quantile(data$dist,0.99)+1)%*%coef(fit1))- invlogit(c(1,quantile(data$dist,0.99))%*%coef(fit1)) # Consider dist/100 for interpretability purposes and fit the model data$dist100 <- data$dist/100 fit2 <- glm(switch ~ dist100 , data = data, family = binomial(link = "logit")) summary(fit2) # Difference on probability scale due to an increase of 1 unit # on the predictor dist100 from the mean invlogit(c(1,mean(data$dist100)+1)%*%coef(fit2))- invlogit(c(1,mean(data$dist100))%*%coef(fit2)) # It is the same difference due to an increase of 100 unit on the predictor # dist from the mean (i.e. considering fit1) invlogit(c(1,mean(data$dist)+100)%*%coef(fit1))- invlogit(c(1,mean(data$dist))%*%coef(fit1)) # Plot the fitted model switch.jitter <- jitter(data$switch, factor=0.10) par(mfrow=c(1,1)) with(data, plot(dist, switch.jitter, xlab="Distance (in meters) to nearest safe well", ylab = "Pr(Swiching)", type = "n")) curve (invlogit(coef(fit1)[1]+coef(fit1)[2]*x), lwd=1, add=TRUE) points(data$dist, switch.jitter, pch=20, cex=.1) # Adding predictors # See the histogram of arsenic levels par(mfrow=c(1,1)) hist(data$arsenic, freq = TRUE, xlab = "Arsenic concentration in well water", breaks = seq(0, 0.25+max(data$arsenic), 0.25), main = "") length(which(data$arsenic >= 0.5 & data$arsenic <= 0.75)) # Fit the logistic regression model with two predictors (dist100 and arsenic) fit3 <- glm(switch ~ dist100 + arsenic, data=data, family = binomial("logit")) summary(fit3) # Plot the fitted model # Pr(Switching) vs distance (consider Arsenic = 0.5 and Arsenic = 1) with(data, plot(dist100, switch.jitter, xlab = "Distance (in hundreds meters) to nearest safe well", ylab = "Pr(Switching)", type = "n")) curve(invlogit(coef(fit3)[1]+coef(fit3)[2]*x+coef(fit3)[3]),add=TRUE, col="blue") # Alternatively #curve(invlogit(coef(fit3)%*%c(1,x,1)),add=TRUE, col="blue") curve(invlogit(coef(fit3)[1]+coef(fit3)[2]*x+coef(fit3)[3]*0.5),add=TRUE, col="red") points(data$dist, switch.jitter, pch=20, cex=.1) text (0.50, .27, "if As = 0.5", adj=0, cex=.8, col = "red") text (0.75, .50, "if As = 1.0", adj=0, cex=.8, col = "blue") #Alternatively (on the meter-based distance scale) with(data, plot(dist, switch.jitter, xlab = "Distance (in meters) to nearest safe well", ylab = "Pr(Switching)", type = "n")) curve(invlogit(coef(fit3)[1]+coef(fit3)[2]*x/100+coef(fit3)[3]),add=TRUE, col="blue") curve(invlogit(coef(fit3)[1]+coef(fit3)[2]*x/100+coef(fit3)[3]*0.5),add=TRUE, col="red") points(data$dist, switch.jitter, pch=20, cex=.1) text (50, .27, "if As = 0.5", adj=0, cex=.8, col = "red") text (75, .50, "if As = 1.0", adj=0, cex=.8, col = "blue") # Pr(Switching) vs Arsenic (Consider distance = 0 and distance = 50) with(data, plot(arsenic, switch.jitter, xlab = "Arsenic concentration (in hundreds mg per liter) in well water", ylab = "Pr(Switching)", type = "n")) curve(invlogit(coef(fit3)[1]+coef(fit3)[3]*x),add=TRUE, col="red") text(1.5,0.8,"if dist=0", col="red") curve(invlogit(coef(fit3)[1]+coef(fit3)[2]*0.5+coef(fit3)[3]*x),add=TRUE, col="blue") text(4,0.6,"if dist=50", col="blue") points(data$arsenic, switch.jitter, cex=0.01, pch=20) # Logistic regression with interactions fit4 <- glm(switch ~ dist100 + arsenic + dist100:arsenic, data = data, family = binomial(link = "logit")) # Equivalently (by using *) fit4 <- glm(switch ~ dist100*arsenic, data = data, family = binomial(link = "logit")) summary(fit4) # Probability of switching for an average distance and # an average concentration of arsenic invlogit(c(1,mean(data$dist100), mean(data$arsenic), mean(data$dist100)*mean(data$arsenic))%*%coef(fit4)) #Equivalently predict(fit4, newdata=data.frame(dist100 = mean(data$dist100), arsenic = mean(data$arsenic)), type = "response") # Difference in probability due to an increase of 100 meters # in the distance (from the mean), helding arsenic at its mean invlogit(c(1,mean(data$dist100)+1, mean(data$arsenic), (mean(data$dist100)+1)*mean(data$arsenic))%*%coef(fit4))- invlogit(c(1,mean(data$dist100), mean(data$arsenic), mean(data$dist100)*mean(data$arsenic))%*%coef(fit4)) # Difference in probability due to an increase of 1 unit # in the arsenic (from the mean), helding distance at its mean invlogit(c(1,mean(data$dist100),mean(data$arsenic)+1, mean(data$dist100)*(mean(data$arsenic)+1))%*% coef(fit4))- invlogit(c(1,mean(data$dist100),mean(data$arsenic), mean(data$dist100)*mean(data$arsenic))%*%coef(fit4)) # Difference in logit probability scale when dist increases by 100 meters, # considering arsenic at its mean level coef(fit4)[2] + coef(fit4)[4]*mean(data$arsenic) # Difference in logit probability scale when arsenic increases by 1 unit, # considering dist at its mean level coef(fit4)[3] + coef(fit4)[4]*mean(data$dist100) # Centering the input variables data$c.dist100 <- scale(data$dist100, scale = FALSE) data$c.arsenic <- scale(data$arsenic, scale = FALSE) # Refitting the model with centered inputs fit5 <- glm(switch ~ c.dist100 + c.arsenic + c.dist100:c.arsenic, family = binomial(link = "logit"), data = data) summary(fit5) # Pr(Switching) vs Distance (Consider Arsenic=0.5 and Arsenic = 1) with(data, plot(data$dist, switch.jitter, type = "n", xlab = "Distance(in meters) to nearest safe well", ylab = "Pr(Switching)")) curve(invlogit(cbind(1, x/100, 1, x/100)%*%coef(fit5)), add=TRUE, col="red") curve(invlogit(cbind(1, x/100, 0.5, 0.5*x/100)%*%coef(fit5)), add=TRUE, col="blue") text(100,0.6,"if As = 1", col="red") text(35,0.4,"if As = 0.5", col="blue") points(data$dist, switch.jitter, pch=20, cex=0.01) # Pr(Switching) vs Arsenic (Consider Distance = 0 and Distance = 50) with(data, plot(data$arsenic, switch.jitter, type = "n", xlab = "Arsenic concentration (in hundreds of mg per liter) in well water", ylab = "Pr(Switching)")) curve(invlogit(cbind(1, 0, x, 0)%*%coef(fit5)), add=TRUE, col="red") curve(invlogit(cbind(1, 0.5, x, 0.5*x)%*%coef(fit5)), add=TRUE, col="blue") text(2,0.9,"if Dist = 0", col="red") text(3,0.6,"if Dist = 50", col="blue") points(data$arsenic, switch.jitter, pch=20, cex=0.01) # Adding social predictor to the model # divide education by 4 and centering it data$c.educ4 <- scale(data$educ/4, scale = FALSE) fit6 <- glm(switch ~ c.dist100*c.arsenic + c.educ4, data = data, family = binomial(link = "logit")) summary(fit6) # Consider also the interaction between education and distance and # between education and arsenic fit7 <- glm(switch ~ c.dist100*c.arsenic + c.dist100*c.educ4 + c.arsenic * c.educ4, data = data, family = binomial(link = "logit")) summary(fit7) # Equivalently fit7 <- glm(switch ~ c.dist100 + c.arsenic + c.educ4 + c.dist100:c.arsenic + c.arsenic:c.educ4 + c.dist100:c.educ4, data = data, family = binomial(link = "logit")) summary(fit7) # But not c.dist100*c.arsenic*c.educ4 # Residual plot: wrong pred7 <- predict(fit7, type="response") # or # pred7 <- fit7$fitted.values plot(c(0,1),c(-1,1), xlab = "Estimated Pr(Switching)", type = "n", ylab = "Observed - Estimated", main = "Residual Plot") abline(h = 0, col = "gray", lwd = 0.5) points(pred7, data$switch - pred7, pch = 20, cex = 0.2) ## Binned Residual Plot binned.resids <- function(x, y, nclass = sqrt(length(x))){ breaks.index <- floor(length(x)*(1:(nclass))/nclass) breaks <- c(-Inf, sort(x)[breaks.index], Inf) output <- NULL xbreaks <- NULL x.binned <- as.numeric(cut(x, breaks)) for(i in 1:nclass){ items <- (1:length(x))[x.binned == i] x.range <- range(x[items]) xbar <- mean(x[items]) ybar <- mean(y[items]) n <- length(items) sdev <- sd(y[items]) output <- rbind(output, c(xbar, ybar, n, x.range, 2*sdev/sqrt(n))) } colnames(output) <- c("xbar", "ybar", "n", "x.lo", "x.hi", "2se") return(list(binned = output, xbreaks = xbreaks)) } # Little changes w.r.t to the binned.resids function of the arm package # library("arm") # arm::binned.resids # Binned residuals vs Estimated probability of Switching br7 <- binned.resids(pred7, data$switch - pred7, nclass = 50)$binned plot(range(br7[,1]), range(br7[,2],br7[,6], -br7[,6]), xlab = "Estimated Pr(Switching)", ylab = "Average residual", type = "n", main = "Binned Residual Plot") abline(h = 0, col = "gray", lwd = 0.5) lines(br7[,1], br7[,6], col = "gray", lwd = 0.5) lines(br7[,1], -br7[,6], col = "gray", lwd = 0.5) points(br7[,1], br7[,2], pch = 19, cex = 0.5) # Plot of the binned residuals vs input of interest # distance br.dist <- binned.resids(data$dist, data$switch - pred7, nclass = 40)$binned plot(range(br.dist[,1]), range(br.dist[,2], br.dist[,6], -br.dist[,6]), xlab = "Distance to the nearest safe well", ylab = "Average residual", type = "n", main = "Binned residual plot") abline(h = 0, col = "gray", lwd = 0.5) lines(br.dist[,1], br.dist[,6], col = "gray", lwd = 0.5) lines(br.dist[,1], -br.dist[,6], col = "gray", lwd = 0.5) points(br.dist[,1], br.dist[,2], pch = 19, cex = 0.5) # Arsenic br.arsenic <- binned.resids(data$arsenic, data$switch - pred7, nclass = 40)$binned plot(range(br.arsenic[,1]), range(br.arsenic[,2], br.arsenic[,6], -br.arsenic[,6]), xlab = "Arsenic concentration", ylab = "Average residual", type = "n", main = "Binned residual plot") abline(h = 0, col = "gray", lwd = 0.5) lines(br.arsenic[,1], br.arsenic[,6], col = "gray", lwd = 0.5) lines(br.arsenic[,1], -br.arsenic[,6], col = "gray", lwd = 0.5) points(br.arsenic[,1], br.arsenic[,2], pch = 19, cex = 0.5) # Error rate # Error rate of the Null Model fit0 <- glm(switch ~ 1, family = binomial(link = "logit"), data = data) coef(fit0) invlogit(coef(fit0)) m0 <- mean(data$switch) err.rate0 <- min(m0, 1 - m0) # Alternatively err.rate0 <- mean((rep(1, length(data$switch)) > 0.5 & data$switch ==0) | ((rep(1, length(data$switch)) < 0.5 & data$switch ==1 ))) # Error rate of Model 7 err.rate7 <- mean((pred7 > 0.5 & data$switch ==0) | ((pred7 < 0.5 & data$switch ==1 ))) table(data$switch,ifelse(pred7>0.5,1,0)) (845+288)/3020 # Separation in logistic regression library("arm") #To use the invlogit, display functions of arm package set.seed(2) # Simulate some data n <- 100 x1 <- rnorm(n) x2 <- rbinom(n, 1, 0.5) # Fix the true paramater value b0 <- 1 b1 <- 1.5 b2 <- 2 y <- rbinom(n, 1, arm::invlogit(b0 + b1*x1 + b2*x2)) # Fit the Logistic regression model M1 <- glm(y ~ x1 + x2, family = binomial(link = "logit")) display(M1) # Plot the fitted logistic curve when the binary predictor is equal to 0 and 1 # x2 == 0 curve(arm::invlogit(coef(M1)[1] + coef(M1)[2]*x + coef(M1)[3]*0), from=-5,to=5, ylab="invlogit", xlab="x1") # x2 == 1 curve(arm::invlogit(coef(M1)[1] + coef(M1)[2]*x + coef(M1)[3]*1), from=-5,to=5, add=TRUE, lty = 2) # Create quasi separation # Assign the value 1 to all ys when x2 = 1 y <- ifelse(x2 == 1, 1, y) # Fit the model M2 <- glm(y ~ x1 + x2, family = binomial(link = "logit")) display(M2) # Plot the fitted logistic curve when the binary predictor is equal to 0 and 1 # x2 == 0 curve(arm::invlogit(coef(M2)[1] + coef(M2)[2]*x + coef(M2)[3]*0), from=-15,to=5, ylab="invlogit", xlab="x1") # x2 == 1 curve(arm::invlogit(coef(M2)[1] + coef(M2)[2]*x + coef(M2)[3]*1), from=-15,to=5, add=TRUE, lty = 2) # Create quasi separation # Assign y = 1 when x1 > 0 y <- ifelse(x1>0, 1, 0) # Fit the model with only x1 as predictor M3 <- glm(y ~ x1, family = binomial(link = "logit")) display(M3) # Plot the fitted logistic curve curve(arm::invlogit(coef(M3)[1] + coef(M3)[2]*x), from=-5,to=5)