# Binary models library(micsr) data(mode_choice) mode_choice$ivtime <- mode_choice$ivtime/60 mode_choice$ovtime <- mode_choice$ovtime/60 mode_choice$cost <- mode_choice$cost/100 mode_choice$cost <- mode_choice$cost * 8.42 mode_choice$gcost <- (mode_choice$ivtime + mode_choice$ovtime) * 8 + mode_choice$cost # linear probability model linprob <- lm(mode ~ gcost, mode_choice) #probit and logit models probit <- glm(mode ~ gcost, mode_choice, family = binomial(link = "probit")) logit <- glm(mode ~ gcost, mode_choice, family = binomial(link = "logit")) # Coefficients of gcost round(summary(linprob)$coefficients[2,],2) round(summary(probit)$coefficients[2,],2) round(summary(logit)$coefficients[2,],2) # Differencese between coefficints also multiplying by 0.4 (probit) and 0.25 (logit) c(coef(linprob)[2], coef(probit)[2] * 0.4, coef(logit)[2] * 0.25) # evaluation at the mean prodictor for the comparison task xmean <- mean(mode_choice$gcost) lprobit <- coef(probit)[1] + coef(probit)[2] * xmean llogit <- coef(logit)[1] + coef(logit)[2] * xmean round(dnorm(lprobit) * coef(probit)[2], 3) round(dlogis(llogit) * coef(logit)[2], 3) coef(linprob)[2] # Plot the fitted curves xlim <- c(-30,30) set.seed(123) y_jitter <- jitter(as.numeric(mode_choice$mode), amount = 0.02) plot(mode_choice$gcost, y_jitter, xlim = xlim, pch = 16, cex = 0.2, xlab = "gcost", ylab = "mode") curve(coef(linprob)[1] + coef(linprob)[2] * x, add = TRUE, lty = 1 ) curve(pnorm(coef(probit)[1] + coef(probit)[2] * x), add = TRUE, lty = 2) curve(plogis(coef(logit)[1] + coef(logit)[2] * x), add = TRUE, lty = 3) legend("topleft", legend = c("linear", "probit", "logit"), lty = c(1, 2, 3), bty = "n") # Airbnb example # For those having problems with the micsr.data call, # you can load the dataset via .csv or .RData; # see below for the two alternative ways # # airbnb <- read.csv("airbnb.csv") # load("airbnb.RData") airbnb <- as.data.frame(micsr.data::airbnb) str(airbnb) mean(as.numeric(airbnb$acceptance) - 1) table(airbnb$acceptance)/nrow(airbnb) apply(is.na(airbnb), 2, sum) airbnb <- na.omit(airbnb) airbnb$acceptance <- ifelse(airbnb$acceptance == "no", 0, 1) str(airbnb) fit_lprob <- lm(acceptance ~ guest_race + I(log(price)) + city, data = airbnb) fit_logit <- glm(acceptance ~ guest_race + I(log(price)) + city, data = airbnb, family = binomial("logit")) fit_probit <- glm(acceptance ~ guest_race + I(log(price)) + city, data = airbnb, family = binomial("probit")) round(cbind(coef(fit_lprob), coef(fit_logit) * 0.25, coef(fit_probit)* 0.4, coef(fit_logit)/coef(fit_probit)),3) # Coefficients comparison round(rbind(coef(fit_logit), coef(fit_probit)),3) # Comparison between the differences in terms of probability pnorm(coef(fit_probit)[1] + coef(fit_probit)[2] + coef(fit_probit)[3] * log(182)) - pnorm(coef(fit_probit)[1] + coef(fit_probit)[3] * log(182)) plogis(coef(fit_logit)[1] + coef(fit_logit)[2] + coef(fit_logit)[3] * log(182)) - plogis(coef(fit_logit)[1] + coef(fit_logit)[3] * log(182)) # Explore the glm function summary(logit) #Deviance logit$deviance logit$df.residual # Equivalently deviance(logit) df.residual(logit) # Null deviance logit$null.deviance logit$df.null summary(glm(mode ~ 1, mode_choice, family = "binomial")) # Residuals head(resid(logit, type = "response")) head(resid(logit, type = "pearson")) head(resid(logit, type = "deviance")) # Predict head(predict(logit, type = "link")) head(predict(logit, type = "response")) head(predict(logit, type = "link", newdata = data.frame(gcost = 5))) head(predict(logit, type = "response", newdata = data.frame(gcost = 5))) # Estimation y <- mode_choice$mode X <- model.matrix(~gcost, data = mode_choice) loglik_logit <- function(beta, X, y) { eta <- X %*% beta p <- exp(eta) / (1 + exp(eta)) sum(y * log(p) + (1 - y) * log(1 - p)) } beta_start <- rep(0, ncol(X)) opt <- optim( par = beta_start, fn = function(b) -loglik_logit(b, X, y), hessian = TRUE, method = "BFGS" ) beta_hat <- opt$par cbind(beta_hat, logit$coefficients) # Standard errors # Via numerical Hessian H_obs <- opt$hessian vcov_obs <- solve(H_obs) se_obs <- sqrt(diag(vcov_obs)) cbind(se_obs, summary(logit)$coefficients[,2]) # Via Fisher Information matrix p_hat <- 1 / (1 + exp(-X %*% beta_hat)) W <- diag(as.vector(p_hat * (1 - p_hat))) Fisher <- t(X) %*% W %*% X vcov_fisher <- solve(Fisher) se_fisher <- sqrt(diag(vcov_fisher)) cbind(se_fisher, summary(logit)$coefficients[,2]) cbind(logLik(logit), loglik_logit(beta_hat, X, y)) # Model comparisons logit2 <- glm(mode ~ cost + ivtime + ovtime, data = mode_choice, family = binomial("logit")) probit2 <- glm(mode ~ cost + ivtime + ovtime, data = mode_choice, family = binomial("probit")) cbind(summary(logit)$coefficients[,1:2], summary(probit)$coefficients[,1:2]) cbind(summary(logit2)$coefficients[,1:2], summary(probit2)$coefficients[,1:2]) cbind(-2*loglik_logit(beta_hat, X, y), logit$deviance) cbind(- 2*loglik_logit(beta_hat, X, y) + 4, AIC(logit)) cbind(logLik(logit), logLik(logit2), logLik(probit), logLik(probit2)) cbind(AIC(logit), AIC(logit2), AIC(probit), AIC(probit2)) cbind(BIC(logit), BIC(logit2), BIC(probit), BIC(probit2)) cbind(logit$deviance, logit2$deviance, probit$deviance, probit2$deviance) cbind(logit$null.deviance, logit2$null.deviance, probit$null.deviance, probit2$null.deviance) # Testing nested models library(lmtest) lrtest(probit, probit2) car::linearHypothesis(probit2, c("ivtime = 8 * cost", "ovtime = 8 * cost")) # Conditional moment test cmtest(probit2, test = "normality") cmtest(probit2, test = "heterosc") cmtest(probit2, test = "reset")