## IV-PROBIT (Two-Step / Control Function)

library(micsr)
str(federiv)
table(federiv$federiv)/nrow(federiv)

form <- federiv ~ eqrat + optval + bonus +  #suspected to be endogenous
  ltass + linsown + linstown + roe + mktbk + 
  perfor + dealdum + div + year |
  ltass + linsown + linstown + roe + mktbk + 
  perfor + dealdum + div + year + 
  no_emp + no_subs + no_off + ceo_age + gap + cfa # External instruments

bank_2st <- binomreg(form, data = federiv, link = "probit",
                     method = "twosteps")
summary(bank_2st)
bank_2ml <- binomreg(form, data = federiv, link = "probit",
                     method = "ml")
summary(bank_2ml)
endogtest(form, federiv) 

glm1 <- glm(federiv ~ eqrat + optval + bonus +  #suspected to be endogenous
              ltass + linsown + linstown + roe + mktbk + 
              perfor + dealdum + div + year, data = federiv, 
              family = binomial("probit"))
summary(glm1)
help(glm)

# Matching 
tuscany <- twa[twa$region == "Tuscany", ]
table(tuscany$group)

tuscany$perm <- ifelse(tuscany$outcome == "perm", 1, 0)
cbind(control = mean(tuscany$perm[tuscany$group == "control"]),
      treated = mean(tuscany$perm[tuscany$group == "treated"])) 

tuscany$dist2 <- tuscany$dist^2 
tuscany$livselfemp <- (tuscany$city == "livorno") * (tuscany$occup == "selfemp")

ftusc <- perm + group ~ city + sex + marital + age +
  loc + children + educ + pvoto + training +
  empstat + occup + sector + wage + hour + feduc + femp + fbluecol +
  dist + dist2 + livselfemp
ps <- pscore(ftusc, tuscany)

ps$strata[, c(1,5,6,2,3)]

ps

tusc_tr <- ps$model[ps$model$group == "treated", c("id", "pscore")]
colnames(tusc_tr) <- c("id_tr", "ps_tr")

tusc_ctl <- ps$model[ps$model$group == "control", c("id", "pscore")]
colnames(tusc_ctl) <- c("id_ctl", "ps_ctl")

print(head(tusc_tr, 2))

# Merging the two tables via inequality joining 
tusc_tr_2 <- tusc_tr[1:2, ]

result <- data.frame(
  id_tr = tusc_tr_2$id_tr,
  n = sapply(1:2, function(i) {
    sum(tusc_ctl$ps_ctl >= tusc_tr_2$ps_tr[i])
  })
)

print(result)

# Merging the two tables via rolling inequality
id_sup <- ps_sup <- numeric(nrow(tusc_tr))

for (i in seq_len(nrow(tusc_tr))) {
  ps_treated <- tusc_tr$ps_tr[i]
  candidates <- tusc_ctl[tusc_ctl$ps_ctl >= ps_treated, ]
  
  if (nrow(candidates) == 0) {
    id_sup[i] <- NA
    ps_sup[i] <- NA
  } else {
    j <- which.min(candidates$ps_ctl)
    id_sup[i] <- candidates$id_ctl[j]
    ps_sup[i] <- candidates$ps_ctl[j]
  }
}

match_sup <- data.frame(
  id_tr  = tusc_tr$id_tr,
  ps_tr  = tusc_tr$ps_tr,
  id_sup = id_sup,
  ps_sup = ps_sup
)

print(match_sup[1:3, ])

# The same for the closest lower propensity score control 
id_inf <- ps_inf <- numeric(nrow(tusc_tr))

for (i in seq_len(nrow(tusc_tr))) {
  ps_treated <- tusc_tr$ps_tr[i]
  candidates <- tusc_ctl[tusc_ctl$ps_ctl <= ps_treated, ]
  
  if (nrow(candidates) == 0) {
    id_inf[i] <- NA
    ps_inf[i] <- NA
  } else {
    j <- which.max(candidates$ps_ctl)
    id_inf[i] <- candidates$id_ctl[j]
    ps_inf[i] <- candidates$ps_ctl[j]
  }
}

match_inf <- data.frame(
  id_tr  = tusc_tr$id_tr,
  ps_tr  = tusc_tr$ps_tr,
  id_inf = id_inf,
  ps_inf = ps_inf
)

print(match_inf[1:3, ])


# Merge the two tables 
m <- merge(match_sup, match_inf, by = "id_tr", all.x = TRUE)
m$id_ctl <- ifelse(
  is.na(m$ps_inf) | (!is.na(m$ps_sup) & (m$ps_tr.y - m$ps_inf) >= (m$ps_sup - m$ps_tr.x)),
  m$id_sup,
  m$id_inf
)

m$ps_ctl <- ifelse(
  m$id_ctl == m$id_inf,
  m$ps_inf,
  m$ps_sup
)

match_nearest <- m[, c("id_tr", "id_ctl", "ps_tr.x",  "ps_tr.y", "ps_ctl")]
head(match_nearest, 3)


length(unique(match_nearest$id_ctl))

tab <- table(match_nearest$id_ctl)      
tab_sorted <- sort(tab, decreasing = TRUE)  
head(tab_sorted, 2)  


match_nearest$diff <- match_nearest$ps_tr.x - match_nearest$ps_ctl  
match_nearest[order(-match_nearest$diff), ][1:3, ]  

match_caliper <- match_nearest[abs(match_nearest$ps_tr - match_nearest$ps_ctl) < 0.01, ]               

tmp <- match_nearest[, !(names(match_nearest) %in% c("ps_tr", "ps_ctl"))]
id_tr <- tmp$id_tr
id_ctl <- tmp$id_ctl
group_tr <- rep("tr", length(id_tr))
group_ctl <- rep("ctl", length(id_ctl))

match_long <- data.frame(
  id = c(id_tr, id_ctl),
  gp = c(group_tr, group_ctl)
)

match_smpl <- merge(match_long, tuscany[, c("id", "perm")], by = "id", all.x = TRUE)
head(match_smpl, 2)   

means <- tapply(match_smpl$perm, match_smpl$gp, mean)
# ATET
atet <- means["tr"] - means["ctl"]
c(treatment = means["tr"], control = means["ctl"], atet = atet)


# Using the MatchIt library
library(MatchIt)
ftusc2 <- update(ftusc, group ~ .)
mtch <- matchit(ftusc2, tuscany, replace = TRUE)

summary(mtch)$nn

matched_data <- match.data(mtch)
head(matched_data, 2)

mtch_cap <- matchit(ftusc2, tuscany, replace = TRUE,
                    caliper = 0.01)
matched_data <- match.data(mtch_cap)

att <- with(matched_data, mean(perm[group == "treated"]) - mean(perm[group == "control"]))
att

plot(summary(mtch_cap))