# install.packages(c("dplyr", "splines", "readxl", "ggplot2", "pROC", "caret", "ResourceSelection"))

library(dplyr)
library(splines)
library(readxl)
library(ggplot2)
library(pROC)
library(caret)
library(ResourceSelection)

# Load data ----------------------------------------------------------

df <- read_xlsx("Example1_2.xlsx")
df <- na.omit(df)

df$hypertension <- as.numeric(df$hypertension)

# Train / Test split -------------------------------------------------

set.seed(123)

trainIndex <- createDataPartition(df$hypertension, p = 0.7, list = FALSE)

train <- df[trainIndex, ]
test  <- df[-trainIndex, ]

# Fit model on TRAIN -------------------------------------------------

model_pred <- glm(
  hypertension ~ Age + BMI + Female_sex + TotChol,
  data = train,
  family = binomial
)

summary(model_pred)

# Predictions on TEST -----------------------------------------------

test$pred_prob <- predict(model_pred, newdata = test, type = "response")

# Discrimination (TEST) ---------------------------------------------

roc_obj <- roc(
  response = test$hypertension,
  predictor = test$pred_prob,
  levels = c(0, 1),
  direction = "<"
)

auc_value <- auc(roc_obj)
auc_value

# ROC curve (Sensitivity vs FPR)

roc_df <- data.frame(
  sensitivity = roc_obj$sensitivities,
  specificity = roc_obj$specificities
) %>%
  mutate(fpr = 1 - specificity)

ggplot(roc_df, aes(x = fpr, y = sensitivity)) +
  geom_line(linewidth = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_equal() +
  labs(
    x = "False positive rate (1 - specificity)",
    y = "Sensitivity",
    title = paste0("ROC curve (TEST), AUC = ", round(auc_value, 3))
  )

# Classification threshold = 0.5 ------------------------------------

threshold <- 0.5

test$pred_class_05 <- ifelse(test$pred_prob >= threshold, 1, 0)

cm_05 <- confusionMatrix(
  factor(test$pred_class_05, levels = c(0, 1)),
  factor(test$hypertension, levels = c(0, 1)),
  positive = "1"
)

cm_05

# Youden threshold (computed on TRAIN!) ------------------------------

roc_train <- roc(train$hypertension, predict(model_pred, type = "response"))

youden <- coords(
  roc_train,
  x = "best",
  best.method = "youden",
  ret = c("threshold"),
  transpose = FALSE
)

best_threshold <- as.numeric(youden["threshold"])
best_threshold

# Apply Youden threshold to TEST -------------------------------------

test$pred_class_youden <- ifelse(test$pred_prob >= best_threshold, 1, 0)

cm_youden <- confusionMatrix(
  factor(test$pred_class_youden, levels = c(0, 1)),
  factor(test$hypertension, levels = c(0, 1)),
  positive = "1"
)

cm_youden

# Calibration (TEST) -------------------------------------------------

# Calibration plot (deciles)

test$decile <- ntile(test$pred_prob, 10)

calib_deciles <- test %>%
  group_by(decile) %>%
  summarise(
    mean_predicted = mean(pred_prob),
    observed_risk = mean(hypertension),
    .groups = "drop"
  )

ggplot(calib_deciles, aes(x = mean_predicted, y = observed_risk)) +
  geom_point(size = 2) +
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_equal() +
  labs(
    x = "Mean predicted probability",
    y = "Observed proportion",
    title = "Calibration plot (TEST)"
  )

# Calibration intercept & slope (TEST) -------------------------------

eps <- 1e-15
test$pred_prob_adj <- pmin(pmax(test$pred_prob, eps), 1 - eps)
test$logit_pred <- qlogis(test$pred_prob_adj)

cal_model <- glm(
  hypertension ~ logit_pred,
  data = test,
  family = binomial
)

summary(cal_model)

calibration_intercept <- coef(cal_model)[1]
calibration_slope <- coef(cal_model)[2]

calibration_intercept
calibration_slope