# install.packages(c("dplyr", "splines", "readxl", "ggplot2", "pROC", "caret", "ResourceSelection")) library(dplyr) library(splines) library(readxl) library(ggplot2) library(pROC) library(caret) library(ResourceSelection) # Load data ---------------------------------------------------------- df <- read_xlsx("Example1_2.xlsx") df <- na.omit(df) # Make sure outcome is coded 0/1 df$hypertension <- as.numeric(df$hypertension) # Predictive model --------------------------------------------------- model_pred <- glm( hypertension ~ Age + BMI + Female_sex + TotChol, data = df, family = binomial ) summary(model_pred) # Predicted probabilities df$pred_prob <- predict(model_pred, type = "response") # Discrimination: ROC and AUC ---------------------------------------- roc_obj <- roc( response = df$hypertension, predictor = df$pred_prob, levels = c(0, 1), direction = "<" ) auc_value <- auc(roc_obj) auc_value # ROC curve: sensitivity vs false positive rate roc_df <- data.frame( sensitivity = roc_obj$sensitivities, specificity = roc_obj$specificities ) %>% mutate(fpr = 1 - specificity) ggplot(roc_df, aes(x = fpr, y = sensitivity)) + geom_line(linewidth = 1) + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + coord_equal() + labs( x = "False positive rate (1 - specificity)", y = "Sensitivity", title = paste0("ROC curve, AUC = ", round(auc_value, 3)) ) # Classification using threshold = 0.5 ------------------------------- threshold <- 0.5 df$pred_class_05 <- ifelse(df$pred_prob >= threshold, 1, 0) cm_05 <- confusionMatrix( factor(df$pred_class_05, levels = c(0, 1)), factor(df$hypertension, levels = c(0, 1)), positive = "1" ) cm_05 # Interpretation: # With threshold = 0.5, the model has very high specificity, # so it identifies most patients without hypertension, # but only 2% of hypertensive ones. 
# Alternative threshold: Youden index --------------------------------
youden <- coords(
  roc_obj,
  x = "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", "specificity"),
  transpose = FALSE
)
youden

# With transpose = FALSE, coords() returns a one-row data.frame, so
# pull the threshold column directly instead of coercing the whole
# frame with as.numeric().
best_threshold <- youden$threshold
best_threshold

df$pred_class_youden <- ifelse(df$pred_prob >= best_threshold, 1, 0)

cm_youden <- confusionMatrix(
  factor(df$pred_class_youden, levels = c(0, 1)),
  factor(df$hypertension, levels = c(0, 1)),
  positive = "1"
)
cm_youden

# Calibration 1: calibration plot by deciles -------------------------
df$decile <- ntile(df$pred_prob, 10)

calib_deciles <- df %>%
  group_by(decile) %>%
  summarise(
    mean_predicted = mean(pred_prob),
    observed_risk = mean(hypertension),
    n = n(),
    .groups = "drop"
  )
calib_deciles

ggplot(calib_deciles, aes(x = mean_predicted, y = observed_risk)) +
  geom_point(size = 2) +
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_equal() +
  labs(
    x = "Mean predicted probability",
    y = "Observed proportion",
    title = "Calibration plot by deciles",
    subtitle = "Dashed line = perfect calibration"
  )

# Calibration 2: calibration intercept and slope ---------------------
# The calibration intercept indicates whether predictions are
# systematically too high or too low: 0 is the ideal value
# (no systematic bias).
# The calibration slope reflects whether predictions are too extreme
# or too moderate:
#   1  = perfect calibration
#   >1 = underfitting (predictions are too moderate)
#   <1 = overfitting  (predictions are too extreme)
# NOTE(review): fitting intercept and slope jointly gives the
# intercept conditional on the estimated slope; the classical
# calibration-in-the-large intercept fixes the slope at 1 by entering
# logit_pred as an offset — confirm which quantity is intended.

# Truncate predicted probabilities away from exactly 0 and 1 so the
# logit transformation is well-defined.
eps <- 1e-15
df$pred_prob_adj <- pmin(pmax(df$pred_prob, eps), 1 - eps)
df$logit_pred <- qlogis(df$pred_prob_adj)

cal_model <- glm(
  hypertension ~ logit_pred,
  data = df,
  family = binomial
)
summary(cal_model)

calibration_intercept <- coef(cal_model)[1]
calibration_slope <- coef(cal_model)[2]
calibration_intercept
calibration_slope