# install.packages(c("dplyr", "splines", "readxl", "ggplot2", "pROC", "caret", "ResourceSelection")) library(dplyr) library(splines) library(readxl) library(ggplot2) library(pROC) library(caret) library(ResourceSelection) # Load data ---------------------------------------------------------- df <- read_xlsx("Example1_2.xlsx") df <- na.omit(df) df$hypertension <- as.numeric(df$hypertension) # Train / Test split ------------------------------------------------- set.seed(123) trainIndex <- createDataPartition(df$hypertension, p = 0.7, list = FALSE) train <- df[trainIndex, ] test <- df[-trainIndex, ] # Fit model on TRAIN ------------------------------------------------- model_pred <- glm( hypertension ~ Age + BMI + Female_sex + TotChol, data = train, family = binomial ) summary(model_pred) # Predictions on TEST ----------------------------------------------- test$pred_prob <- predict(model_pred, newdata = test, type = "response") # Discrimination (TEST) --------------------------------------------- roc_obj <- roc( response = test$hypertension, predictor = test$pred_prob, levels = c(0, 1), direction = "<" ) auc_value <- auc(roc_obj) auc_value # ROC curve (Sensitivity vs FPR) roc_df <- data.frame( sensitivity = roc_obj$sensitivities, specificity = roc_obj$specificities ) %>% mutate(fpr = 1 - specificity) ggplot(roc_df, aes(x = fpr, y = sensitivity)) + geom_line(linewidth = 1) + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + coord_equal() + labs( x = "False positive rate (1 - specificity)", y = "Sensitivity", title = paste0("ROC curve (TEST), AUC = ", round(auc_value, 3)) ) # Classification threshold = 0.5 ------------------------------------ threshold <- 0.5 test$pred_class_05 <- ifelse(test$pred_prob >= threshold, 1, 0) cm_05 <- confusionMatrix( factor(test$pred_class_05, levels = c(0, 1)), factor(test$hypertension, levels = c(0, 1)), positive = "1" ) cm_05 # Youden threshold (computed on TRAIN!) ------------------------------ roc_train <- roc(train$hypertension, predict(model_pred, type = "response")) youden <- coords( roc_train, x = "best", best.method = "youden", ret = c("threshold"), transpose = FALSE ) best_threshold <- as.numeric(youden["threshold"]) best_threshold # Apply Youden threshold to TEST ------------------------------------- test$pred_class_youden <- ifelse(test$pred_prob >= best_threshold, 1, 0) cm_youden <- confusionMatrix( factor(test$pred_class_youden, levels = c(0, 1)), factor(test$hypertension, levels = c(0, 1)), positive = "1" ) cm_youden # Calibration (TEST) ------------------------------------------------- # Calibration plot (deciles) test$decile <- ntile(test$pred_prob, 10) calib_deciles <- test %>% group_by(decile) %>% summarise( mean_predicted = mean(pred_prob), observed_risk = mean(hypertension), .groups = "drop" ) ggplot(calib_deciles, aes(x = mean_predicted, y = observed_risk)) + geom_point(size = 2) + geom_line() + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + coord_equal() + labs( x = "Mean predicted probability", y = "Observed proportion", title = "Calibration plot (TEST)" ) # Calibration intercept & slope (TEST) ------------------------------- eps <- 1e-15 test$pred_prob_adj <- pmin(pmax(test$pred_prob, eps), 1 - eps) test$logit_pred <- qlogis(test$pred_prob_adj) cal_model <- glm( hypertension ~ logit_pred, data = test, family = binomial ) summary(cal_model) calibration_intercept <- coef(cal_model)[1] calibration_slope <- coef(cal_model)[2] calibration_intercept calibration_slope