# install.packages(c("dplyr", "splines", "readxl", "ggplot2", "pROC", "caret", "ResourceSelection")) library(dplyr) library(splines) library(readxl) library(ggplot2) library(pROC) library(caret) library(ResourceSelection) # Load data ---------------------------------------------------------- df <- read_xlsx("Example1_2.xlsx") df <- na.omit(df) # Make sure outcome is coded 0/1 df$hypertension <- as.numeric(df$hypertension) # Predictive model --------------------------------------------------- model_pred <- glm( hypertension ~ Age + BMI + Female_sex + TotChol, data = df, family = binomial ) summary(model_pred) # Predicted probabilities df$pred_prob <- predict(model_pred, type = "response") # Discrimination: ROC and AUC ---------------------------------------- roc_obj <- roc( response = df$hypertension, predictor = df$pred_prob, levels = c(0, 1), direction = "<" ) auc_value <- auc(roc_obj) auc_value # ROC curve: sensitivity vs false positive rate roc_df <- data.frame( sensitivity = roc_obj$sensitivities, specificity = roc_obj$specificities ) %>% mutate(fpr = 1 - specificity) ggplot(roc_df, aes(x = fpr, y = sensitivity)) + geom_line(linewidth = 1) + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + coord_equal() + labs( x = "False positive rate (1 - specificity)", y = "Sensitivity", title = paste0("ROC curve, AUC = ", round(auc_value, 3)) ) # Classification using threshold = 0.5 ------------------------------- threshold <- 0.5 df$pred_class_05 <- ifelse(df$pred_prob >= threshold, 1, 0) cm_05 <- confusionMatrix( factor(df$pred_class_05, levels = c(0, 1)), factor(df$hypertension, levels = c(0, 1)), positive = "1" ) cm_05 # Interpretation: # With threshold = 0.5, the model has very high specificity, # so it identifies most patients without hypertension, # but only 2% of hypertensive ones. 
# Alternative threshold: Youden index --------------------------------
youden <- coords(
  roc_obj,
  x = "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", "specificity"),
  transpose = FALSE
)
youden

# With transpose = FALSE, coords() returns a one-row data.frame, so
# pull the threshold column directly instead of coercing the whole
# frame with as.numeric().
best_threshold <- youden$threshold
best_threshold

df$pred_class_youden <- ifelse(df$pred_prob >= best_threshold, 1, 0)

cm_youden <- confusionMatrix(
  factor(df$pred_class_youden, levels = c(0, 1)),
  factor(df$hypertension, levels = c(0, 1)),
  positive = "1"
)
cm_youden

# Calibration 1: calibration plot by deciles -------------------------
df$decile <- ntile(df$pred_prob, 10)

calib_deciles <- df %>%
  group_by(decile) %>%
  summarise(
    mean_predicted = mean(pred_prob),
    observed_risk = mean(hypertension),
    n = n(),
    .groups = "drop"
  )
calib_deciles

ggplot(calib_deciles, aes(x = mean_predicted, y = observed_risk)) +
  geom_point(size = 2) +
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  coord_equal() +
  labs(
    x = "Mean predicted probability",
    y = "Observed proportion",
    title = "Calibration plot by deciles",
    subtitle = "Dashed line = perfect calibration"
  )

# Calibration 2: calibration intercept and slope ---------------------
# The calibration intercept indicates whether predictions are
# systematically too high or too low: 0 is the ideal value
# (no systematic bias).
# The calibration slope reflects whether predictions are too extreme
# or too moderate:
#   1  = perfect calibration
#   >1 = underfitting (predictions are too moderate)
#   <1 = overfitting  (predictions are too extreme)
# NOTE(review): fitting intercept and slope jointly gives the
# intercept conditional on the estimated slope; the classical
# calibration-in-the-large intercept fixes the slope at 1 by entering
# logit_pred as an offset — confirm which quantity is intended.

# Truncate predicted probabilities away from exactly 0 and 1 so the
# logit transformation is well-defined.
eps <- 1e-15
df$pred_prob_adj <- pmin(pmax(df$pred_prob, eps), 1 - eps)
df$logit_pred <- qlogis(df$pred_prob_adj)

cal_model <- glm(
  hypertension ~ logit_pred,
  data = df,
  family = binomial
)
summary(cal_model)

calibration_intercept <- coef(cal_model)[1]
calibration_slope <- coef(cal_model)[2]
calibration_intercept
calibration_slope