#========================================================
# DIAGNOSTIC TEST PERFORMANCE USING PRE-SPECIFIED CUT-OFFS
# Example: Creatine kinase (CK) for myocardial infarction
#========================================================


# Load required package
library(readxl)

# Import the dataset
df <- read_xlsx("CK.xlsx")

# Inspect the first rows of the dataset
head(df)

#--------------------------------------------------------
# Function to calculate diagnostic performance measures
# given a binary test result and the true disease status
#--------------------------------------------------------
compute_metrics <- function(test_result, disease_status) {
  
  # Create the confusion matrix
  conf_mat <- table(Test = test_result, Disease = disease_status)
  print(conf_mat)
  
  # Extract the four core quantities from the confusion matrix
  # TP = true positives
  # FN = false negatives
  # FP = false positives
  # TN = true negatives
  
  TP <- conf_mat["1", "infarction"]
  FN <- conf_mat["0", "infarction"]
  FP <- conf_mat["1", "angina"]
  TN <- conf_mat["0", "angina"]
  
  # Total number of subjects
  total <- sum(conf_mat)
  
  # Calculate diagnostic performance measures
  sensitivity <- TP / (TP + FN)
  specificity <- TN / (TN + FP)
  accuracy    <- (TP + TN) / total
  PPV         <- TP / (TP + FP)
  NPV         <- TN / (TN + FN)
  
  # Return the results as a data frame
  results <- data.frame(
    Metric = c("Sensitivity", "Specificity", "Accuracy", "PPV", "NPV"),
    Value  = c(sensitivity, specificity, accuracy, PPV, NPV)
  )
  
  return(results)
}

#--------------------------------------------------------
# Example 1: Define a binary test using CK >= 100
#--------------------------------------------------------

# Create a binary test variable:
# 1 = positive test, 0 = negative test
df$CK_100 <- as.factor(ifelse(df$CK >= 100, 1, 0))

# Calculate performance measures
results_ck100 <- compute_metrics(test_result = df$CK_100,
                                 disease_status = df$Patologia)

# Display results
print(results_ck100)

#--------------------------------------------------------
# Example 2: Define a binary test using CK >= 80
#--------------------------------------------------------

# Create a second binary test variable using a lower cut-off
df$CK_80 <- as.factor(ifelse(df$CK >= 80, 1, 0))

# Calculate performance measures
results_ck80 <- compute_metrics(test_result = df$CK_80,
                                disease_status = df$Patologia)

# Display results
print(results_ck80)


###########################################################
#========================================================
# ROC ANALYSIS FOR A CONTINUOUS TEST
# Example: sFlt/PlGF ratio for hypertension
#========================================================

# Load required packages
library(readxl)
library(pROC)
library(dplyr)

# Import the dataset
df <- read_excel("Exercise_2.xls")

# Inspect the structure of the dataset
str(df)

#--------------------------------------------------------
# Step 1. Create the ROC curve
#--------------------------------------------------------
# response  = true disease status (must be binary: 0/1)
# predictor = continuous test value

roc_obj <- roc(
  response  = df$hypertension,
  predictor = df$sfltplgf,
  quiet = TRUE
)

#--------------------------------------------------------
# Step 2. Calculate and print the Area Under the Curve
#--------------------------------------------------------
auc_value <- auc(roc_obj)
print(auc_value)

#--------------------------------------------------------
# Step 3. Plot the ROC curve
#--------------------------------------------------------
plot(
  roc_obj,
  legacy.axes = TRUE,
  main = "ROC curve: sFlt/PlGF for hypertension",
  lwd = 3,
  col = "blue"
)


# Add AUC to the plot
legend(
  "bottomright",
  legend = paste("AUC =", round(auc_value, 3)),
  bty = "n"
)

#--------------------------------------------------------
# Step 4. Define a function to calculate performance
# measures for any chosen cut-off
#--------------------------------------------------------
calc_metrics <- function(cutoff, data) {
  
  # Classify subjects according to the chosen threshold
  # 1 = test positive, 0 = test negative
  pred <- ifelse(data$sfltplgf >= cutoff, 1, 0)
  true <- data$hypertension
  
  # Compute confusion matrix components
  TP <- sum(pred == 1 & true == 1, na.rm = TRUE)
  TN <- sum(pred == 0 & true == 0, na.rm = TRUE)
  FP <- sum(pred == 1 & true == 0, na.rm = TRUE)
  FN <- sum(pred == 0 & true == 1, na.rm = TRUE)
  
  # Compute diagnostic performance measures
  sensitivity <- ifelse((TP + FN) > 0, TP / (TP + FN), NA)
  specificity <- ifelse((TN + FP) > 0, TN / (TN + FP), NA)
  PPV         <- ifelse((TP + FP) > 0, TP / (TP + FP), NA)
  NPV         <- ifelse((TN + FN) > 0, TN / (TN + FN), NA)
  accuracy    <- ifelse((TP + TN + FP + FN) > 0,
                        (TP + TN) / (TP + TN + FP + FN), NA)
  
  # Youden index: a common criterion for the optimal cut-off
  youden_index <- sensitivity + specificity - 1
  
  # Return all metrics
  data.frame(
    cutoff = cutoff,
    TP = TP,
    FP = FP,
    TN = TN,
    FN = FN,
    sensitivity = sensitivity,
    specificity = specificity,
    PPV = PPV,
    NPV = NPV,
    accuracy = accuracy,
    youden_index = youden_index
  )
}

#--------------------------------------------------------
# Step 5. Calculate performance measures for all cut-offs
# identified by the ROC procedure
#--------------------------------------------------------
thresholds <- sort(unique(roc_obj$thresholds))

results <- bind_rows(lapply(thresholds, calc_metrics, data = df)) %>%
  distinct() %>%
  arrange(desc(youden_index), desc(accuracy))

# Print the full table
print(results)

#--------------------------------------------------------
# Step 6. Identify the optimal cut-off according to
# the Youden index
#--------------------------------------------------------
best_cutoff_row <- results[which.max(results$youden_index), ]

print(best_cutoff_row)


#--------------------------------------------------------
# Step 7. Alternative way to extract the best cut-off
# directly from the pROC package
#--------------------------------------------------------
best_coords <- coords(
  roc_obj,
  x = "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", "specificity"),
  transpose = FALSE
)

print(best_coords)