Load the required libraries:

# load packages
library(knitr)
library(tidyverse)
library(lubridate)
library(survival)
library(survminer)
library(gtsummary)
library(ISwR)
library(KMsurv)
library(timeROC)
library(rms)

Load the dataset melanom:

data("melanom", package="ISwR")
str(melanom)

## 'data.frame':    205 obs. of  6 variables:
##  $ no    : int  789 13 97 16 21 469 685 7 932 944 ...
##  $ status: int  3 3 2 3 1 1 1 1 3 1 ...
##  $ days  : int  10 30 35 99 185 204 210 232 232 279 ...
##  $ ulc   : int  1 2 2 2 1 1 1 1 1 1 ...
##  $ thick : int  676 65 134 290 1208 484 516 1288 322 741 ...
##  $ sex   : int  2 2 2 1 2 2 2 2 1 1 ...

head(melanom)

##    no status days ulc thick sex
## 1 789      3   10   1   676   2
## 2  13      3   30   2    65   2
## 3  97      2   35   2   134   2
## 4  16      3   99   2   290   1
## 5  21      1  185   1  1208   2
## 6 469      1  204   1   484   2

Discrimination for the survival data : time-dependent ROC curve

In what follows in order to simplify the steps, we ignore the “training” - “test” usual splitting (or cross-validation) and we use directly the functions on the original dataset, just to show how they works.
First of all we evaluate the discrimination accuracy of the tumor thickness as a prognostic biomarker for death from malignant melanoma:

ROC.thick<-timeROC(T=melanom$days,delta=melanom$status,marker=melanom$thick,cause=1, times=c(1800,2000,2200),iid=TRUE)
ROC.thick

## Time-dependent-Roc curve estimated using IPCW  (n=205, with competing risks). 
##        Cases Survivors Other events Censored AUC_1 (%) se_1 AUC_2 (%) se_2
## t=1800    45       124           36        0     75.39 4.10     76.51 3.85
## t=2000    46       103           56        0     75.60 4.14     76.11 3.82
## t=2200    50        83           72        0     72.55 4.40     74.33 3.81
## 
## Method used for estimating IPCW:marginal 
## 
## Total computation time : 0.12  secs.

Note that since we are here in a competing risk setting (two causes of death, but we are interested in cause=1), we obtain two AUCs estimated each one for a risk. We are now interested in evaluating AUC_1. We can also evaluate the confidence intervals for the estimated AUCs:

confint(ROC.thick)

## $CI_AUC_1
##         2.5% 97.5%
## t=1800 67.36 83.41
## t=2000 67.49 83.72
## t=2200 63.92 81.19
## 
## $CB_AUC_1
##         2.5% 97.5%
## t=1800 66.60 84.17
## t=2000 66.72 84.49
## t=2200 63.11 82.00
## 
## $C.alpha.1
##      95% 
## 2.145463 
## 
## $CI_AUC_2
##         2.5% 97.5%
## t=1800 68.96 84.05
## t=2000 68.63 83.60
## t=2200 66.86 81.79
## 
## $CB_AUC_2
##         2.5% 97.5%
## t=1800 68.36 84.65
## t=2000 68.04 84.19
## t=2200 66.27 82.39
## 
## $C.alpha.2
##      95% 
## 2.115864

Then, we estimate a Cox model by adding sex to the tumor thickness:

mod.cov <- coxph(Surv(days,status==1)~sex+log(thick), data=melanom)
summary(mod.cov)

## Call:
## coxph(formula = Surv(days, status == 1) ~ sex + log(thick), data = melanom)
## 
##   n= 205, number of events= 57 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## sex        0.4580    1.5809   0.2687 1.705   0.0883 .  
## log(thick) 0.7809    2.1834   0.1573 4.963 6.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## sex            1.581     0.6326    0.9337     2.677
## log(thick)     2.183     0.4580    1.6040     2.972
## 
## Concordance= 0.749  (se = 0.033 )
## Likelihood ratio test= 33.45  on 2 df,   p=5e-08
## Wald test            = 31  on 2 df,   p=2e-07
## Score (logrank) test = 32.52  on 2 df,   p=9e-08

We now extract the linear predictor from the Cox model (X*beta): note that the linear predictor of this cause-specific Cox model is evaluated at baseline and it is independent of time.

lin.pred <- predict(mod.cov)

And we now compute the time-dependent discrimination from this model:

ROC.cox <- timeROC(T=melanom$days,delta=melanom$status,marker=lin.pred,cause=1, times=c(1800,2000,2200),iid=TRUE)
ROC.cox

## Time-dependent-Roc curve estimated using IPCW  (n=205, with competing risks). 
##        Cases Survivors Other events Censored AUC_1 (%) se_1 AUC_2 (%) se_2
## t=1800    45       124           36        0     75.76 4.09     76.94 3.85
## t=2000    46       103           56        0     76.17 4.15     76.39 3.83
## t=2200    50        83           72        0     74.33 4.34     75.13 3.75
## 
## Method used for estimating IPCW:marginal 
## 
## Total computation time : 0.1  secs.

along with the estimated confidence intervals:

confint(ROC.cox)

## $CI_AUC_1
##         2.5% 97.5%
## t=1800 67.75 83.77
## t=2000 68.04 84.30
## t=2200 65.82 82.83
## 
## $CB_AUC_1
##         2.5% 97.5%
## t=1800 66.82 84.71
## t=2000 67.09 85.25
## t=2200 64.83 83.82
## 
## $C.alpha.1
##      95% 
## 2.188951 
## 
## $CI_AUC_2
##         2.5% 97.5%
## t=1800 69.40 84.49
## t=2000 68.87 83.90
## t=2200 67.77 82.48
## 
## $CB_AUC_2
##         2.5% 97.5%
## t=1800 69.07 84.82
## t=2000 68.55 84.23
## t=2200 67.45 82.80
## 
## $C.alpha.2
##      95% 
## 2.045094

Now we can compare the AUCs: is there a significant improvement in discrimination by adding sex?

pp1 <- compare(ROC.thick,ROC.cox)
round(pp1$p_values_AUC_1,digits=4)

## t=1800 t=2000 t=2200 
## 0.7713 0.6594 0.1936

No significant increment in discrimination is observed from this test.

Now we add also ulcer in the Cox model but this time as a covariate (not as a strata variable):

mod.cov2 <- coxph(Surv(days,status==1)~sex+log(thick)+ulc, data=melanom)
summary(mod.cov2)

## Call:
## coxph(formula = Surv(days, status == 1) ~ sex + log(thick) + 
##     ulc, data = melanom)
## 
##   n= 205, number of events= 57 
## 
##               coef exp(coef) se(coef)      z Pr(>|z|)   
## sex         0.3813    1.4641   0.2706  1.409  0.15879   
## log(thick)  0.5756    1.7782   0.1794  3.209  0.00133 **
## ulc        -0.9389    0.3911   0.3243 -2.895  0.00379 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## sex           1.4641     0.6830    0.8615    2.4883
## log(thick)    1.7782     0.5624    1.2511    2.5273
## ulc           0.3911     2.5571    0.2071    0.7384
## 
## Concordance= 0.768  (se = 0.032 )
## Likelihood ratio test= 42.66  on 3 df,   p=3e-09
## Wald test            = 36.99  on 3 df,   p=5e-08
## Score (logrank) test = 42.81  on 3 df,   p=3e-09

We now extract also from this model the linear predictor (X*beta):

lin.pred2 <- predict(mod.cov2)

And now we compare the performance of this model with the tumor thickness:

ROC.cox2 <- timeROC(T=melanom$days,delta=melanom$status,marker=lin.pred2,cause=1, times=c(1800,2000,2200),iid=TRUE)
ROC.cox2

## Time-dependent-Roc curve estimated using IPCW  (n=205, with competing risks). 
##        Cases Survivors Other events Censored AUC_1 (%) se_1 AUC_2 (%) se_2
## t=1800    45       124           36        0     78.74 3.95     79.36 3.71
## t=2000    46       103           56        0     78.59 4.06     78.58 3.73
## t=2200    50        83           72        0     75.88 4.27     76.66 3.69
## 
## Method used for estimating IPCW:marginal 
## 
## Total computation time : 0.11  secs.

confint(ROC.cox2)

## $CI_AUC_1
##         2.5% 97.5%
## t=1800 71.00 86.47
## t=2000 70.64 86.54
## t=2200 67.51 84.25
## 
## $CB_AUC_1
##         2.5% 97.5%
## t=1800 69.96 87.51
## t=2000 69.57 87.61
## t=2200 66.38 85.38
## 
## $C.alpha.1
##      95% 
## 2.223397 
## 
## $CI_AUC_2
##         2.5% 97.5%
## t=1800 72.10 86.62
## t=2000 71.27 85.88
## t=2200 69.43 83.89
## 
## $CB_AUC_2
##         2.5% 97.5%
## t=1800 71.25 87.48
## t=2000 70.41 86.74
## t=2200 68.58 84.74
## 
## $C.alpha.2
##      95% 
## 2.189879

pp1 <- compare(ROC.thick,ROC.cox2)
round(pp1$p_values_AUC_1,digits=4)

## t=1800 t=2000 t=2200 
## 0.1473 0.2036 0.1781

Again, no significant difference in discrimination is observed.

Finally, we can plot the estimated time-dependent AUCs:

plotAUCcurve(ROC.thick, FP = 2, add = FALSE,conf.int = FALSE,conf.band = FALSE, col = "black")
plotAUCcurve(ROC.cox, FP = 2, add =TRUE, conf.int = FALSE,conf.band = FALSE, col = "blue")
plotAUCcurve(ROC.cox2, FP = 2, add =TRUE, conf.int = FALSE,conf.band = FALSE, col = "red")
title(main="Time-dependent AUCs")
legend("topleft", legend=c("Tumor thickness","Thick + sex", "Thick + sex+ulc"), lty=c(1,1,1),lwd=c(2,2,2), 
       col=c("black","blue", "red"), bty="n")

We can also fix a specific time-horizon :

plot(ROC.thick,time=2200,add = FALSE, col = "black",title=FALSE, lwd=2)
plot(ROC.cox,time=2200,add = TRUE, col = "blue",title=FALSE, lwd=2)
plot(ROC.cox2,time=2200,add = TRUE, col = "red",title=FALSE, lwd=2)
legend("bottomright", legend=c("Tumor thickness","Thick + sex", "Thick + sex + ulc"), lty=c(1,1,1),lwd=c(2,2,2), 
       col=c("black","blue", "red"), bty="n")
title(main="Time-dependent ROC curves at 2200 days")

Note that discrimination in this context is focused exclusively on the performance of the linear predictor extracted from the Cox model. We are not considering here the survival estimates.

Calibration for the Cox model

Just for didactic reasons, we are in this example IGNORING the competing risk present in these data, and act as we have an unique event.
Therefore the method is illustrated is valid only in the standard survival setting, not in the competing risk setting (more advanced approaches are needed in that case, as for example estimating cumulative incidence curves using the multi-state approach). We now evaluate calibration for the Cox model: first of all, we determine summaries of variables for effect and plotting ranges, values to adjust to, and overall ranges for the dataset Melanom, since this is required in order to use the R functions developed in the rms library. In this example, the boostrap resampling of the original dataset is used to evaluate calibration.

options(datadist='dd')
dd <-datadist(melanom)
dd

##                       no status      days ulc     thick sex
## Low:effect      222.0000      1 1525.0000   1   97.0000   1
## Adjust to       469.0000      2 2005.0000   1  194.0000   1
## High:effect     731.0000      3 3042.0000   2  356.0000   2
## Low:prediction   11.0000      1  294.2195   1   32.0000   1
## High:prediction 943.0488      3 4119.2439   2  838.7805   2
## Low               2.0000      1   10.0000   1   10.0000   1
## High            992.0000      3 5565.0000   2 1742.0000   2
## 
## Values:
## 
## status : 1 2 3 
## ulc : 1 2 
## sex : 1 2

Then, we re-estimate the Cox model by adding some options useful to fix the time horizon to evaluate calibration, in this case fixed at 2000 days:

mod.cov2 <- cph(Surv(days,status==1)~sex+log(thick)+ulc, data=melanom,x=TRUE , y=TRUE,surv=TRUE,time.inc=2000)

Finally, we use the function calibrate by comparing predicted probabilities from the Cox model with the observed ones estimated by the Kaplan-Meier:

calkm <- calibrate(mod.cov2, u=2000, m=40, cmethod= 'KM', B=200)

## Using Cox survival estimates at 2000 Days

calkm

## calibrate.cph(fit = mod.cov2, cmethod = "KM", u = 2000, m = 40, 
##     B = 200)
## 
## 
## n=205  B=200  u=2000 Day
## 
##       index.orig    training         test mean.optimism mean.corrected   n
## [1,]  0.01505666 0.017600887  0.028737147  -0.011136260    0.026192916 195
## [2,]  0.01466150 0.007914552 -0.008432073   0.016346625   -0.001685122 200
## [3,]  0.01195721 0.011739626  0.004060550   0.007679076    0.004278131 200
## [4,] -0.01622951 0.005080958 -0.011494079   0.016575037   -0.032804547 200
## [5,]  0.01160306 0.001673813  0.003804451  -0.002130638    0.013733696 200
##      mean.predicted        KM KM.corrected    std.err
## [1,]      0.4394888 0.4545455    0.4656817 0.17797217
## [2,]      0.6603385 0.6750000    0.6586534 0.10971343
## [3,]      0.8233362 0.8352934    0.8276144 0.07384917
## [4,]      0.8978469 0.8816174    0.8650424 0.06477229
## [5,]      0.9396165 0.9512195    0.9533502 0.03536639

plot(calkm, cex.lab=0.7)

Another more general approach to estimating calibration curves for survival models is to use the flexible adaptive hazard regression approach (not covered here). For survival models, “predicted” means predicted survival probability at a single time point, and “observed” refers to the corresponding Kaplan-Meier survival estimate. The number of intervals of predicted survival probabilities in which subjects are ranked depend on the parameter m that correspond to the average number of subjects in each group evaluated at u-time.
As an internal validation tool, the bootstrap is used to de-bias the estimates to correct for overfitting, allowing estimation of the likely future calibration performance of the fitted model.

Evaluating the performance for the Cox model

Discrimination for the survival data : time-dependent ROC curve

Calibration for the Cox model