# Load libraries library(epiR) library(Epi) library(popEpi) library(ggplot2) library(scales) library(lubridate) library(AER) library(MASS) library(survival) # Prevalence example # ------------------------------------------------------------------- # Interested in the prevalence of disease X in a population of 1000 individuals. # Two hundred are tested and four returned a positive result. # Assuming 100% test sensitivity and specificity, estimate prevalence. ncas <- 4 npop <- 200 tmp <- as.matrix(cbind(ncas, npop)) epi.conf( tmp, ctype = "prevalence", method = "exact", N = 1000, conf.level = 0.95 ) * 100 # The estimated prevalence of disease X in this population is 2.0 # (95% confidence interval [CI] 0.55 - 5.0) cases per 100 individuals. # The total number of persons in Country X alive with colon cancer on # 1 January 2023 is 16281, and the number of persons in Country X on # 1 January 2023 is 5534738, so the point prevalence is: 16281 / 5534738 # That is, 0.29%. # This is the empirical prevalence of colon cancer patients in Country X # on 1 January 2023. # The theoretical prevalence is the probability that a randomly chosen # person from the population has colorectal cancer. # The data used to estimate p are: # X = number of persons with colorectal cancer # N = number of persons in the population # We can fit a binomial model: X <- 16281 N <- 5534738 mp <- glm(cbind(X, N - X) ~ 1, family = binomial) #N-X=number of failures round(ci.pred(mp) * 100, 4) # glm fits generalized linear models, here a model for a binomial outcome. # ci.pred produces a prediction of the theoretical prevalence from the model. # ------------------------------------------------------------------- # Another example: prevalence of diabetes by age and sex # ------------------------------------------------------------------- # We use a dataset of Diabetes prevalence in Denmark in 1-year age classes # by sex, measured on 01-01-2010. data(pr) str(pr) head(pr) # Variables: # A : Numeric, age, 0-99 # sex : Sex, factor with levels M F # X : Number of diabetic patients # N : Population size # Compute observed prevalence by age and sex: prt <- stat.table(index = list(A, sex), contents = ratio(X, N, 100), data = pr)[1, , ] round(prt[70:76, ], 1) print(prt) # Plot prevalence by age with one curve for each sex: matplot( 0:99, prt, type = "l", lwd = 2, col = c(4, 2), lty = 1, xlab = "Age" ) # Find the maximum values for males and females i_males <- which.max(prt[,1]) i_females <- which.max(prt[,2]) age_males <- (i_males - 1) age_females <- (i_females - 1) age_males age_females abline(v = c(76, 81), col=c("red", "blue")) # Ages between 76 and 81 correspond to the highest prevalences # for men and women. # ------------------------------------------------------------------- # Cumulative incidence: US Traffic Fatalities # ------------------------------------------------------------------- # We use a dataset of US traffic fatalities for the "lower 48" US states, # annually for 1982 through 1988. data(Fatalities) str(Fatalities) # Variables of interest: # fatal: Number of vehicle fatalities # pop : Population in the state # Calculate cumulative incidence per state per year and plot as time series: table.deaths <- with(Fatalities, tapply(fatal, list(state, year), sum)) table.exp <- with(Fatalities, tapply(pop, list(state, year), sum)) inc.cum.year.state <- (table.deaths / table.exp) * 10000 # Plot just the first five states: plot.ts( t(inc.cum.year.state[1:5, ]), plot.type = "single", xlab = "years", ylab = "Incidence", col = c(1:5), lty = c(1:5), ylim = c(1, 4) ) legend( "topleft", col = c(1:5), lty = c(1:5), legend = c("al", "az", "ar", "ca", "co"), bty = "n" ) # Here we are not showing confidence intervals around these estimates. # ------------------------------------------------------------------- # Incidence rate # ------------------------------------------------------------------- # A study was conducted by Feychting, Osterlund, and Ahlbom (1998) # to report the frequency of cancer among the blind. # A total of 136 diagnoses of cancer were made from 22050 person-years at risk. ncas <- 136 ntar <- 22050 tmp <- as.matrix(cbind(ncas, ntar)) epi.conf( tmp, ctype = "inc.rate", method = "exact", N = 1000, conf.level = 0.95 ) * 1000 # The incidence rate of cancer in this population was 6.2 # (95% CI 5.2 to 7.3) cases per 1000 person-years at risk. # ------------------------------------------------------------------- # A very famous incidence rate: the mortality rate # ------------------------------------------------------------------- # Mortality is typically reported as a number of people that have died # in a population of a certain size. # Example: 2648 persons died out of 150000 persons. 2648 / 150000 # This is 17.6 per 1000 persons. # But deaths must be recorded over some period of time. # If the 2648 deaths were accumulated over a period of two years, # then the mortality would be half of what we just computed, because # the 150000 people had each two years to encounter death. # This leads to the concept of person-years. # If the population size is 150000 and the observation period is 2 years, # the number of person-years is 300000 and the mortality rate is: 2648 / 300000 # 8.83 deaths per 1000 person-years. # ------------------------------------------------------------------- # Melanoma data (incidence rate, more elaborate example) # ------------------------------------------------------------------- # Load and view the dataset Melanoma: data(Melanoma) head(Melanoma) # Variables: # time : survival time in days, possibly censored # status : 1 died from melanoma, 2 alive, 3 dead from other causes # sex : 1 = male, 0 = female # age : age in years # year : year of operation # thickness : tumour thickness in mm # ulcer : 1 = presence, 0 = absence # Calculate the incidence rate of death for melanoma using the # person-time epidemiological approach. # Ignore death from other causes for now. mm.deaths <- sum(Melanoma$status == 1) mm.deaths per.time <- sum(Melanoma$time / 365.25) per.time mortality.rate <- mm.deaths / per.time incidence.rate <- round(100 * mortality.rate, 1) incidence.rate # If you report the incidence rate of deaths as 4.7 per 100 person-years, # epidemiologists might understand, but most others will not. # You can express it as 4.7 deaths per 100 persons per year. # The survival package has a function that computes person-years of follow-up, # also stratified into subgroups. py <- pyears(Surv(time, status == 1) ~ 1, data = Melanoma) py$pyears # Evaluate whether the incidence rate of death differs by sex: py.sex <- pyears(Surv(time, status == 1) ~ sex, data = Melanoma) py.sex py.sex$pyears # Re-do the calculations stratified by sex: index.males <- Melanoma$sex == 1 mm.deaths.males <- sum(Melanoma[index.males, ]$status == 1) mm.deaths.males index.females <- Melanoma$sex == 0 mm.deaths.females <- sum(Melanoma[index.females, ]$status == 1) mm.deaths.females mortality.rate.males <- mm.deaths.males / py.sex$pyears[2] incidence.rate.males <- round(100 * mortality.rate.males, 1) mortality.rate.females <- mm.deaths.females / py.sex$pyears[1] incidence.rate.females <- round(100 * mortality.rate.females, 1) incidence.rate.males incidence.rate.females # For males we observe 6.9 deaths per 100 persons per year, # and for females 3.6 deaths per 100 persons per year.