# Load libraries
library(epiR)
library(Epi)
library(popEpi)
library(ggplot2)
library(scales)
library(lubridate)
library(AER)
library(MASS)
library(survival)

# Prevalence example
# -------------------------------------------------------------------

# Interested in the prevalence of disease X in a population of 1000 individuals.
# Two hundred are tested and four returned a positive result.
# Assuming 100% test sensitivity and specificity, estimate prevalence.

ncas <- 4
npop <- 200
tmp <- as.matrix(cbind(ncas, npop))

epi.conf(
  tmp,
  ctype = "prevalence",
  method = "exact",
  N = 1000,
  conf.level = 0.95
) * 100

# The estimated prevalence of disease X in this population is 2.0
# (95% confidence interval [CI] 0.55 - 5.0) cases per 100 individuals.


# The total number of persons in Country X alive with colon cancer on
# 1 January 2023 is 16281, and the number of persons in Country X on
# 1 January 2023 is 5534738, so the point prevalence is:

16281 / 5534738

# That is, 0.29%.

# This is the empirical prevalence of colon cancer patients in Country X
# on 1 January 2023.

# The theoretical prevalence is the probability that a randomly chosen
# person from the population has colorectal cancer.

# The data used to estimate p are:
# X = number of persons with colorectal cancer
# N = number of persons in the population
# We can fit a binomial model:

X <- 16281
N <- 5534738
mp <- glm(cbind(X, N - X) ~ 1, family = binomial) #N-X=number of failures
round(ci.pred(mp) * 100, 4)

# glm fits generalized linear models, here a model for a binomial outcome.
# ci.pred produces a prediction of the theoretical prevalence from the model.

# -------------------------------------------------------------------
# Another example: prevalence of diabetes by age and sex
# -------------------------------------------------------------------

# We use a dataset of Diabetes prevalence in Denmark in 1-year age classes
# by sex, measured on 01-01-2010.

data(pr)
str(pr)
head(pr)

# Variables:
# A   : Numeric, age, 0-99
# sex : Sex, factor with levels M F
# X   : Number of diabetic patients
# N   : Population size

# Compute observed prevalence by age and sex:

prt <- stat.table(index = list(A, sex), contents = ratio(X, N, 100), data = pr)[1, , ]
round(prt[70:76, ], 1)
print(prt)
# Plot prevalence by age with one curve for each sex:

matplot(
  0:99,
  prt,
  type = "l",
  lwd = 2,
  col = c(4, 2),
  lty = 1,
  xlab = "Age"
)

# Find the maximum values for males and females

i_males   <- which.max(prt[,1])
i_females <- which.max(prt[,2])

age_males   <- (i_males - 1) 
age_females <- (i_females - 1) 

age_males
age_females

abline(v = c(76, 81), col=c("red", "blue"))

# Ages between 76 and 81 correspond to the highest prevalences
# for men and women.

# -------------------------------------------------------------------
# Cumulative incidence: US Traffic Fatalities
# -------------------------------------------------------------------

# We use a dataset of US traffic fatalities for the "lower 48" US states,
# annually for 1982 through 1988.

data(Fatalities)
str(Fatalities)

# Variables of interest:
# fatal: Number of vehicle fatalities
# pop  : Population in the state

# Calculate cumulative incidence per state per year and plot as time series:

table.deaths <- with(Fatalities, tapply(fatal, list(state, year), sum))
table.exp <- with(Fatalities, tapply(pop, list(state, year), sum))
inc.cum.year.state <- (table.deaths / table.exp) * 10000

# Plot just the first five states:

plot.ts(
  t(inc.cum.year.state[1:5, ]),
  plot.type = "single",
  xlab = "years",
  ylab = "Incidence",
  col = c(1:5),
  lty = c(1:5),
  ylim = c(1, 4)
)
legend(
  "topleft",
  col = c(1:5),
  lty = c(1:5),
  legend = c("al", "az", "ar", "ca", "co"),
  bty = "n"
)

# Here we are not showing confidence intervals around these estimates.

# -------------------------------------------------------------------
# Incidence rate
# -------------------------------------------------------------------

# A study was conducted by Feychting, Osterlund, and Ahlbom (1998)
# to report the frequency of cancer among the blind.
# A total of 136 diagnoses of cancer were made from 22050 person-years at risk.

ncas <- 136
ntar <- 22050
tmp <- as.matrix(cbind(ncas, ntar))

epi.conf(
  tmp,
  ctype = "inc.rate",
  method = "exact",
  N = 1000,
  conf.level = 0.95
) * 1000

# The incidence rate of cancer in this population was 6.2
# (95% CI 5.2 to 7.3) cases per 1000 person-years at risk.



# -------------------------------------------------------------------
# A very famous incidence rate: the mortality rate
# -------------------------------------------------------------------

# Mortality is typically reported as a number of people that have died
# in a population of a certain size.

# Example: 2648 persons died out of 150000 persons.

2648 / 150000

# This is 17.6 per 1000 persons.

# But deaths must be recorded over some period of time.
# If the 2648 deaths were accumulated over a period of two years,
# then the mortality would be half of what we just computed, because
# the 150000 people had each two years to encounter death.

# This leads to the concept of person-years.

# If the population size is 150000 and the observation period is 2 years,
# the number of person-years is 300000 and the mortality rate is:

2648 / 300000

# 8.83 deaths per 1000 person-years.

# -------------------------------------------------------------------
# Melanoma data (incidence rate, more elaborate example)
# -------------------------------------------------------------------

# Load and view the dataset Melanoma:

data(Melanoma)
head(Melanoma)

# Variables:
# time      : survival time in days, possibly censored
# status    : 1 died from melanoma, 2 alive, 3 dead from other causes
# sex       : 1 = male, 0 = female
# age       : age in years
# year      : year of operation
# thickness : tumour thickness in mm
# ulcer     : 1 = presence, 0 = absence

# Calculate the incidence rate of death for melanoma using the
# person-time epidemiological approach.
# Ignore death from other causes for now.

mm.deaths <- sum(Melanoma$status == 1)
mm.deaths

per.time <- sum(Melanoma$time / 365.25)
per.time

mortality.rate <- mm.deaths / per.time
incidence.rate <- round(100 * mortality.rate, 1)
incidence.rate

# If you report the incidence rate of deaths as 4.7 per 100 person-years,
# epidemiologists might understand, but most others will not.
# You can express it as 4.7 deaths per 100 persons per year.

# The survival package has a function that computes person-years of follow-up,
# also stratified into subgroups.

py <- pyears(Surv(time, status == 1) ~ 1, data = Melanoma)
py$pyears

# Evaluate whether the incidence rate of death differs by sex:

py.sex <- pyears(Surv(time, status == 1) ~ sex, data = Melanoma)
py.sex
py.sex$pyears

# Re-do the calculations stratified by sex:

index.males <- Melanoma$sex == 1
mm.deaths.males <- sum(Melanoma[index.males, ]$status == 1)
mm.deaths.males

index.females <- Melanoma$sex == 0
mm.deaths.females <- sum(Melanoma[index.females, ]$status == 1)
mm.deaths.females

mortality.rate.males <- mm.deaths.males / py.sex$pyears[2]
incidence.rate.males <- round(100 * mortality.rate.males, 1)

mortality.rate.females <- mm.deaths.females / py.sex$pyears[1]
incidence.rate.females <- round(100 * mortality.rate.females, 1)

incidence.rate.males
incidence.rate.females

# For males we observe 6.9 deaths per 100 persons per year,
# and for females 3.6 deaths per 100 persons per year.