###Estimating the POPULATION MEAN

##To estimate the average price of tranquilizer tablets, we will select a random sample of pharmacies.
##We want the estimate to be within ±0.10 of the true mean with 95% confidence.
##From a pilot study, the standard deviation was 0.85.

#How many pharmacies should be selected?


# Sample size needed to estimate a mean within a given margin of error:
#   n = z^2 * sigma^2 / d^2
prec    <- 0.10  # d: margin of error (half-width of the confidence interval)
sigma   <- 0.85  # standard deviation of the price, from the pilot study
z_alpha <- 1.96  # two-sided standard Normal critical value for alpha = 0.05

n <- z_alpha^2 * sigma^2 / prec^2  # unrounded requirement, about 277.56

# ceiling() rounds UP to the next whole unit: rounding down would leave the
# interval slightly wider than the requested margin of error.
sample_size <- ceiling(n)
sample_size  # at least 278 pharmacies are required

##########################################################
###Estimating a POPULATION PROPORTION/PREVALENCE

##We want to estimate the prevalence of hypertension in a population.
##We select a random sample and want 95% confidence with a precision (margin of error) of 5 percentage points.
##Since we do not know the true prevalence, we assume p=0.5.
##p=0.5 maximises the variability p*(1-p), so it is the most conservative scenario.

# Minimum sample size to estimate a prevalence when no prior estimate exists:
#   n = z^2 * p * (1 - p) / d^2, rounded up to a whole subject.
zstar <- qnorm(0.975)  # two-sided Normal critical value for a 95% confidence level
p     <- 0.5           # assumed prevalence; 0.5 maximises p * (1 - p)
prec  <- 0.05          # margin of error, on the proportion scale
n <- zstar^2 * p * (1 - p) / prec^2
minsamp <- ceiling(n)
minsamp  # 385 subjects

##Assume we know from previous studies that the prevalence of hypertension is 20% in the population
# Same formula, n = z^2 * p * (1 - p) / d^2, now using the prevalence
# reported by previous studies instead of the worst case.
zstar <- qnorm(0.975)  # two-sided Normal critical value for a 95% confidence level
p     <- 0.2           # expected prevalence of hypertension in the population
prec  <- 0.05          # margin of error, on the proportion scale
n <- zstar^2 * p * (1 - p) / prec^2
minsamp <- ceiling(n)
minsamp  # 246 subjects: fewer than with p = 0.5 because p * (1 - p) is smaller


########################################################
###Estimating the CONFIDENCE INTERVAL for an INCIDENCE RATE
##The diet dataframe contains data from 337 subjects and 14 variables
##It is a subsample of a larger cohort used to estimate the incidence of CHD
##These subjects had completed a questionnaire on dietary habits: 46 CHD events occurred in this sample.

library(Epi)
data(diet)

# Crude incidence rate of CHD with a Normal-approximation 95% confidence interval.
# Columns are read through `diet$...` rather than attach(diet): attach() leaves
# the data frame on the search path for the rest of the session, and a global
# created afterwards (such as `y` below) masks any attached column of that name.

# Follow-up time in years for each subject: date of exit minus date of entry,
# both converted with Epi's cal.yr()
y <- cal.yr(diet$dox) - cal.yr(diet$doe)

Y <- sum(y)         # total follow-up of the study (person-years)
D <- sum(diet$chd)  # total number of incident cases (46)
rate <- D / Y       # incidence rate per person-year

# The standard error of the rate is taken as sqrt(D) / Y, so the interval is
# rate -/+ 1.96 * sqrt(D) / Y. The vector c(0, -1, 1) yields the point
# estimate, the lower limit and the upper limit in a single expression.
results <- round(rate + c(0, -1, 1) * 1.96 * (sqrt(D) / Y), digits = 3)
results  #Incidence was 0.010 per person-year [95% CI: 0.007-0.013]

############################################################################
###Sample size for an INCIDENCE RATE based on PRECISION

##Assume that, for a specific disease A, 
##the incidence rate from previous studies is estimated at 50 per 10,000 person-years.

##We want to determine the minimum sample size needed to estimate, at a 95% confidence level,
##the incidence rate in that population to within ±5 per 10,000 person-years.

# Step 1 - standard error that delivers the requested margin of error.
# A 95% CI is estimate -/+ 1.96 * SE, so a half-width of 5 (per 10,000
# person-years) requires 1.96 * SE = 5, i.e. SE = 5 / 1.96.
se.rate <- 5 / 1.96

# Step 2 - number of events needed to reach that SE.
# For a rate, SE = rate / sqrt(cases), hence cases = (rate / SE)^2,
# here [50 / (5 / 1.96)]^2 = 19.6^2.
number.cases <- (50 / se.rate)^2
number.cases   # 384.16, i.e. 385 cases after rounding up

# Step 3 - person-time needed to observe that many events.
# rate = cases / person-years  -->  person-years = cases / rate.
# The rate is expressed per 10,000 person-years, so this quotient is in units
# of 10,000 person-years (about 7.68).
person.years <- number.cases / 50
person.years * 10000   # person-years on the natural scale (about 76,832)