-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathICI3D_Lab5_introLikelihood.R
154 lines (123 loc) · 6.54 KB
/
ICI3D_Lab5_introLikelihood.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
## Introduction to Likelihood
## (C) Steve Bellan, Cari van Schalkwyk and the ICI3D team 2009–2019
## The code is made available under a Creative Commons Attribution 4.0 International License. You
## are free to reuse this code provided that you give appropriate credit, provide a link to the
## license, and indicate if changes were made. You may do so in any reasonable manner, but not in
## any way that suggests the licensor endorses you or your use. Giving appropriate credit includes
## citation of the original repository.
## Set margins for plotting.
par(mar=c(6,4,4,2))
## Say we are randomly sampling from a population of people in a town
## of size 1,000,000 and we sample 100 people from that population.
sampleSize <- 100
## Say that the true HIV prevalence in that population of size 1,000,000
## is 30%, i.e. 300,000 are HIV positive.
true.prev <- .3
## Sample from this distribution once
## samplePos <- rbinom(1,100,.3)
## You can use the above code to sample from this distribution, but
## we'll set it at 28 just so everyone is working on the same example
## for now.
samplePos <- 28
samplePrev <- samplePos/sampleSize
samplePrev
## Create a color vector for every possible number of HIV+ people we
## could have found when sampling 100 people (0:100 means 101
## possibilities). Then make the value we chose have a different
## color.
color.vec <- rep("black",sampleSize+1)
color.vec[samplePos+1] <- "purple"
### Create a histogram of the binomial distribution function with the
### true prevalence, highlighting the value we sampled.
barplot.obj <- barplot(dbinom(0:sampleSize,size=sampleSize, prob=true.prev),
xlab="number HIV+", names.arg=c(0:sampleSize),
ylab="probability", col=color.vec, border = NA, space =0)
## A politician is claiming that HIV prevalence is 20%
## Given that we found samplePos
samplePos
## positives, would we accept or reject the hypothesis that the true prevalence is .2?
potential.prev <- .2
## Calculate the probability of getting every single possible value
## from the binomial distribtion with this hypothesized prevalence.
prob.dist <- dbinom(0:sampleSize, size=sampleSize, prob=potential.prev)
ymax <- .2
color.vec <- rep("grey",sampleSize)
## Plot the binomial distribution for N=100, and p=.2
barplot(prob.dist,
xlab = "number HIV+",
names.arg = c(0:sampleSize),
ylab = "probability",
col = color.vec ,
border = NA,
main = paste("hypothetical prevalence:",potential.prev*100,"%"),
ylim = c(0,ymax),
space = 0)
##What is the probability of observing 28/100, under this hypothesised prevalence?
dbinom(samplePos, sampleSize, potential.prev)
arrows(x0 = samplePos , y0=ymax, y1=dbinom(samplePos, sampleSize, potential.prev))
## Now do a classic test:
binom.test(samplePos, sampleSize, potential.prev, alternative = "two.sided")
## Task 1: Explain what we learn from the binom.test
## Focus on the confidence intervals; don't worry about the P value (or about the "guess" (potential.prev)
## Task 2: Repeat the above for two or three different examples
## with different hypothesized prevalence and/or samplePos
## Explain what you see.
###########################################################################
## Now let's use the Maximum Likelihood approach to construct confidence intervals.
###########################################################################
## Create a vector of potential prevalences spanning 0-1 with 10000 values
## These are all potential "null hypotheses".
potential.prev.vector <- seq(0,1, length=10000)
## The likelihood of a prevalence is the probability of observing the data given the
## hypothesized prevalence:
## L(prevalence | data) = p(data | prevalence)
likelihoods <- dbinom(samplePos, sampleSize, potential.prev.vector)
plot(potential.prev.vector, likelihoods, col = "purple", type = "l", lwd = 2,
xlab = "potential HIV prevalences", ylab = "likelihood",
main = "p(our data given prevalence) = LIKELIHOOD",
bty = "n")
######################################################################
## Let's look at the negative log-likelihood which is what people
## traditionally use because it's usually easier to calculate (though
## for this example R can do all the work computationally quite
## easily).
plot(potential.prev.vector, -log(likelihoods), type = "l", col = "purple", col.main = "white",
bty = "n", lwd = 3, xlab = "potential prevalences (our models)",
ylab = "-log(likelihood)", main = "we usually minimize the -log(likelihood)")
## We can use the Likelihood Ratio Test to calculate confidence intervals.
## If the logL of a hypothesised prevalence is not within the chi-squared
## alpha=.05 cutoff of the minimum logL, then we do not reject it.
chisq.crit <- qchisq(.95, df = 1)
min.l <- min(-log(likelihoods))
abline(h = min.l + chisq.crit/2, lwd = 3, lty = 2)
######################################################################
## It's hard to see this so let's zoom in on last plot for values of
## the potential prevalence between .15 and .5.
xlim <- c(.15, .5)
zoom.plot.index <- potential.prev.vector>.15 & potential.prev.vector<.5
plot(potential.prev.vector[zoom.plot.index], -log(likelihoods[zoom.plot.index]),
type = "l", col = "purple", col.main = "white", xlim = xlim,
bty = "n", lwd = 3, xlab = "potential prevalences (our models)",
ylab = "-log(likelihood)", main = "we usually minimize the -log(likelihood)")
chisq.crit <- qchisq(.95, df = 1)
chisq.crit
min.l <- min(-log(likelihoods))
ci.likelihood <- range(potential.prev.vector[-log(likelihoods) < min.l + chisq.crit/2])
abline(h = min.l + chisq.crit/2, lwd = 3, lty = 2)
## add title to the plot
ci.l <- signif(ci.likelihood[1],3)
ci.u <- signif(ci.likelihood[2],3)
mtext(paste("95% CI includes HIV prevalences of ", ci.l*100, "% to ", ci.u*100, "%", sep=""), side = 3, line =0)
arrows(ci.l, min.l + 4, ci.l, min.l, length = .2)
arrows(ci.u, min.l + 4, ci.u, min.l, length = .2)
text(ci.l, min.l + 4, ci.l,cex = 1,pos = 3)
text(ci.u, min.l + 4, ci.u,cex = 1,pos = 3)
## Compare confidence intervals calculated using binom.test() and using the Likelihood
## Ratio Test.
binom.test(samplePos, sampleSize, samplePos/sampleSize, alternative = "two.sided")
ci.likelihood
## Are the results similar?
## Task 3: Consider the following questions:
## For this example, do you think it was worth the trouble constructing Likelihood based confidence intervals (CIs)?
## Under what circumstances might you use classical tests for CIs?
## Under what circumstances would likelihood-based tests be better for CIs?