-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathestimate_binary_vector.R
141 lines (125 loc) · 5.05 KB
/
estimate_binary_vector.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# PRELIMINARIES ----------------------------------------------------------------
#
# Authors: Rob Hinch and Chris Wymant
# Acknowledgment: written while funded by a Li Ka Shing Foundation grant
# awarded to Christophe Fraser.
#
# Abbreviations:
# df = dataframe
# num = number
# mcmc = Markov Chain Monte Carlo
# param = parameter
# mu = the meanlog param of a lognormal
# sigma = the sdlog param of a lognormal
#
# Script purpose: we simulate a set of values y from either a 'signal'
# distribution or a 'noise' distribution, both lognormals but with different
# params, and then get Stan to infer the params of those distributions
# and which values were signal and which were noise. See the comments in the
# Stan code for how that works.
library(tidyverse)
library(rstan)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
# INPUT ------------------------------------------------------------------------
# The path of the Stan code associated with this R code
file_input_stan_code <-
"~/teaching/other_topics/estimate_binary_vector.stan"
# Population-level params for the simulation
num_observations <- 1000
fraction_signal <- 0.8
mu_signal <- 10
sigma_signal <- 3
mu_noise <- 3
sigma_noise <- 0.8
# SIMULATE DATA ----------------------------------------------------------------
df <- tibble(
is_signal = rbernoulli(num_observations, fraction_signal),
y_observed = if_else(is_signal,
rlnorm(num_observations, mu_signal, sigma_signal),
rlnorm(num_observations, mu_noise, sigma_noise)))
# RUN STAN ---------------------------------------------------------------------
stan_input_posterior <- list(
num_observations = nrow(df),
y = df$y_observed,
prior_mu_signal_min = 7,
prior_mu_signal_max = 15,
prior_sigma_signal_min = 0.5,
prior_sigma_signal_max = 5,
prior_mu_noise_min = 1,
prior_mu_noise_max = 6,
prior_sigma_noise_min = 0.5,
prior_sigma_noise_max = 5,
prior_fraction_signal_min = 0,
prior_fraction_signal_max = 1,
get_posterior_not_prior = 1L
)
stan_input_prior <- stan_input_posterior
stan_input_prior$get_posterior_not_prior <- 0L
# Compile the Stan code
model_compiled <- stan_model(file_input_stan_code)
# Run the Stan code to sample from both posterior and prior
num_mcmc_iterations <- 500
num_mcmc_chains <- 4
start_time <- Sys.time()
fit <- sampling(model_compiled,
data = stan_input_posterior,
iter = num_mcmc_iterations,
chains = num_mcmc_chains)
fit_prior <- sampling(model_compiled,
data = stan_input_prior,
iter = num_mcmc_iterations,
chains = num_mcmc_chains)
end_time <- Sys.time()
cat("Time taken running Stan:\n")
end_time - start_time
# ANALYSE STAN OUTPUT ----------------------------------------------------------
# Get stan output into a long df labelled by posterior/prior
df_fit <- bind_rows(fit %>%
as.data.frame() %>%
mutate(density_type = "posterior",
sample = row_number()),
fit_prior %>%
as.data.frame() %>%
mutate(density_type = "prior",
sample = row_number())) %>%
as_tibble() %>%
pivot_longer(-c("sample", "density_type"), names_to = "param")
# Plot the prior and posterior for population-level (not for specific
# observations) params and the simulation truth
df_params_true <- tibble(
param = c("fraction_signal", "mu_signal", "sigma_signal", "mu_noise",
"sigma_noise"),
value = c(fraction_signal, mu_signal, sigma_signal,
mu_noise, sigma_noise))
ggplot(df_fit %>%
filter(!startsWith(param, "lp"),
!startsWith(param, "prob_is_noise_given_params"))) +
geom_histogram(aes(value, y = ..density.., fill = density_type),
alpha = 0.6,
position = "identity",
bins = 60) +
geom_vline(data = df_params_true, aes(xintercept = value), color = "black") +
facet_wrap(~param, scales = "free", nrow = 3) +
scale_fill_brewer(palette = "Set1") +
theme_classic() +
coord_cartesian(expand = FALSE) +
labs(fill = "",
x = "param value",
y = "probability density")
# Calculate the posterior means for each observation's probability of
# being noise. Link observations to their simulation truth. Binarise the
# means and compare against whether they really were noise.
df_fit %>%
filter(startsWith(param, "prob_is_noise_given_params"),
density_type == "posterior") %>%
group_by(param) %>%
summarise(mean_prob_is_noise = mean(value), .groups = "drop") %>%
mutate(which_observation =
str_match(param, "^prob_is_noise_given_params\\[([0-9]+)\\]$")[, 2] %>%
as.integer()) %>%
left_join(df %>% mutate(which_observation = row_number()),
by = "which_observation") %>%
mutate(correctly_classified = (mean_prob_is_noise < 0.5 & is_signal) |
(mean_prob_is_noise > 0.5 & ! is_signal)) %>%
summarise(fraction_correctly_classified = sum(correctly_classified) / n())