-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathHierarchicalSchools.R
139 lines (121 loc) · 5.08 KB
/
HierarchicalSchools.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# PRELIMINARIES ----------------------------------------------------------------
library(tidyverse)
library(ggforce) # just for one plot
library(rstan)
library(lme4)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
set.seed(1)
file_input_stan_code <- "~/Dropbox (Infectious Disease)/STAN/HierarchicalSchools.stan"
file_output_plot_params <- "~/HierarchicalSchools.pdf"
file_output_plot_data <- "~/HierarchicalSchoolsData.pdf"
num_students <- 1000
num_schools <- 10
stddev_students <- 10
stddev_schools <- 5
grade_untreated_mean <- 60
treatment_effect <- 10
# SIMULATE DATA ----------------------------------------------------------------
# A df with one row per student. Randomly assign each to a school, and choose
# which schools were treated (even schools).
df <- tibble(student = 1:num_students,
school = sample(1:num_schools, size = num_students, replace = TRUE),
treated = school %% 2 == 0)
# Simulate effects specific to each school (separate from the treatment effect)
school_effects <- rnorm(num_schools,
mean = 0,
sd = stddev_schools)
# As a convenient intermediate quantity, define each student's expected grade:
# the mean for students at untreated schools +
# the effect specific to that student's school +
# the treatment effect if their school was treated
df$grade_expected <-
grade_untreated_mean +
map_dbl(df$school, function(which_school) {school_effects[[which_school]]}) +
if_else(df$treated, treatment_effect, 0)
# Using that intermediate, simulate each student's grade
df$grade <- rnorm(n = num_students,
mean = df$grade_expected,
sd = stddev_students)
df$grade <- pmin(df$grade, 100)
df$grade <- pmax(df$grade, 0)
ggplot(df %>%
mutate(treated = if_else(treated, "treated", "not treated")),
aes(as.factor(school), grade)) +
geom_violin() +
geom_sina(aes(color = treated)) +
theme_classic() +
scale_y_continuous(expand = expansion(mult = c(0.02, 0.02))) +
labs(x = "school",
y = "grade",
color = "") +
scale_color_brewer(palette = "Set1")
ggsave(file_output_plot_data, height = 6, width = 7)
# FREQUENTIST ESTIMATION -------------------------------------------------------
lmm <- lmer(data = df, grade ~ treated + (1 | school))
summary(lmm)
confint(lmm)
# RUN STAN ---------------------------------------------------------------------
# Collect together all the data Stan wants in a list. Make one list for sampling
# from the posterior and one for sampling from the prior.
list_stan_posterior <- list(
num_schools = num_schools,
num_students = num_students,
school = df$school,
grade = df$grade,
treated = df$treated,
get_posterior_not_prior = 1L
)
list_stan_prior <- list_stan_posterior
list_stan_prior$get_posterior_not_prior <- 0L
# Compile the Stan code
model_compiled <- stan_model(file_input_stan_code)
# Run the Stan code
num_mcmc_iterations <- 500
num_mcmc_chains <- 4
fit <- sampling(model_compiled,
data = list_stan_posterior,
iter = num_mcmc_iterations,
chains = num_mcmc_chains)
fit_prior <- sampling(model_compiled,
data = list_stan_prior,
iter = num_mcmc_iterations,
chains = num_mcmc_chains)
# ANALYSE STAN OUTPUT ----------------------------------------------------------
# Get stan output into a long df labelled by posterior/prior
df_fit <- bind_rows(fit %>%
as.data.frame() %>%
mutate(density_type = "posterior",
sample = row_number()),
fit_prior %>%
as.data.frame() %>%
mutate(density_type = "prior",
sample = row_number())) %>%
as_tibble() %>%
pivot_longer(-c("sample", "density_type"), names_to = "parameter")
# Plot each parameter's marginal posterior, marginal prior and simulation truth
list_parameters_truth <- list(stddev_students = stddev_students,
stddev_schools = stddev_schools,
grade_untreated_mean = grade_untreated_mean,
treatment_effect = treatment_effect)
df_parameters_truth <- tibble(parameter = names(list_parameters_truth),
value = unlist(list_parameters_truth)) %>%
bind_rows(tibble(parameter = paste0("school_effects[", 1:num_schools, "]"),
value = school_effects))
ggplot(df_fit %>%
filter(parameter != "lp__",
! str_starts(parameter, "grade_expected"),
! str_starts(parameter, "grade_simulated"))) +
geom_histogram(aes(value, y = ..density.., fill = density_type),
alpha = 0.6,
position = "identity",
bins = 60) +
geom_vline(data = df_parameters_truth, aes(xintercept = value)) +
facet_wrap(~parameter, scales = "free", nrow = 3) +
scale_fill_brewer(palette = "Set1") +
theme_classic() +
coord_cartesian(expand = FALSE) +
labs(fill = "",
x = "parameter value",
y = "probability density")
ggsave(file_output_plot_params, height = 6, width = 12)