# gme.r
library(cmdstanr)
library(tidyverse)
library(latex2exp)
library(posterior)
library(bayesplot)
library(reshape2)
library(ggridges)
# Use all detected cores so that chains can run in parallel. (Recompilation is handled
# automatically: cmdstanr rebuilds a model only when its .stan file has changed, so we don't
# have to recompile every time.)
options(mc.cores = parallel::detectCores())
# Define a function to read in the appropriate data set and do some preprocessing.
import_data = function(dataset) {
  if (dataset == 'gc') {
    # globular cluster data set
    d = read.csv('../data/GC_Vasiliev.csv') %>%
      filter(!is.na(Vlos)) %>% # require Vlos to exist
      filter(Rgc > 15)         # only use GCs outside a 15 kpc radius
    # format GC names
    d$Fullname = d$Fullname %>% trimws
  } else if (dataset == 'dg') {
    # dwarf galaxy data set
    d = read.csv('../data/DG_Fritz.csv') %>%
      filter(!is.na(Vlos)) %>% # require Vlos to exist
      filter(Rgc > 15)         # only use DGs outside a 15 kpc radius
    # format DG names
    d$Fullname = d$Fullname %>% trimws
    # drop these DGs: they are possibly associated with the LMC
    remove = c("Carina II", "Carina III", "Horologium I", "Hydrus I")
    d = d %>% filter(!Fullname %in% remove)
  }
  return(d)
}
# read the data (use 'dg' instead of 'gc' for the dwarf-galaxy sample)
d = import_data('gc')
# Format the data to be passed in to Stan.
stan_data = list(
  N = nrow(d),
  ra = d$RA,
  dec = d$DEC,
  plx = d$plx,
  Xgc = d$Xgc,
  Ygc = d$Ygc,
  Zgc = d$Zgc,
  pmra_measured = d$PMra,
  pmdec_measured = d$PMdec,
  vlos_measured = d$Vlos,
  r_measured = d$Rgc,
  pmra_err = d$ePMra,
  pmdec_err = d$ePMdec,
  vlos_err = d$eVlos,
  r_err = d$eRgc
)
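# A quick sanity check (optional, not part of the original pipeline): Stan rejects data
# containing NA values, so confirm that none slipped through the filtering above.
stopifnot(sapply(stan_data, function(x) !anyNA(x)))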
# Define a function that generates a list of initial values. Initial values must be specified
# either for all of the parameters or for none of them, and the type of each initial value must
# match the corresponding declaration in the Stan model.
initfun = function() {
  list(
    # for the GC dataset, these values seem to work well
    p_gamma = runif(1, 0.4, 0.55),
    p_phi0 = runif(1, 60, 80),
    p_beta = runif(1, -0.1, 0.5),
    p_alpha = runif(1, 3.1, 3.5),
    # for the DG dataset the (hyper)priors are different, so
    # use these (random) initial values instead:
    # p_beta = runif(1, -2.5, 0.5),
    # p_alpha = runif(1, 3.3, 3.7),
    pmra = rnorm(stan_data$N, 0, 1e-6),
    pmdec = rnorm(stan_data$N, 0, 1e-6),
    vlos = rnorm(stan_data$N, 0, 1),
    r = runif(stan_data$N, 0, max(stan_data$r_measured))
  )
}
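# Quick sanity check (optional): inspect one draw of initial values to confirm that the names,
# types, and lengths line up with the parameter block of the Stan model.
str(initfun())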
# Given a Stan file, compile the model and run the sampler, passing in (at a minimum) the data.
# Here we also specify the number of chains (each run in its own process), the initial values,
# and some additional parameters controlling the algorithm.
# `adapt_delta` effectively controls the step size of the numerical integrator. Run the code
#   with the default value (0.8) first; if the number of divergences is large enough to affect
#   the validity of the samples, increase it. This makes the sampler more robust but slower.
#   Do not increase it if you only see a few divergences out of a few thousand draws, or if
#   your `n_eff` is already reasonably large.
# `max_treedepth` caps the depth of the binary tree built at each NUTS iteration as the sampler
#   integrates forward and backward in time. If the code complains about saturated tree depths,
#   this value may need to be increased, at the cost of longer runtimes. Note that saturated
#   tree depths are an efficiency problem, not a validity problem.
# `chains` controls the number of Markov chains (default 4). Each chain runs on a separate core,
#   so you can use as many chains as you have cores; however, since an `n_eff` of a few hundred
#   is usually enough for inference, more chains are not always necessary and may increase the
#   runtime. Use `chains = 1` for diagnostic purposes.
# `iter_warmup` controls the number of warmup iterations, during which NUTS adapts its step size
#   and mass matrix; these draws are not used for inference. `iter_sampling` controls the number
#   of post-warmup iterations, so the number of usable draws per chain equals `iter_sampling`.
mod = cmdstan_model("../models/gc.stan")
fit = mod$sample(
  data = stan_data,
  chains = 2,
  init = initfun,
  iter_warmup = 1000,
  iter_sampling = 1000
  # output_dir = '../saved/',
  # max_treedepth = 13,
  # adapt_delta = 0.95
)
# Alternative: fit the same model with rstan instead of cmdstanr (requires library(rstan)).
# The rest of this script assumes a cmdstanr fit, so leave this commented out unless you also
# adapt the downstream calls.
# fit = stan(file = "../models/gc.stan", # or use another model (.stan file) in the same directory
#            data = stan_data,
#            # warmup = 4e3,
#            # iter = 4e3 + 2e3,
#            chains = 2, # use this for diagnostics
#            # chains = parallel::detectCores(), # use all cores
#            init = initfun,
#            control = list(
#              # max_treedepth = 13,
#              # adapt_delta = 0.95
#            ),
#            verbose = TRUE)
# Run diagnostics.
fit$cmdstan_diagnose()
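# A couple of extra numeric checks (one way to do this with cmdstanr): per-chain counts of
# divergences and saturated tree depths, plus R-hat and bulk/tail ESS for the parameters of
# interest, to confirm that the effective sample size is large enough for inference.
fit$diagnostic_summary()
fit$summary(variables = c("p_phi0", "p_gamma", "p_alpha", "p_beta"))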
# Draw samples from the posterior.
samples = fit$draws() %>% as_draws_df
posterior = fit$draws() %>% as.array
np = nuts_params(fit)
# Here are the parameters of most interest.
params = c("p_phi0", "p_gamma", "p_alpha", "p_beta")
# Make some plots of the posterior samples.
mcmc_dens_overlay(posterior, pars = params)
# Parallel plot. Check where divergences occur.
mcmc_parcoord(posterior, pars = params, transformations = scale, np = np)
# Pairs plot
mcmc_pairs(posterior, pars = params, np = np, off_diag_fun = 'hex')
# Show a traceplot of the chains. Each chain should look like stationary noise with a stable
# mean and non-zero variance, and the chains should overlap.
mcmc_trace(posterior, pars = params)
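# (Optional) rank-histogram overlays are often easier to read than raw traceplots when checking
# that the chains mix well.
mcmc_rank_overlay(posterior, pars = params)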
# Ridgeline plots of distances, proper motions, and line-of-sight velocities.
plot_posterior_ridgelines = function(samples, varname) {
  select(samples, contains(paste(varname, "[", sep = ""))) %>%
    melt %>%
    ggplot() +
    geom_density_ridges(aes(x = value, y = variable), quantile_lines = TRUE, quantiles = 2)
}
plot_posterior_ridgelines(samples, "pmra")
plot_posterior_ridgelines(samples, "pmdec")
plot_posterior_ridgelines(samples, "r")
plot_posterior_ridgelines(samples, "vlos")
# Define a function which, given a radius in kpc, returns either the full set of posterior
# samples of the mass enclosed within that radius (full = TRUE) or a set of quantiles of that
# mass. The resulting mass is in units of M$_\odot$.
mass_at_radius = function(r, data, full = FALSE) {
  m = data$p_gamma * data$p_phi0 * 2.325e-3 * r^(1 - data$p_gamma) * 1e12
  if (full) {
    return(m)
  } else {
    return(quantile(m, c(0.025, 0.125, 0.25, 0.5, 0.75, 0.875, 0.975)))
  }
}
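# For example (100 kpc is just an illustrative radius), the posterior median and central 95%
# interval of the mass enclosed within 100 kpc, in solar masses:
mass_at_radius(100, samples)[c('2.5%', '50%', '97.5%')]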
# Plot the median enclosed-mass profile out to some radius, along with the 50%, 75%, and 95%
# credible bands. Masses are plotted in units of 10^12 M_sol.
plot_masses = function(massfunc, data, add = FALSE, color = "blue", upto = 200) {
  radii = seq(1, upto, length.out = 1000)
  masses = sapply(radii, massfunc, data = data) / 1e12
  par(mfrow = c(1, 1))
  if (add) {
    lines(radii, masses['50%',], type = 'l')
  } else {
    plot(radii, masses['50%',], type = 'l',
         ylab = TeX("Mass ($10^{12} M_{sol}$)"), xlab = "Radius (kpc)", ylim = c(0, 1.5))
  }
  polygon(c(radii, rev(radii)), c(masses['25%',], rev(masses['75%',])),
          col = adjustcolor(color, alpha.f = 0.4), fillOddEven = TRUE)
  polygon(c(radii, rev(radii)), c(masses['12.5%',], rev(masses['87.5%',])),
          col = adjustcolor(color, alpha.f = 0.3), fillOddEven = TRUE)
  polygon(c(radii, rev(radii)), c(masses['2.5%',], rev(masses['97.5%',])),
          col = adjustcolor(color, alpha.f = 0.2), fillOddEven = TRUE)
}
# Plot the mass distribution out to 300 kpc.
plot_masses(mass_at_radius, samples, color='#5E81AC', upto=300)
# Plot the posterior density of the mass enclosed within 200 kpc.
mass_at_radius(200, samples, full = TRUE) %>%
  data.frame %>%
  rename(mass = '.') %>%
  ggplot(aes(x = mass)) +
  geom_density(col = '#5E81AC', size = 1) +
  geom_vline(xintercept = mass_at_radius(200, samples)['50%'], col = '#5E81AC', size = 1) +
  xlim(5e11, 2e12)