################################################################################
# Script to fit phylogenetic multilevel model using brms: Part 1
# Title: Temperature effect on metabolic scaling of ectothermic vertebrates
# Authors: Guillermo García-Gómez (guillegar.gz@gmail.com) & Matthew Spencer
# Date: 07112023
# Operating System: Windows 10 Pro 21H2
# ------------------------------------------------------------------------------
# Cite as:
# García-Gómez, G., Hirst, A.G., Spencer, M., & Atkinson, D. (2024).
# Responses of intraspecific metabolic scaling to temperature and activity differ between water- and air-breathing ectothermic vertebrates.
# OSF (DOI 10.17605/OSF.IO/N6M5J)
# ------------------------------------------------------------------------------
rm(list=ls())# clear the work environment
today <- format(Sys.Date(),"%Y%m%d")# setting the date
# ------------------------------------------------------------------------------
# The working directory needs to be set to the project directory
getwd()# to check
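# If needed, set it explicitly (the path below is a hypothetical placeholder):
# setwd("path/to/project_directory")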
#
# ------------------------------------------------------------------------------
# Packages ####
#
# Bayesian analysis
library(rstan)
library(rstantools)
library(brms)
library(bayestestR)
library(loo)
# visual-checking model performance
library(ggmcmc)
library(bayesplot)
library(tidybayes)
# phylogenetic tree
library(rotl)
library(ape)
## plotting
library(ggplot2)
library(ggtree)
library(cowplot)
# data sorting
library(dplyr)
# Computes t probabilities, quantiles, random deviates and densities
library(mvtnorm)
# call functions to check model performance
source("temperaturefuncs.R")
fitt <- TRUE # fit model with student-t errors?
fitgaussian <- FALSE # fit model with Gaussian errors?
fittsimulated <- FALSE # fit student-t model to data simulated under posterior mean parameters estimated from real data?
tdiagnostics <- FALSE # diagnostics for model with student-t errors
processtsimulated <- FALSE # plot estimates from student-t model fitted to simulated data
simfitname <- "../Model_outputs/simfitt" # base for names in which saved fits to simulated data will be stored
# ------------------------------------------------------------------------------
set.seed(1234)# to replicate the results
# General STAN specifications
rstan::rstan_options(auto_write = TRUE) # write objects to the hard disk to avoid recompilation
options(mc.cores = parallel::detectCores())
#### Load data ####
all_temp <- read.csv("table_S1.csv") # Table S1
# Histograms:
# response variable: slopes b
hist(all_temp$b)
# predictor variable: log10(Metabolic level, L)
hist(log10(all_temp$L))
#### Models for all species (water + air-breathers) ####
# Building the phylogenetic tree for all species (needs internet connection)
all_temp.taxa <- tnrs_match_names(as.character(all_temp$species_phylo)) # match species from dataset to those included in Tree of Life
all_temp.tree <- tol_induced_subtree(ott_ids = ott_id(all_temp.taxa), label = "name") # building a tree
# Check phylogenetic tree
ggtree(all_temp.tree, layout = "fan", open.angle=0, ladderize = TRUE, size=0.75) +
geom_tiplab(size=2.1, hjust = -.1, color='blue')
# group species colour-coded according to respiration mode by selecting nodes in the tree
(tree2 <- groupClade(all_temp.tree, c(113, 157, 115)))
# plot species colour-coded by respiration mode
(p <- ggtree(tree2, aes(color=group), layout = "fan", ladderize = TRUE, size=0.5) +
xlim(0,30) +
scale_color_manual(values=c("cornflowerblue", "cornflowerblue", "chartreuse3"))+
#scale_x_reverse() +
geom_tiplab(size=2) +
theme(legend.position = "none"))
# The tree object contains information on the relationship between species.
# Using this information, we can construct a covariance matrix of species.
all_temp$phylo <- gsub(' ','_',all_temp$species_phylo)# On the tree, names have underscores instead of spaces.
## to compute branch length
all_temp.tree # inferred from taxonomic relatedness, no branch-lengths
is.binary(all_temp.tree) # check for polytomies
all_temp.tree_p <- multi2di(all_temp.tree) # resolve polytomies before calculating branch length
is.binary(all_temp.tree_p)
# plot new tree without polytomies
(all_temp.tree_p_2 <- groupClade(all_temp.tree_p, c(113, 157, 115)))
(p <- ggtree(all_temp.tree_p_2, aes(color=group), layout = "fan", ladderize = TRUE, size=0.5) +
xlim(0,30) +
scale_color_manual(values=c("cornflowerblue", "cornflowerblue", "chartreuse3"))+
#scale_x_reverse() +
geom_tiplab(size=2) +
theme(legend.position = "none"))
# compute branch length
all_temp.tree.bl <- compute.brlen(all_temp.tree_p, method="Grafen", power = 1) # Grafen method used to calculate branch-length from a non-ultrametric tree
# Preliminary analyses showed that different tree transformations (rho) resulted in only slight differences in model performance, so the default is used (rho, 'power' = 1).
# This is what a transformation of rho = 1 looks like:
ggtree(all_temp.tree.bl, layout = "fan", open.angle=0, ladderize = TRUE, size=0.75) +
geom_tiplab(size=2.1, hjust = -.1, color='blue')
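# Optional sketch (not part of the original analysis): alternative Grafen rho values
# could be compared visually before settling on the default rho = 1 used above.
# for(rho in c(0.5, 1, 2)){
#   plot(compute.brlen(all_temp.tree_p, method = "Grafen", power = rho),
#        type = "fan", cex = 0.3, main = paste("Grafen rho =", rho))
# }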
A <- vcv.phylo(all_temp.tree.bl) # calculate covariance matrix from tree
# Explore whether our tree covers all the species we wanted it to include,
# and make sure that the species names in our database match those in the tree.
# We use the following code.
setdiff(all_temp$phylo, rownames(A)) # check species names in dataset match those in A (0 is OK)
setdiff(rownames(A), all_temp$phylo) # check species names in A match those in dataset (0 is OK)
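# Optional defensive check (a sketch, not in the original workflow): stop early if any
# species names disagree, or if A is not a symmetric covariance matrix.
# stopifnot(length(setdiff(all_temp$phylo, rownames(A))) == 0,
#           length(setdiff(rownames(A), all_temp$phylo)) == 0,
#           isSymmetric(A))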
#-------------------------------------------------------------------------------
# Define priors
# mix of weakly informative and informative priors
priors =c(
## Informative priors:
# Intercept:
# Following MLBH (Glazier 2005, 2010), slope b approaches 2/3 with temperature-increased
# metabolic levels (log10 L = 0 (intercept), i.e. L of 1 mg O2/g/h, which is high):
prior(normal(0.67, 0.5), "Intercept"),
# Coefficient of log-transformed Metabolic level (L):
# Following empirical estimate of slope b vs. L between species
# at different temperatures from Killen et al. (2010) in Eco. Lett. (estimate = -0.145):
prior(normal(-0.1, 0.5), "b", coef = "log10_L"),
## weakly informative priors
prior(normal(0, 1), "b"),
prior(student_t(3, 0, 2.5), "sd"),
prior(student_t(3, 0, 2.5), "sigma")
)
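# Optional sanity check (a sketch): list the prior structure brms expects for this
# formula, to confirm the classes and coefficient names targeted above (e.g. "log10_L").
# get_prior(b ~ log10_L * group + (log10_L|experiment) + (1|gr(phylo, cov = A)),
#           data = all_temp, data2 = list(A = A), family = student())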
#-------------------------------------------------------------------------------
#### Model fitting ####
if(fitt){
# model
m_all.temp_complex.info <- brm(
b ~ log10_L * group + (log10_L|experiment) + (1|gr(phylo, cov = A)),
data = all_temp,
family = student(),
control = list(adapt_delta = 0.999, max_treedepth = 15),
data2 = list(A = A),
prior = priors,
sample_prior = TRUE,
save_pars = save_pars(all = TRUE),
iter = 7000, warmup = 3000, chains = 4, cores = 4)
# total running time: 30 min # Bulk_ESS seems fine for all parameters (i.e. > 1000)
# Model summary ####
summary(m_all.temp_complex.info, prob = 0.95) %>% print(digits = 3)
# Family: student
# Links: mu = identity; sigma = identity; nu = identity
# Formula: b ~ log10_L * group + (log10_L | experiment) + (1 | gr(phylo, cov = A))
# Data: all_temp (Number of observations: 523)
# Draws: 4 chains, each with iter = 7000; warmup = 3000; thin = 1;
# total post-warmup draws = 16000
# Group-Level Effects:
# ~experiment (Number of levels: 149)
# Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
# sd(Intercept) 0.122 0.017 0.091 0.157 1.001 2866 5990
# sd(log10_L) 0.097 0.018 0.063 0.135 1.004 1230 2563
# cor(Intercept,log10_L) 0.422 0.184 -0.005 0.707 1.003 1540 3580
# ~phylo (Number of levels: 112)
# Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
# sd(Intercept) 0.068 0.041 0.004 0.157 1.005 1439 4266
# Population-Level Effects:
# Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
# Intercept 0.726 0.070 0.581 0.875 1.002 11715 8065
# log10_L -0.002 0.021 -0.043 0.039 1.000 7300 9856
# groupwaterMbreather -0.027 0.087 -0.206 0.155 1.002 10759 8228
# log10_L:groupwaterMbreather -0.090 0.029 -0.147 -0.035 1.000 6741 8839
# Family Specific Parameters:
# Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
# sigma 0.031 0.004 0.024 0.039 1.001 2976 6283
# nu 1.783 0.265 1.339 2.375 1.000 5800 9221
# create a folder named 'Model_outputs', and save the model (so it doesn't need to be rerun for every session) ####
# note: saved models are too large to be pushed in git standard repositories
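# Sketch: the folder could also be created programmatically if it does not exist yet.
# if(!dir.exists("../Model_outputs")) dir.create("../Model_outputs", recursive = TRUE)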
saveRDS(m_all.temp_complex.info,"../Model_outputs/m_all.temp.rds")
m_all.temp <- readRDS("../Model_outputs/m_all.temp.rds") # reload the saved model under a shorter name
summary(m_all.temp, prob = 0.95) %>% print(digits = 3)
# write stan code for this model:
stancode(m_all.temp)
# Posterior predictive checks ####
plot(m_all.temp, N = 5, ask = FALSE) # check convergence of chains / looks fine (note sd_phylo_intercept)
pp_check(m_all.temp, type="scatter_avg", ndraws = 100) # some outliers at the x end
pp_check(m_all.temp, type = "loo_pit_overlay", ndraws = NULL) # looks fine
pp_check(m_all.temp, type = "dens_overlay", ndraws = 99) # looks fine
plot(conditional_effects(m_all.temp), points = TRUE) # see data from the same experiment (Fig. 2)
## LOO
loo_model_all_temp <- loo(m_all.temp, save_psis = TRUE)
loo_model_all_temp # 3 problematic observations (0.6% of total data) based on pareto-k-diagnostic (k > 0.7)
plot(loo_model_all_temp) # plot of the pareto-k values
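# Sketch: indices of any high-k observations can be extracted for inspection
# (pareto_k_ids() is provided by the loo package).
# highk <- pareto_k_ids(loo_model_all_temp, threshold = 0.7)
# all_temp[highk, ]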
## RELOO without problematic observations, using one more chain:
# (takes about 40 minutes to run, and results are almost identical)
# This is a necessary step to calculate Monte Carlo SE of elpd_loo
reloo_model_all_temp <- reloo(loo_model_all_temp, m_all.temp, chains = 1)
reloo_model_all_temp # negligible improvement in LOOIC (-1172.0 vs. -1172.3).
# Computed from 16000 by 523 log-likelihood matrix
# Estimate SE
# elpd_loo 586.1 27.7
# p_loo 290.5 13.1
# looic -1172.3 55.3
# ------
# Monte Carlo SE of elpd_loo is 0.3.
# Pareto k diagnostic values:
# Count Pct. Min. n_eff
# (-Inf, 0.5] (good) 507 96.9% 259
# (0.5, 0.7] (ok) 16 3.1% 378
# (0.7, 1] (bad) 0 0.0% <NA>
# (1, Inf) (very bad) 0 0.0% <NA>
# All Pareto k estimates are ok (k < 0.7).
# See help('pareto-k-diagnostic') for details.
# Calculate phylogenetic signal (equivalent lambda):
hyp <- "sd_phylo__Intercept^2 / (sd_phylo__Intercept^2 + sigma^2) = 0"
(hyp_temp <- hypothesis(m_all.temp, hyp, class = NULL) %>% print(digits = 3))
# Hypothesis Tests for class :
# Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
# 1 (sd_phylo__Interc... = 0 0.696 0.278 0.014 0.965 0.331 0.249 *
# ---
# 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
# '*': For one-sided hypotheses, the posterior probability exceeds 95%;
# for two-sided hypotheses, the value tested against lies outside the 95%-CI.
# Posterior probabilities of point hypotheses assume equal prior probabilities.
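# Alternative sketch (not the approach used here): the full posterior of lambda can be
# computed directly from the draws, using brms/posterior column naming conventions.
# draws <- as_draws_df(m_all.temp, variable = c("sd_phylo__Intercept", "sigma"))
# lambda <- draws$sd_phylo__Intercept^2 / (draws$sd_phylo__Intercept^2 + draws$sigma^2)
# quantile(lambda, probs = c(0.025, 0.5, 0.975))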
}
if(fittsimulated){
m_all.temp <- readRDS("../Model_outputs/m_all.temp.rds")
trueparms <- as.list(posterior_summary(m_all.temp, variable = c("b_Intercept", "b_log10_L", "b_groupwaterMbreather", "b_log10_L:groupwaterMbreather", "sigma", "nu", "sd_experiment__Intercept", "sd_experiment__log10_L", "cor_experiment__Intercept__log10_L", "sd_phylo__Intercept"))[, 1])
iter <- 10 #number of simulated data sets to generate
simb <- array(dim = c(dim(all_temp)[1], iter))
for(i in 1:iter){
simdata <- simulatetdata(realdata = all_temp, trueparms = trueparms, A = A)
simb[, i] <- simdata$b
simfit <- brm(
b ~ log10_L * group + (log10_L|experiment) + (1|gr(phylo, cov = A)),
data = simdata, # fit to the simulated data set generated above (not the real data)
family = student(),
control = list(adapt_delta = 0.999, max_treedepth = 15),
data2 = list(A = A),
prior = priors,
sample_prior = TRUE,
save_pars = save_pars(all = TRUE),
iter = 7000, warmup = 3000, chains = 4, cores = 4) # SAME NUMBER OF ITERATIONS AS THE REAL MODEL
saveRDS(simfit, paste(simfitname, "_", i, ".rds", sep = ""))
}
matplot(all_temp$b, simb, xlab = "real b", ylab = "simulated b", pch = 16) #THIS IS JUST FOR TESTING: ARE WE GENERATING PLAUSIBLE VALUES?
abline(h = c(2/3, 1), lty = 2, lwd = 3) # typical boundaries of b under the MLBH
abline(h = c(0.2, 1.6), lty = 2, lwd = 2, col = "red") # minimum and maximum value from real data
}
if(processtsimulated){
fnames <- list.files(path = "../Model_outputs/", pattern = "simfitt_")
m_all.temp <- readRDS("../Model_outputs/m_all.temp.rds")
trueparms <- as.list(posterior_summary(m_all.temp, variable = c("b_Intercept", "b_log10_L", "b_groupwaterMbreather", "b_log10_L:groupwaterMbreather", "sigma", "nu", "sd_experiment__Intercept", "sd_experiment__log10_L", "cor_experiment__Intercept__log10_L", "sd_phylo__Intercept"))[, 1])
plotsimulated(path = "../Model_outputs/", fnames = fnames, trueparms = trueparms)
dev.off()
}
if(tdiagnostics){
m_all.temp <- readRDS(file = "../Model_outputs/m_all.temp.rds")
postnu <- as_draws_matrix(m_all.temp, variable = "nu")
priornu <- rgamma(n = dim(postnu)[1], shape = 2, rate = 0.1) #sample from gamma distribution with parameters as in Stan code
priornu <- priornu[priornu > 1] #truncate as in Stan code
plot(density(postnu), xlab = expression(nu), ylab = "density", main = "", lwd = 2, xlim = range(priornu))
lines(density(priornu), lwd = 2, lty = "dashed") #we can see that there is information on nu in the data: posterior is much more concentrated than prior
legend("topright", bty = "n", lty = c("solid", "dashed"), lwd = c(2, 2), legend = c("posterior", "prior"))
postsigma <- as_draws_matrix(m_all.temp, variable = "sigma")
priorsigma <- abs(rt(n = dim(postsigma)[1], df = 3) * 2.5 + 0) #sample from half non-standardized t-distribution with parameters as in Stan code
plot(density(postsigma), xlab = expression(sigma), ylab = "density", main = "", lwd = 2, xlim = range(priorsigma))
lines(density(priorsigma), lwd = 2, lty = "dashed") #can see there is information on sigma in the data: posterior much more concentrated than prior
legend("topright", bty = "n", lty = c("solid", "dashed"), lwd = c(2, 2), legend = c("posterior", "prior"))
resid <- residuals(m_all.temp) #these are summarized: column 1 is estimated residual
qqplot(x = qt(ppoints(dim(resid)[1]), df = mean(postnu)), y = resid[, 1], xlab = "theoretical", ylab = "observed")
qqline(y = resid[, 1], distribution = function(p) qt(p, df = mean(postnu)), probs = c(0.25, 0.75))
par(mfrow = c(4, 4))
resid <- residuals(m_all.temp, summary = FALSE) #one row for each draw from posterior
for(i in 1:16){
qqplot(x = qt(ppoints(dim(resid)[2]), df = mean(postnu)), y = resid[i, ], xlab = "theoretical", ylab = "observed")
qqline(y = resid[i, ], distribution = function(p) qt(p, df = mean(postnu)), probs = c(0.25, 0.75))
}
}
if(fitgaussian){
# Student-t distribution vs. Gaussian distribution ####
# Check that model performance is better using a Student-t distribution than a Gaussian distribution:
m_all.temp <- readRDS("../Model_outputs/m_all.temp.rds") # load the saved student-t model (needed if the fitting block above was not run in this session)
m_all.temp_gaussian <- update(m_all.temp, family = gaussian())
# total running time: 20 min # Bulk_ESS are fine (i.e.> 1000)
# save model (so that they don't need to be rerun for every session)
# saveRDS(m_all.temp_gaussian,"../Model_outputs/m_all.temp_gaussian.rds")
# m_all.temp_gaussian <- readRDS("../Model_outputs/m_all.temp_gaussian.rds") # call saved model
# compare models through LOOIC
loo(m_all.temp, m_all.temp_gaussian)
# Model comparisons:
# elpd_diff se_diff
# m_all.temp_complex.info 0.0 0.0
# m_all.temp_complex_gaussian -96.4 18.7
}
# References:
# Glazier, D.S. (2005). Beyond the ‘3/4-power law’: variation in the intra- and interspecific scaling of metabolic rate in animals. Biol. Rev., 80, 611-662.
# Glazier, D.S. (2010). A unifying explanation for diverse metabolic scaling in animals and plants. Biol. Rev., 85, 111-138.
# Killen, S.S., Atkinson, D. & Glazier, D.S. (2010). The intraspecific scaling of metabolic rate with body mass in fishes depends on lifestyle and temperature. Ecol. Lett., 13, 184-193.
#-------------------------------------------------------------------------------
# Save data on the R session and package versions for reproducibility's sake ####
sink("../R_session/Bayesian_models_temp_R_session.txt") # a folder named 'R_session' must be created first
sessionInfo()
sink()
################################################################################
############################ END OF SCRIPT #####################################
################################################################################