Reorganizing to reflect updated chlorophyll analyses for the NDFS syn…

…thesis manuscript
InteragencyEcologicalProgram · Nov 21, 2023 · f387ffb · f387ffb
1 parent 566832e
commit f387ffb
Show file tree

Hide file tree

Showing 9 changed files with 751 additions and 2,398 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -6,7 +6,8 @@ layout: default
 
 ### Exploratory Analyses
 
-* [Models using Categorical Predictors](rtm_chl_gam_analysis_categorical.html)
+* [GAM Model using Categorical Predictors - Initial analysis](rtm_chl_gam_categorical_initial.html)
+* [Models using Categorical Predictors - Revised analysis](rtm_chl_models_categorical_revised.html)
 * [Models using Flow as a Continuous Predictor](rtm_chl_models_flow.html)
 * [Explore relationship between continuous chlorophyll and percent flow pulse water](rtm_chl_analysis_perc_flow_pulse.html)
 

diff --git a/docs/rtm_chl_analysis_final_model_results.html b/docs/rtm_chl_analysis_final_model_results.html
diff --git a/docs/rtm_chl_gam_analysis_categorical.html → docs/rtm_chl_gam_categorical_initial.html b/docs/rtm_chl_gam_analysis_categorical.html → docs/rtm_chl_gam_categorical_initial.html
diff --git a/docs/rtm_chl_analysis_model_selection.html → docs/rtm_chl_models_categorical_revised.html b/docs/rtm_chl_analysis_model_selection.html → docs/rtm_chl_models_categorical_revised.html
diff --git a/manuscript_synthesis/model/chla_cat3_lm_model.rds b/manuscript_synthesis/model/chla_cat3_lm_model.rds
diff --git a/manuscript_synthesis/model/chla_model_data.rds b/manuscript_synthesis/model/chla_model_data.rds
diff --git a/manuscript_synthesis/notebooks/rtm_chl_analysis_final_model_results.Rmd b/manuscript_synthesis/notebooks/rtm_chl_analysis_final_model_results.Rmd
diff --git a/...ooks/rtm_chl_gam_analysis_categorical.Rmd → ...books/rtm_chl_gam_categorical_initial.Rmd b/...ooks/rtm_chl_gam_analysis_categorical.Rmd → ...books/rtm_chl_gam_categorical_initial.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "NDFS Synthesis Manuscript: Chlorophyll analysis"
-subtitle: "GAM model using categorical predictors"
+subtitle: "GAM model using categorical predictors - Initial"
 author: "Dave Bosworth"
 date: '`r format(Sys.Date(), "%B %d, %Y")`'
 output: 

diff --git a/...ooks/rtm_chl_analysis_model_selection.Rmd → ...ks/rtm_chl_models_categorical_revised.Rmd b/...ooks/rtm_chl_analysis_model_selection.Rmd → ...ks/rtm_chl_models_categorical_revised.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "NDFS Synthesis Manuscript: Chlorophyll analysis"
-subtitle: "Model selection"
+subtitle: "Models using categorical predictors - Revised Analysis"
 author: "Dave Bosworth"
 date: '`r format(Sys.Date(), "%B %d, %Y")`'
 output: 
@@ -42,14 +42,16 @@ library(car)
 library(gratia)
 library(rlang)
 library(patchwork)
+library(emmeans)
+library(multcomp)
 library(here)
 library(conflicted)
 
 # Source functions
 source(here("manuscript_synthesis/src/global_functions.R"))
 
 # Declare package conflict preferences 
-conflicts_prefer(dplyr::filter(), dplyr::lag())
+conflicts_prefer(dplyr::filter(), dplyr::lag(), dplyr::select())
 ```
 
 Display current versions of R and packages used for this analysis:
@@ -58,10 +60,10 @@ Display current versions of R and packages used for this analysis:
 devtools::session_info()
 ```
 
-Create function to plot linear model diagnostics:
+Create functions for document:
 
 ```{r global funcs}
-# Create model diagnostic plots to check assumptions
+# Create diagnostic plots for linear models to check assumptions
 plot_lm_diag <- function(df_data, param_var, unit_label, model, trans = c("none", "log", "sqrt"), ...) {
   trans <- match.arg(trans)
   
@@ -118,6 +120,75 @@ plot_lm_diag <- function(df_data, param_var, unit_label, model, trans = c("none"
   
   (plt_hist + plt_qq) / (plt_res_fit + plt_obs_fit)
 }
+
+plot_model_summary <- function(df_data, em_tuk_obj) {
+  # Calculate min and max values of observed data for each Station - Year group to
+    # determine vertical positioning of letters for figure
+  df_data_summ <- df_data %>% 
+    summarize(
+      max_val = max(Chla),
+      min_val = min(Chla),
+      .by = c(StationCode, Year)
+    )
+
+  # Add significance grouping letters to the Tukey post-hoc results
+  df_tuk <- em_tuk_obj %>% 
+    cld(sort = FALSE, Letters = letters) %>% 
+    as_tibble() %>% 
+    mutate(
+      group = str_remove_all(.group, fixed(" ")),
+      # back transform log-transformed results
+      across(c(emmean, lower.CL, upper.CL), ~ exp(.x) / 1000),
+      Year = as.numeric(as.character(Year_fct))
+    ) %>% 
+    # Add min and max values of observed data to the Tukey post-hoc results and
+      # calculate vertical positioning of letters
+    left_join(df_data_summ, by = join_by(StationCode, Year)) %>% 
+    mutate(max_val = if_else(upper.CL > max_val, upper.CL, max_val)) %>% 
+    group_by(StationCode, Year) %>% 
+    mutate(max_val = max(max_val)) %>% 
+    ungroup() %>% 
+    mutate(y_pos = max_val + (max_val - min_val) / 10) %>% 
+    select(
+      StationCode,
+      Year,
+      FlowActionPeriod,
+      emmean,
+      lower.CL,
+      upper.CL,
+      group,
+      y_pos
+    )
+  
+  # Create summary figure
+  df_tuk %>%
+    ggplot(
+      aes(
+        x = FlowActionPeriod,
+        y = emmean,
+        ymin = lower.CL,
+        ymax = upper.CL
+      )
+    ) +
+    geom_boxplot(
+      data = df_data,
+      aes(x = FlowActionPeriod, y = Chla),
+      inherit.aes = FALSE
+    ) +
+    geom_crossbar(color = "grey82", fill = "grey", alpha = 0.7, linewidth = 0.1) +
+    geom_point(color = "red") +
+    geom_text(aes(y = y_pos, label = group), size = 3) +
+    facet_wrap(
+      vars(StationCode, Year), 
+      scales = "free_y",
+      nrow = 4,
+      labeller = labeller(.multi_line = FALSE)
+    ) +
+    xlab("Flow Pulse Period") +
+    ylab(expression(Chlorophyll~Fluoresence~(mu*g~L^{-1}))) +
+    theme_bw() +
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
+}
 ```
 
 
@@ -911,18 +982,67 @@ df_m_aic_bic %>% arrange(AIC)
 df_m_aic_bic %>% arrange(BIC)
 ```
 
-According to AIC, model 3 (LM 3-way interactions without seasonal term) was the best model, while BIC preferred model 6 (LM 2-way interactions without seasonal term). We'll select model 3 (LM 3-way interactions without seasonal term) because the 3-way interaction between categorical variables was significant in this model indicating that there are possible significant differences between Flow Pulse Periods within each Station-Year grouping. We'll export model 3 and its data to be used for further exploration in other notebooks. 
+According to AIC, model 3 (LM 3-way interactions without seasonal term) was the best model, while BIC preferred model 6 (LM 2-way interactions without seasonal term). We'll select model 3 (LM 3-way interactions without seasonal term) because the 3-way interaction between categorical variables was significant in this model indicating that there are possible significant differences between Flow Pulse Periods within each Station-Year grouping. 
+
+# Model Results
+
+## Model 3
+
+Lets take a closer look at model 3: LM 3-way interactions without seasonal term <br>
+Formula: `r format(m_lm_cat3_lag1$call) %>% str_c(collapse = "") %>% str_squish() %>% str_sub(start = 14, end = -12)`
 
-```{r export models and data}
-# Define file path for model and data used in the model
-fp_model <- here("manuscript_synthesis/model")
+### Pairwise Contrasts
 
-# Export model and data used in the model
-m_lm_cat3_lag1 %>% saveRDS(file.path(fp_model, "chla_cat3_lm_model.rds"))
+Tukey pairwise contrasts of Flow Pulse Period for each Station - Year combination:
 
+```{r lm cat3 lag1 tukey flow action period, warning = FALSE}
+em_lm_cat3_lag1 <- emmeans(m_lm_cat3_lag1, ~ FlowActionPeriod | StationCode * Year_fct)
+pairs(em_lm_cat3_lag1)
+```
+
+### Summary Figure
+
+```{r lm cat3 lag 1 summary figure, fig.width = 8.5, fig.height = 7.5}
 df_chla_wt_lag %>% 
-  select(StationCode, Year, Date, Chla, Chla_log, FlowActionPeriod, lag1) %>% 
-  drop_na(Chla_log, lag1) %>% 
-  saveRDS(file.path(fp_model, "chla_model_data.rds"))
+  drop_na(Chla, lag1) %>% 
+  plot_model_summary(em_lm_cat3_lag1)
 ```
 
+Observed daily average chlorophyll fluorescence values (boxplots) and model results (model means as red points ±95% confidence intervals as gray boxes) for the Flow Pulse Period comparisons by Station and Year. Different letters above boxplots identify statistically significant (p < 0.05) differences from a Tukey post-hoc test.
+
+The model results don't match the observed values well at all. Unfortunately, this linear model is not a good candidate to use for our analysis.
+
+## Model 1
+
+Since the linear model using 3-way interactions doesn't fit the observed data that well, let's take a closer look at the GAM version of this model, model 1: GAM 3-way interactions with s(DOY) <br>
+Formula: `r format(m_gamm_cat3_ar1_gam$formula) %>% str_c(collapse = "") %>% str_squish()`
+
+### Pairwise Contrasts
+
+Tukey pairwise contrasts of Flow Pulse Period for each Station - Year combination:
+
+```{r gam cat3 ar1 tukey flow action period, warning = FALSE}
+# Add the model call back to the gam object so it works with emmeans
+m_gamm_cat3_ar1_gam$call <- quote(
+  gamm(
+    f_gam_cat3, 
+    data = df_chla_wt_c, 
+    correlation = corARMA(form = ~ 1|Year_fct/StationCode, p = 1),
+    method = "REML"
+  )
+)
+
+em_gamm_cat3_ar1 <- emmeans(m_gamm_cat3_ar1_gam, ~ FlowActionPeriod | StationCode * Year_fct)
+pairs(em_gamm_cat3_ar1)
+```
+
+### Summary Figure
+
+```{r gam cat3 ar1 summary figure, fig.width = 8.5, fig.height = 7.5}
+plot_model_summary(df_chla_wt_c, em_gamm_cat3_ar1)
+```
+
+Observed daily average chlorophyll fluorescence values (boxplots) and model results (model means as red points ±95% confidence intervals as gray boxes) for the Flow Pulse Period comparisons by Station and Year. Different letters above boxplots identify statistically significant (p < 0.05) differences from a Tukey post-hoc test.
+
+The model means seem to match the observed values better than Model 3, but there is a large amount of uncertainty around the means resulting in the model not seeing any significant differences among the pairwise comparisons. Unfortunately, this linear model does not seem like a good candidate to use for our analysis as well.
+