From 9c56201ee92df58d7b1ec02585e7f313ad9ae713 Mon Sep 17 00:00:00 2001
From: sktron <simo.kitanovski@uni-due.de>
Date: Mon, 25 Mar 2024 15:33:53 +0100
Subject: [PATCH] updated vignette to include data on analysis with replicates

---
 DESCRIPTION               |   3 +-
 NEWS                      |   3 +-
 R/dgu.R                   |  65 ++++++++++++-----------
 R/loo.R                   |   9 ++--
 R/utils_input.R           |  21 +++++++-
 R/utils_usage.R           |  29 +++++++++-
 man/d_zibb_1.Rd           |   2 +-
 man/d_zibb_2.Rd           |   2 +-
 man/d_zibb_3.Rd           |   2 +-
 vignettes/User_Manual.Rmd | 109 ++++++++++++++++++++++++++++++++++++--
 10 files changed, 197 insertions(+), 48 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a7d4f5d..d85822c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -33,7 +33,8 @@ Suggests:
     ggplot2,
     ggforce,
     gridExtra,
-    ggrepel
+    ggrepel,
+    patchwork
 LinkingTo: 
     BH (>= 1.66.0),
     Rcpp (>= 0.12.0),
diff --git a/NEWS b/NEWS
index 5bfab61..166b146 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,6 @@
 Changes in version 1.17
-+ model explanded for designs with biological replicates 
++ model expanded for designs with biological replicates 
++ model expanded for paired sample designs (also with replicates)
 
 Changes in version 1.15
 + model for gene usage (GU) analysis
diff --git a/R/dgu.R b/R/dgu.R
index 47c9a9f..ef08d7e 100644
--- a/R/dgu.R
+++ b/R/dgu.R
@@ -1,20 +1,22 @@
 
 DGU <- function(ud,
-               mcmc_warmup = 500,
-               mcmc_steps = 1500,
-               mcmc_chains = 4,
-               mcmc_cores = 1,
-               hdi_lvl = 0.95,
-               adapt_delta = 0.95,
-               max_treedepth = 12) {
+                mcmc_warmup = 500,
+                mcmc_steps = 1500,
+                mcmc_chains = 4,
+                mcmc_cores = 1,
+                hdi_lvl = 0.95,
+                adapt_delta = 0.95,
+                max_treedepth = 12,
+                paired = FALSE) {
   
   # check inputs
   check_dgu_input(ud = ud,
-                  mcmc_chains = base::as.integer(x = mcmc_chains),
-                  mcmc_cores = base::as.integer(x = mcmc_cores),
-                  mcmc_steps = base::as.integer(x = mcmc_steps),
-                  mcmc_warmup = base::as.integer(x = mcmc_warmup),
-                  hdi_lvl = hdi_lvl)
+                  mcmc_chains = as.integer(x = mcmc_chains),
+                  mcmc_cores = as.integer(x = mcmc_cores),
+                  mcmc_steps = as.integer(x = mcmc_steps),
+                  mcmc_warmup = as.integer(x = mcmc_warmup),
+                  hdi_lvl = hdi_lvl,
+                  paired = paired)
   
   udr <- ud
   ud <- get_usage(u = udr)
@@ -25,19 +27,20 @@ DGU <- function(ud,
   
   # get model
   m <- get_model(has_conditions = ud$has_conditions, 
-                 has_replicates = ud$has_replicates)
+                 has_replicates = ud$has_replicates,
+                 paired = paired)
   
   # fit model
-  glm <- rstan::sampling(object = m$model,
-                         data = ud,
-                         chains = mcmc_chains,
-                         cores = mcmc_cores,
-                         iter = mcmc_steps,
-                         warmup = mcmc_warmup,
-                         algorithm = "NUTS",
-                         control = control_list,
-                         pars = m$pars,
-                         refresh = 50)
+  glm <- sampling(object = m$model,
+                  data = ud,
+                  chains = mcmc_chains,
+                  cores = mcmc_cores,
+                  iter = mcmc_steps,
+                  warmup = mcmc_warmup,
+                  algorithm = "NUTS",
+                  control = control_list,
+                  pars = m$pars,
+                  refresh = 50)
   
   message("Computing summaries ... \n")
   gu <- get_condition_prop(glm = glm, hdi_lvl = hdi_lvl, ud = ud, 
@@ -47,7 +50,7 @@ DGU <- function(ud,
   dgu_prob <- get_dgu_prob(glm = glm, hdi_lvl = hdi_lvl, ud = ud, 
                            model_type = m$model_type)
   theta <- get_sample_prop_gu(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
-
+  
   
   # ppc
   message("Computing posterior predictions ... \n")
@@ -56,11 +59,11 @@ DGU <- function(ud,
     ppc_condition = get_ppc_condition(glm = glm, ud = ud, hdi_lvl = hdi_lvl))
   
   # result pack
-  return (list(dgu = dgu,
-               dgu_prob = dgu_prob,
-               gu = gu,
-               theta = theta,
-               ppc = ppc,
-               ud = ud,
-               fit = glm))
+  return(list(dgu = dgu,
+              dgu_prob = dgu_prob,
+              gu = gu,
+              theta = theta,
+              ppc = ppc,
+              ud = ud,
+              fit = glm))
 }
diff --git a/R/loo.R b/R/loo.R
index d3d1083..3e26ea4 100644
--- a/R/loo.R
+++ b/R/loo.R
@@ -8,7 +8,8 @@ LOO <- function(ud,
                 mcmc_cores = 1,
                 hdi_lvl = 0.95,
                 adapt_delta = 0.95,
-                max_treedepth = 12) {
+                max_treedepth = 12,
+                paired = FALSE) {
   
   # check inputs
   check_dgu_input(ud = ud,
@@ -16,7 +17,8 @@ LOO <- function(ud,
                   mcmc_cores = as.integer(x = mcmc_cores),
                   mcmc_steps = as.integer(x = mcmc_steps),
                   mcmc_warmup = as.integer(x = mcmc_warmup),
-                  hdi_lvl = hdi_lvl)
+                  hdi_lvl = hdi_lvl,
+                  paired = paired)
   
   # process data
   udp <- get_usage(u = ud)
@@ -57,7 +59,8 @@ LOO <- function(ud,
                mcmc_cores = mcmc_cores,
                hdi_lvl = hdi_lvl,
                adapt_delta = adapt_delta,
-               max_treedepth = max_treedepth)
+               max_treedepth = max_treedepth,
+               paired = paired)
     
     if(is.data.frame(out$gu)==TRUE) {
       out$gu$loo_id <- rs[r]
diff --git a/R/utils_input.R b/R/utils_input.R
index 34fd27f..dd5a1e1 100644
--- a/R/utils_input.R
+++ b/R/utils_input.R
@@ -8,14 +8,16 @@ check_dgu_input <- function(ud,
                             mcmc_cores,
                             mcmc_steps,
                             mcmc_warmup,
-                            hdi_lvl) {
+                            hdi_lvl,
+                            paired) {
   
   if(missing(ud) || is.null(ud) ||
      missing(mcmc_chains) || is.null(mcmc_chains) ||
      missing(mcmc_steps) || is.null(mcmc_steps) ||
      missing(mcmc_warmup) || is.null(mcmc_warmup) ||
      missing(mcmc_cores) || is.null(mcmc_cores) ||
-     missing(hdi_lvl) || is.null(hdi_lvl)) {
+     missing(hdi_lvl) || is.null(hdi_lvl) ||
+     missing(paired) || is.null(paired)) {
     stop("arguments must be specified")
   }
   
@@ -25,6 +27,7 @@ check_dgu_input <- function(ud,
   check_mcmc_chains(mcmc_chains = mcmc_chains)
   check_mcmc_cores(mcmc_cores = mcmc_cores)
   check_hdi(hdi_lvl = hdi_lvl)
+  check_paired(paired = paired)
 }
 
 
@@ -168,3 +171,17 @@ check_ud <- function(ud) {
     }
   }
 }
+
+
+
+# Description:
+# paired input check
+check_paired <- function(paired) {
+  if(length(paired) != 1) {
+    stop('paired must be a logical (TRUE or FALSE)')
+  }
+  
+  if(is.logical(paired) == FALSE) {
+    stop('paired must be a logical (TRUE or FALSE)')
+  }
+}
\ No newline at end of file
diff --git a/R/utils_usage.R b/R/utils_usage.R
index e9e002a..dd19102 100644
--- a/R/utils_usage.R
+++ b/R/utils_usage.R
@@ -93,8 +93,26 @@ get_usage <- function(u) {
               has_conditions = max(condition_ids)>1))
 }
 
-get_model <- function(has_replicates, has_conditions, debug = FALSE) {
+
+
+# Description:
+# checks for structural problems in the data, and mismatch with given inputs
+check_ud_content <- function(ud, paired) {
+  has_conditions <- ud$has_conditions
+  has_replicates <- ud$has_replicates
+  ud <- M$ud$proc_ud
+  
+  table(ud$sample_id)
+  diag(table(ud$sample_id, ud$individual_id))
+  identical(diag(table(ud$sample_id, ud$individual_id)))
+}
+
+
+
+
+get_model <- function(has_replicates, has_conditions, paired, debug = FALSE) {
   model_type <- ifelse(test = has_conditions, yes = "DGU", no = "GU")
+  
   if(model_type == "GU") {
     if(has_replicates) {
       if(debug) {
@@ -136,6 +154,9 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
                 "log_lik", "dgu", "dgu_prob", "theta")
       model_name <- "DGU_rep"
+      if(paired) {
+        model_name <- "DGU_rep_paired"
+      }
     } 
     else {
       if(debug) {
@@ -149,6 +170,9 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
                 "log_lik", "dgu", "dgu_prob", "theta")
       model_name <- "DGU"
+      if(paired) {
+        model_name <- "DGU_paired"
+      }
     }
   }
   
@@ -157,5 +181,6 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
               model_type = model_type,
               pars = pars,
               has_replicates = has_replicates,
-              has_conditions = has_conditions))
+              has_conditions = has_conditions,
+              paired = paired))
 }
diff --git a/man/d_zibb_1.Rd b/man/d_zibb_1.Rd
index bb4a121..72825d2 100644
--- a/man/d_zibb_1.Rd
+++ b/man/d_zibb_1.Rd
@@ -8,7 +8,7 @@ A small example dataset that has the following features:
 
   \itemize{
     \item 1 conditions
-    \item 5 replicates (samples) per condition
+    \item 5 samples per condition
     \item 15 Ig genes
   }
 This dataset was simulated from zero-inflated beta-binomial (ZIBB) 
diff --git a/man/d_zibb_2.Rd b/man/d_zibb_2.Rd
index cd515fc..2c732e7 100644
--- a/man/d_zibb_2.Rd
+++ b/man/d_zibb_2.Rd
@@ -8,7 +8,7 @@ A small example dataset that has the following features:
 
   \itemize{
     \item 2 conditions
-    \item 5 replicates (samples) per condition
+    \item 5 samples per condition
     \item 15 Ig genes
   }
 This dataset was simulated from zero-inflated beta-binomial (ZIBB) 
diff --git a/man/d_zibb_3.Rd b/man/d_zibb_3.Rd
index 6bce8bc..2b32401 100644
--- a/man/d_zibb_3.Rd
+++ b/man/d_zibb_3.Rd
@@ -8,7 +8,7 @@ A small example dataset that has the following features:
 
   \itemize{
     \item 3 conditions
-    \item 5 replicates (samples) per condition
+    \item 5 samples per condition
     \item 15 Ig genes
   }
 This dataset was simulated from zero-inflated beta-binomial (ZIBB) 
diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd
index 39261b7..5bf03b4 100644
--- a/vignettes/User_Manual.Rmd
+++ b/vignettes/User_Manual.Rmd
@@ -28,6 +28,7 @@ require(ggforce)
 require(gridExtra)
 require(ggrepel)
 require(reshape2)
+require(patchwork)
 ```
 
 
@@ -87,7 +88,7 @@ with $\pi = 1$ for genes with strong differential usage, and $\pi = 0$ for
 genes with negligible differential gene usage. Both metrics are computed based
 on the posterior distribution of $\gamma$, and are thus related. 
 
-# Case Study A
+# Case Study A: analyzing data with two biological conditions
 `r Biocpkg("IgGeneUsage")` has a couple of built-in Ig gene usage datasets. 
 Some were obtained from studies and others were simulated.
 
@@ -146,12 +147,17 @@ M <- DGU(ud = d_zibb_2, # input data
 The following objects are provided as part of the output of DGU:
 
   * `dgu` (main results of `r Biocpkg("IgGeneUsage")`): quantitative 
-     DGU summary 
+     DGU summary [on a log-scale]
+  * `dgu_prob` (main results of `r Biocpkg("IgGeneUsage")`): quantitative 
+     DGU summary [on a probability-scale]
   * `gu`: quantitative summary of the gene usage (GU) of each gene in 
-     the different conditions
-  * `glm`: rstan ('stanfit') object of the fitted model $\rightarrow$ used for
-     model checks (see section 'Model checking')
+     the different conditions [on a probability-scale]
+  * `theta`: summary of the theta parameter marginals, i.e. the inferred 
+     probabilities of gene usage
   * `ppc`: posterior predictive checks data (see section 'Model checking')
+  * `ud`: processed Ig gene usage data 
+  * `fit`: rstan ('stanfit') object of the fitted model $\rightarrow$ used 
+     for model checks (see section 'Model checking')
 
 ```{r}
 summary(M)
@@ -449,6 +455,99 @@ plot(hclust(dist(x, method = "euclidean"), method = "average"))
 ```
 
 
+# Case Study B: analyzing data with biological replicates
+
+```{r}
+data("d_zibb_4", package = "IgGeneUsage")
+knitr::kable(head(d_zibb_4))
+```
+
+We can also visualize `d_zibb_4` with `r CRANpkg("ggplot")`:
+
+```{r, fig.width=6, fig.height=3.25}
+ggplot(data = d_zibb_4)+
+  geom_point(aes(x = gene_name, y = gene_usage_count, col = condition),
+             position = position_dodge(width = .7), shape = 21)+
+  theme_bw(base_size = 11)+
+  ylab(label = "Gene usage [count]")+
+  xlab(label = '')+
+  theme(legend.position = "top")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
+```
+
+## Modeling
+
+```{r}
+M <- DGU(ud = d_zibb_4, # input data
+         mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500)
+         mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500)
+         mcmc_chains = 3, # how many MCMC chain to run (default: 4)
+         mcmc_cores = 1, # how many PC cores to use? (e.g. parallel chains)
+         hdi_lvl = 0.95, # highest density interval level (de fault: 0.95)
+         adapt_delta = 0.8, # MCMC target acceptance rate (default: 0.95)
+         max_treedepth = 10) # tree depth evaluated at each step (default: 12)
+```
+
+
+## Posterior predictive checks
+
+```{r, fig.height = 6, fig.width = 6}
+ggplot(data = M$ppc$ppc_rep)+
+  facet_wrap(facets = ~individual_id, ncol = 3)+
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+
+  geom_errorbar(aes(x = observed_count, y = ppc_mean_count, 
+                    ymin = ppc_L_count, ymax = ppc_H_count), col = "darkgray")+
+  geom_point(aes(x = observed_count, y = ppc_mean_count), size = 1)+
+  theme_bw(base_size = 11)+
+  theme(legend.position = "top")+
+  xlab(label = "Observed usage [counts]")+
+  ylab(label = "PPC usage [counts]")
+```
+
+## Analysis of estimated effect sizes
+The top panel shows the average gene usage (GU) in different biological 
+conditions. The bottom panels shows the differential gene usage (DGU) 
+between pairs of biological conditions.
+
+```{r, fig.weight = 7, fig.height = 4}
+g1 <- ggplot(data = M$gu)+
+  geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L,
+                    ymax = prob_H, col = condition),
+                width = 0.1, position = position_dodge(width = 0.4))+
+  geom_point(aes(x = gene_name, y = prob_mean, col = condition), size = 1,
+             position = position_dodge(width = 0.4))+
+  theme_bw(base_size = 11)+
+  theme(legend.position = "top")+
+  ylab(label = "GU [probability]")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
+
+
+stats <- M$dgu
+stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ]
+stats$gene_fac <- factor(x = stats$gene_name, levels = unique(stats$gene_name))
+
+g2 <- ggplot(data = stats)+
+  facet_wrap(facets = ~contrast)+
+  geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+
+  geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), 
+                col = "darkgray")+
+  geom_point(aes(x = pmax, y = es_mean, col = contrast))+
+  geom_text_repel(data = stats[stats$pmax >= 0.9, ],
+                  aes(x = pmax, y = es_mean, label = gene_fac),
+                  min.segment.length = 0, size = 2.75)+
+  theme_bw(base_size = 11)+
+  theme(legend.position = "top")+
+  xlab(label = expression(pi))+
+  xlim(c(0, 1))+
+  ylab(expression(gamma))
+```
+
+
+```{r, fig.weight = 5, fig.height = 6}
+(g1/g2)
+```
+
+
 # Session
 
 ```{r}