From 9c56201ee92df58d7b1ec02585e7f313ad9ae713 Mon Sep 17 00:00:00 2001 From: sktron Date: Mon, 25 Mar 2024 15:33:53 +0100 Subject: [PATCH] updated vignette to include data on analysis with replicates --- DESCRIPTION | 3 +- NEWS | 3 +- R/dgu.R | 65 ++++++++++++----------- R/loo.R | 9 ++-- R/utils_input.R | 21 +++++++- R/utils_usage.R | 29 +++++++++- man/d_zibb_1.Rd | 2 +- man/d_zibb_2.Rd | 2 +- man/d_zibb_3.Rd | 2 +- vignettes/User_Manual.Rmd | 109 ++++++++++++++++++++++++++++++++++++-- 10 files changed, 197 insertions(+), 48 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a7d4f5d..d85822c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,8 @@ Suggests: ggplot2, ggforce, gridExtra, - ggrepel + ggrepel, + patchwork LinkingTo: BH (>= 1.66.0), Rcpp (>= 0.12.0), diff --git a/NEWS b/NEWS index 5bfab61..166b146 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,6 @@ Changes in version 1.17 -+ model explanded for designs with biological replicates ++ model expanded for designs with biological replicates ++ model expanded for paired sample designs (also with replicates) Changes in version 1.15 + model for gene usage (GU) analysis diff --git a/R/dgu.R b/R/dgu.R index 47c9a9f..ef08d7e 100644 --- a/R/dgu.R +++ b/R/dgu.R @@ -1,20 +1,22 @@ DGU <- function(ud, - mcmc_warmup = 500, - mcmc_steps = 1500, - mcmc_chains = 4, - mcmc_cores = 1, - hdi_lvl = 0.95, - adapt_delta = 0.95, - max_treedepth = 12) { + mcmc_warmup = 500, + mcmc_steps = 1500, + mcmc_chains = 4, + mcmc_cores = 1, + hdi_lvl = 0.95, + adapt_delta = 0.95, + max_treedepth = 12, + paired = FALSE) { # check inputs check_dgu_input(ud = ud, - mcmc_chains = base::as.integer(x = mcmc_chains), - mcmc_cores = base::as.integer(x = mcmc_cores), - mcmc_steps = base::as.integer(x = mcmc_steps), - mcmc_warmup = base::as.integer(x = mcmc_warmup), - hdi_lvl = hdi_lvl) + mcmc_chains = as.integer(x = mcmc_chains), + mcmc_cores = as.integer(x = mcmc_cores), + mcmc_steps = as.integer(x = mcmc_steps), + mcmc_warmup = as.integer(x = mcmc_warmup), + hdi_lvl = hdi_lvl, + paired = paired) udr <- ud ud <- get_usage(u = udr) @@ -25,19 +27,20 @@ DGU <- function(ud, # get model m <- get_model(has_conditions = ud$has_conditions, - has_replicates = ud$has_replicates) + has_replicates = ud$has_replicates, + paired = paired) # fit model - glm <- rstan::sampling(object = m$model, - data = ud, - chains = mcmc_chains, - cores = mcmc_cores, - iter = mcmc_steps, - warmup = mcmc_warmup, - algorithm = "NUTS", - control = control_list, - pars = m$pars, - refresh = 50) + glm <- sampling(object = m$model, + data = ud, + chains = mcmc_chains, + cores = mcmc_cores, + iter = mcmc_steps, + warmup = mcmc_warmup, + algorithm = "NUTS", + control = control_list, + pars = m$pars, + refresh = 50) message("Computing summaries ... \n") gu <- get_condition_prop(glm = glm, hdi_lvl = hdi_lvl, ud = ud, @@ -47,7 +50,7 @@ DGU <- function(ud, dgu_prob <- get_dgu_prob(glm = glm, hdi_lvl = hdi_lvl, ud = ud, model_type = m$model_type) theta <- get_sample_prop_gu(glm = glm, hdi_lvl = hdi_lvl, ud = ud) - + # ppc message("Computing posterior predictions ... \n") @@ -56,11 +59,11 @@ DGU <- function(ud, ppc_condition = get_ppc_condition(glm = glm, ud = ud, hdi_lvl = hdi_lvl)) # result pack - return (list(dgu = dgu, - dgu_prob = dgu_prob, - gu = gu, - theta = theta, - ppc = ppc, - ud = ud, - fit = glm)) + return(list(dgu = dgu, + dgu_prob = dgu_prob, + gu = gu, + theta = theta, + ppc = ppc, + ud = ud, + fit = glm)) } diff --git a/R/loo.R b/R/loo.R index d3d1083..3e26ea4 100644 --- a/R/loo.R +++ b/R/loo.R @@ -8,7 +8,8 @@ LOO <- function(ud, mcmc_cores = 1, hdi_lvl = 0.95, adapt_delta = 0.95, - max_treedepth = 12) { + max_treedepth = 12, + paired = FALSE) { # check inputs check_dgu_input(ud = ud, @@ -16,7 +17,8 @@ LOO <- function(ud, mcmc_cores = as.integer(x = mcmc_cores), mcmc_steps = as.integer(x = mcmc_steps), mcmc_warmup = as.integer(x = mcmc_warmup), - hdi_lvl = hdi_lvl) + hdi_lvl = hdi_lvl, + paired = paired) # process data udp <- get_usage(u = ud) @@ -57,7 +59,8 @@ LOO <- function(ud, mcmc_cores = mcmc_cores, hdi_lvl = hdi_lvl, adapt_delta = adapt_delta, - max_treedepth = max_treedepth) + max_treedepth = max_treedepth, + paired = paired) if(is.data.frame(out$gu)==TRUE) { out$gu$loo_id <- rs[r] diff --git a/R/utils_input.R b/R/utils_input.R index 34fd27f..dd5a1e1 100644 --- a/R/utils_input.R +++ b/R/utils_input.R @@ -8,14 +8,16 @@ check_dgu_input <- function(ud, mcmc_cores, mcmc_steps, mcmc_warmup, - hdi_lvl) { + hdi_lvl, + paired) { if(missing(ud) || is.null(ud) || missing(mcmc_chains) || is.null(mcmc_chains) || missing(mcmc_steps) || is.null(mcmc_steps) || missing(mcmc_warmup) || is.null(mcmc_warmup) || missing(mcmc_cores) || is.null(mcmc_cores) || - missing(hdi_lvl) || is.null(hdi_lvl)) { + missing(hdi_lvl) || is.null(hdi_lvl) || + missing(paired) || is.null(paired)) { stop("arguments must be specified") } @@ -25,6 +27,7 @@ check_dgu_input <- function(ud, check_mcmc_chains(mcmc_chains = mcmc_chains) check_mcmc_cores(mcmc_cores = mcmc_cores) check_hdi(hdi_lvl = hdi_lvl) + check_paired(paired = paired) } @@ -168,3 +171,17 @@ check_ud <- function(ud) { } } } + + + +# Description: +# paired input check +check_paired <- function(paired) { + if(length(paired) != 1) { + stop('paired must be a logical (TRUE or FALSE)') + } + + if(is.logical(paired) == FALSE) { + stop('paired must be a logical (TRUE or FALSE)') + } +} \ No newline at end of file diff --git a/R/utils_usage.R b/R/utils_usage.R index e9e002a..dd19102 100644 --- a/R/utils_usage.R +++ b/R/utils_usage.R @@ -93,8 +93,26 @@ get_usage <- function(u) { has_conditions = max(condition_ids)>1)) } -get_model <- function(has_replicates, has_conditions, debug = FALSE) { + + +# Description: +# checks for structural problems in the data, and mismatch with given inputs +check_ud_content <- function(ud, paired) { + has_conditions <- ud$has_conditions + has_replicates <- ud$has_replicates + ud <- M$ud$proc_ud + + table(ud$sample_id) + diag(table(ud$sample_id, ud$individual_id)) + identical(diag(table(ud$sample_id, ud$individual_id))) +} + + + + +get_model <- function(has_replicates, has_conditions, paired, debug = FALSE) { model_type <- ifelse(test = has_conditions, yes = "DGU", no = "GU") + if(model_type == "GU") { if(has_replicates) { if(debug) { @@ -136,6 +154,9 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) { "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", "log_lik", "dgu", "dgu_prob", "theta") model_name <- "DGU_rep" + if(paired) { + model_name <- "DGU_rep_paired" + } } else { if(debug) { @@ -149,6 +170,9 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) { "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", "log_lik", "dgu", "dgu_prob", "theta") model_name <- "DGU" + if(paired) { + model_name <- "DGU_paired" + } } } @@ -157,5 +181,6 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) { model_type = model_type, pars = pars, has_replicates = has_replicates, - has_conditions = has_conditions)) + has_conditions = has_conditions, + paired = paired)) } diff --git a/man/d_zibb_1.Rd b/man/d_zibb_1.Rd index bb4a121..72825d2 100644 --- a/man/d_zibb_1.Rd +++ b/man/d_zibb_1.Rd @@ -8,7 +8,7 @@ A small example dataset that has the following features: \itemize{ \item 1 conditions - \item 5 replicates (samples) per condition + \item 5 samples per condition \item 15 Ig genes } This dataset was simulated from zero-inflated beta-binomial (ZIBB) diff --git a/man/d_zibb_2.Rd b/man/d_zibb_2.Rd index cd515fc..2c732e7 100644 --- a/man/d_zibb_2.Rd +++ b/man/d_zibb_2.Rd @@ -8,7 +8,7 @@ A small example dataset that has the following features: \itemize{ \item 2 conditions - \item 5 replicates (samples) per condition + \item 5 samples per condition \item 15 Ig genes } This dataset was simulated from zero-inflated beta-binomial (ZIBB) diff --git a/man/d_zibb_3.Rd b/man/d_zibb_3.Rd index 6bce8bc..2b32401 100644 --- a/man/d_zibb_3.Rd +++ b/man/d_zibb_3.Rd @@ -8,7 +8,7 @@ A small example dataset that has the following features: \itemize{ \item 3 conditions - \item 5 replicates (samples) per condition + \item 5 samples per condition \item 15 Ig genes } This dataset was simulated from zero-inflated beta-binomial (ZIBB) diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd index 39261b7..5bf03b4 100644 --- a/vignettes/User_Manual.Rmd +++ b/vignettes/User_Manual.Rmd @@ -28,6 +28,7 @@ require(ggforce) require(gridExtra) require(ggrepel) require(reshape2) +require(patchwork) ``` @@ -87,7 +88,7 @@ with $\pi = 1$ for genes with strong differential usage, and $\pi = 0$ for genes with negligible differential gene usage. Both metrics are computed based on the posterior distribution of $\gamma$, and are thus related. -# Case Study A +# Case Study A: analyzing data with two biological conditions `r Biocpkg("IgGeneUsage")` has a couple of built-in Ig gene usage datasets. Some were obtained from studies and others were simulated. @@ -146,12 +147,17 @@ M <- DGU(ud = d_zibb_2, # input data The following objects are provided as part of the output of DGU: * `dgu` (main results of `r Biocpkg("IgGeneUsage")`): quantitative - DGU summary + DGU summary [on a log-scale] + * `dgu_prob` (main results of `r Biocpkg("IgGeneUsage")`): quantitative + DGU summary [on a probability-scale] * `gu`: quantitative summary of the gene usage (GU) of each gene in - the different conditions - * `glm`: rstan ('stanfit') object of the fitted model $\rightarrow$ used for - model checks (see section 'Model checking') + the different conditions [on a probability-scale] + * `theta`: summary of the theta parameter marginals, i.e. the inferred + probabilities of gene usage * `ppc`: posterior predictive checks data (see section 'Model checking') + * `ud`: processed Ig gene usage data + * `fit`: rstan ('stanfit') object of the fitted model $\rightarrow$ used + for model checks (see section 'Model checking') ```{r} summary(M) @@ -449,6 +455,99 @@ plot(hclust(dist(x, method = "euclidean"), method = "average")) ``` +# Case Study B: analyzing data with biological replicates + +```{r} +data("d_zibb_4", package = "IgGeneUsage") +knitr::kable(head(d_zibb_4)) +``` + +We can also visualize `d_zibb_4` with `r CRANpkg("ggplot")`: + +```{r, fig.width=6, fig.height=3.25} +ggplot(data = d_zibb_4)+ + geom_point(aes(x = gene_name, y = gene_usage_count, col = condition), + position = position_dodge(width = .7), shape = 21)+ + theme_bw(base_size = 11)+ + ylab(label = "Gene usage [count]")+ + xlab(label = '')+ + theme(legend.position = "top")+ + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) +``` + +## Modeling + +```{r} +M <- DGU(ud = d_zibb_4, # input data + mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500) + mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500) + mcmc_chains = 3, # how many MCMC chain to run (default: 4) + mcmc_cores = 1, # how many PC cores to use? (e.g. parallel chains) + hdi_lvl = 0.95, # highest density interval level (de fault: 0.95) + adapt_delta = 0.8, # MCMC target acceptance rate (default: 0.95) + max_treedepth = 10) # tree depth evaluated at each step (default: 12) +``` + + +## Posterior predictive checks + +```{r, fig.height = 6, fig.width = 6} +ggplot(data = M$ppc$ppc_rep)+ + facet_wrap(facets = ~individual_id, ncol = 3)+ + geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ + geom_errorbar(aes(x = observed_count, y = ppc_mean_count, + ymin = ppc_L_count, ymax = ppc_H_count), col = "darkgray")+ + geom_point(aes(x = observed_count, y = ppc_mean_count), size = 1)+ + theme_bw(base_size = 11)+ + theme(legend.position = "top")+ + xlab(label = "Observed usage [counts]")+ + ylab(label = "PPC usage [counts]") +``` + +## Analysis of estimated effect sizes +The top panel shows the average gene usage (GU) in different biological +conditions. The bottom panels shows the differential gene usage (DGU) +between pairs of biological conditions. + +```{r, fig.weight = 7, fig.height = 4} +g1 <- ggplot(data = M$gu)+ + geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L, + ymax = prob_H, col = condition), + width = 0.1, position = position_dodge(width = 0.4))+ + geom_point(aes(x = gene_name, y = prob_mean, col = condition), size = 1, + position = position_dodge(width = 0.4))+ + theme_bw(base_size = 11)+ + theme(legend.position = "top")+ + ylab(label = "GU [probability]")+ + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) + + +stats <- M$dgu +stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ] +stats$gene_fac <- factor(x = stats$gene_name, levels = unique(stats$gene_name)) + +g2 <- ggplot(data = stats)+ + facet_wrap(facets = ~contrast)+ + geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+ + geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), + col = "darkgray")+ + geom_point(aes(x = pmax, y = es_mean, col = contrast))+ + geom_text_repel(data = stats[stats$pmax >= 0.9, ], + aes(x = pmax, y = es_mean, label = gene_fac), + min.segment.length = 0, size = 2.75)+ + theme_bw(base_size = 11)+ + theme(legend.position = "top")+ + xlab(label = expression(pi))+ + xlim(c(0, 1))+ + ylab(expression(gamma)) +``` + + +```{r, fig.weight = 5, fig.height = 6} +(g1/g2) +``` + + # Session ```{r}