add combaseq to de report

bcbio · Jun 17, 2024 · d92db78 · d92db78
1 parent 298e52f
commit d92db78
Show file tree

Hide file tree

Showing 3 changed files with 163 additions and 28 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,10 +4,12 @@
 .Ruserdata
 *.Rproj
 test/*
+testthat/*/*/*/*/*/*html
+testthat/*/*/*/*/*/*csv
 inst/*/*/*/*/*/*html
 inst/*/*/*/*/*/*csv
 inst/doc
 docs
 /doc/
 /Meta/
-.DS*
+.DS*
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd
@@ -17,18 +17,21 @@ output:
 editor_options: 
   chunk_output_type: console
 params:
+  # Put hg38, mm10, mm39, or other
+
+  ## Combatseq and ruv can both be false or ONLY ONE can be true
+  ## Both cannot be true
   numerator: tumor
   denominator: normal
   column: sample_type
   subset_column: null
   subset_value: null
-  # Put hg38, mm10, mm39, or other
   genome: hg38
   ruv: false
-  params_file: params_de.R
+  combatseq: false
+  params_file: params_de-example.R
   project_file: ../information.R
   functions_file: load_data.R
-
 ---
 
 ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE}
@@ -38,14 +41,16 @@ source(params$params_file)
 source(params$project_file)
 # 3. Load custom functions to load data from coldata/metrics/counts
 source(params$functions_file)
-# IMPORTANT set these values if you are not using the parameters at the top
+# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31)
 genome=params$genome
 column=params$column
 numerator=params$numerator
 denominator=params$denominator
 subset_column=params$subset_column
 subset_value=params$subset_value
 run_ruv=params$ruv
+run_combatseq=params$combatseq
+factor_of_interest <- column
 ```
 
 
@@ -54,7 +59,6 @@ library(rtracklayer)
 library(DESeq2)
 library(tidyverse)
 library(stringr)
-library(RUVSeq)
 library(DEGreport)
 library(ggpubr)
 library(msigdbr)
@@ -67,6 +71,9 @@ library(ggprism)
 library(viridis)
 library(pheatmap)
 library(janitor)
+library(ggforce)
+library(vegan)
+
 colors=cb_friendly_cols(1:15)
 ggplot2::theme_set(theme_prism(base_size = 14))
 opts_chunk[["set"]](
@@ -86,6 +93,15 @@ opts_chunk[["set"]](
 set.seed(1234567890L)
 ```
 
+```{r sanitize_datatable}
+sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
+                   colnames=gsub("-", "_", colnames(df)))
+}
+```
+
+
 
 ```{r load_data, message=F, warning=F}
 # This code will load from bcbio or nf-core folder
@@ -108,13 +124,7 @@ coldata = coldata[rownames(metrics),]
 stopifnot(all(names(counts) == rownames(metrics)))
 ```
 
-```{r sanitize_datatable}
-sanitize_datatable = function(df, ...) {
- # remove dashes which cause wrapping
- DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
-                   colnames=gsub("-", "_", colnames(df)))
-}
-```
+
 
 # Overview
 
@@ -163,24 +173,67 @@ vsd_before <- vst(dds_to_use)
 norm_matrix = assay(vsd_before)
 ```
 
+
+# PCA and group level variance.
+
+**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).**
+
+Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.
+
+**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.**
+
+**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.**
+
+```{r set group, eval=FALSE, echo=FALSE}
+## Example of creating a group covariate
+
+meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment)
+
+factor_of_interest <- "insert column name for covariate of interest"
+```
+
+
+```{r PCA}
+pca <- degPCA(norm_matrix, metrics,
+              condition = factor_of_interest, name = "sample", data = T)
+
+pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) +
+  theme(plot.title=element_text(hjust=0.5)) +
+  geom_mark_ellipse(aes(color = sample_type)) +  scale_color_cb_friendly()
+```
+
+## PERMDISP
+
+Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test.  PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/)
+
+Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison.
+
+```{r PERMDISP}
+vare.disa <- vegdist(t(assay(vsd_before)))
+
+mod = betadisper(vare.disa, metrics[[factor_of_interest]])
+anova(mod)
+permutest(mod, pairwise = TRUE)
+
+```
+
+
+
 # Covariate analysis
 
+Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.
+
+Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.**
+
+
 ```{r covariates, fig.height = 6, fig.width = 10}
 degCovariates(
   norm_matrix,
   metrics,
 )
 ```
 
-# PCA analysis
 
-```{r before_RUV} 
-
-pca1 <- degPCA(norm_matrix, colData(dds_to_use), 
-       condition = column) + ggtitle('PCA')
-pca1 + scale_color_cb_friendly()
-
-```
 
 ```{r init_DESEQ}
 formula <- as.formula(paste0("~ ", " + ", column))
@@ -194,18 +247,31 @@ norm_matrix = assay(vsd_before)
 new_cdata <- coldata
 ```
 
-```{r, eval=run_ruv, results='asis', echo=FALSE}
+
+```{r, eval=F, echo=FALSE}
+#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION
+
+### RUV - LINES 261-296
+### COMBATSEQ - LINES 303-369
+```
+
+
+
+```{r, eval=run_ruv, results='asis', echo=run_ruv}
 cat("# Remove Unwanted Variability
 
 When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")
-
 ```
 
-```{r do_RUV, eval=run_ruv}
+```{r do_RUV, eval=run_ruv, echo=run_ruv}
+library(RUVSeq)
+
 # If you want to skip the code, just set up formula to be your model in the next chunk of code
 design <- coldata[[column]]
 diffs <- makeGroups(design)
 dat <- norm_matrix
+# by default is running one variable, 
+# change K parameter to other number to find more unknown covariates
 ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F)
 vars <- ruvset$W
 
@@ -231,6 +297,73 @@ vsd_to_use<- vst(dds_to_use, blind=FALSE)
 
 ```
 
+```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq}
+library(sva)
+
+cat("# Remove Batch Effects
+
+Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. 
+
+Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).")
+
+```
+
+
+```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq}
+
+## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ
+
+## Set your batch effect variable here this is the variable that combatseq will try to remove
+
+## Column name of your batch variable
+to_remove = "batch"
+
+## Column name of of your variable(s) of interest
+
+to_keep = "sample_type"
+
+
+coldata[[to_remove]] <- as.factor(coldata[[to_remove]])
+coldata[[to_keep]] <- as.factor(coldata[[to_keep]])
+
+
+batch = coldata[[to_remove]]
+treatment = coldata[[to_keep]]
+
+## If you have multiple variables of interest you will need to cbind them into one variable
+
+#treatment1 = metrics[[to_keep]]
+#treatment2 = metrics[[to_keep]]
+#treatment3 = metrics[[to_keep]]
+
+
+# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3)))
+
+```
+
+
+```{r do_combatseq, eval=run_combatseq}
+adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment)
+
+## For multiple variables of interest
+
+# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp)
+
+```
+
+```{r after_combatseq, eval=run_combatseq}
+# NOTE: Make sure the formula doens't contain the covariates used in combatseq above
+dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula)
+vsd_combat<- vst(dds_to_use, blind=FALSE)
+
+combat_matrix = assay(vsd_combat)
+
+pca_combat <- degPCA(combat_matrix, coldata, 
+       condition = column) + ggtitle('After Combatseq')
+pca_combat + scale_color_cb_friendly()
+
+```
+
 
 # Differential Expression
 
@@ -282,16 +415,15 @@ This volcano plot shows the genes that are significantly up- and down-regulated
 # degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
 EnhancedVolcano(res_mod,
                 lab= res_mod$gene_name, 
-                pCutoff = 1.345719e-03, 
+                pCutoff = 0.05, 
                 selectLab = c(res_sig$gene_name[1:15]),
                 FCcutoff = 0.5,
                 x = 'lfc',
                 y = 'padj', 
                 title="Volcano Tumor vs. Normal",
                 col=as.vector(colors[c("dark_grey", "light_blue",
                                          "purple", "purple")]),
-                subtitle = "", xlim=c(-5,5)) 
-
+                subtitle = "", drawConnectors = T,  max.overlaps = Inf) 
 ```
 
 ## Heatmap
@@ -428,6 +560,7 @@ write_csv(counts_norm, name_expression_fn)
 write_csv(res_for_writing, name_deg_fn)
 write_csv(pathways_for_writing, name_pathways_fn)
 ```
+
 # R session
 
 List and version of tools used for the DE report generation.

diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R
@@ -21,7 +21,7 @@ test_that("rnaseq testing", {
                       subset_value = subset_value,
                       numerator = numerator,
                       denominator = denominator,
-                      params_file = file.path(path,'DE/params_de.R'),
+                      params_file = file.path(path,'DE/params_de-example.R'),
                       project_file = file.path(path,'information.R'),
                       functions_file = file.path(path,'DE/load_data.R')
                     )