RUV and gene plot changes

#9 #1
bcbio · Apr 29, 2024 · a9cb331 · a9cb331
1 parent b70f9e8
commit a9cb331
Showing 1 changed file with 54 additions and 28 deletions.
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd
@@ -20,6 +20,7 @@ params:
   numerator: tumor
   denominator: normal
   subset_value: NA
+  ruv: false
   params_file: params_de.R
   project_file: ../information.R
 ---
@@ -43,8 +44,9 @@ library(org.Hs.eg.db)
 library(knitr)
 library(EnhancedVolcano)
 library(bcbioR)
+library(ggprism)
 
-ggplot2::theme_set(theme_light(base_size = 14))
+ggplot2::theme_set(theme_prism(base_size = 14))
 opts_chunk[["set"]](
     cache = F,
     cache.lazy = FALSE,
@@ -57,6 +59,9 @@ opts_chunk[["set"]](
     warning = FALSE,
     echo = T, 
     fig.height = 4)
+
+# set seed for reproducibility
+set.seed(1234567890L)
 ```
 
 ```{r sanitize_datatable}
@@ -85,6 +90,7 @@ if (!is.na(params$subset_value)){
 }
 
 contrasts = c(column,params$numerator,params$denominator)
+coef=paste0(column,"_",params$numerator,"_vs_",params$denominator)
 
 name_expression_fn=file.path(
                              basedir,
@@ -123,43 +129,56 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL
 
 ```
 
-# Remove Unwanted Variability
-
-When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data. 
-
-
 ```{r setup_RUV}
 
-dds_before <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
-
-vsd_before <- vst(dds_before)
+dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
 
+vsd_before <- vst(dds_to_use)
+norm_matrix = assay(vsd_before)
 ```
 
+# Covariate analysis
+
 ```{r covariates, fig.height = 6, fig.width = 10}
 
 se <- readRDS(se_object) #local
 metrics <- metadata(se)$metrics %>% mutate(sample = toupper(sample)) %>% 
   left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample')
 
 degCovariates(
-  assay(vsd_before),
+  norm_matrix,
   metrics
 )
 ```
 
+# PCA analysis
+
 ```{r before_RUV} 
 
-pca1 <- degPCA(assay(vsd_before), colData(vsd_before), 
+pca1 <- degPCA(norm_matrix, colData(dds_to_use), 
        condition = column) + ggtitle('Before RUV')
 pca1 + scale_color_cb_friendly()
 
 ```
 
-```{r do_RUV}
+```{r init_DESEQ}
+formula <- as.formula(paste0("~ ", " + ", column))
+
+dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula)
+
+vsd_before <- vst(dds_to_use)
+norm_matrix = assay(vsd_before)
+```
+
+```{eval=params$ruv, results='asis'}
+# Remove Unwanted Variability
+
+When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data. 
+
+```
+
+```{r do_RUV, eval=params$ruv}
 # If you want to skip the code, just set up formula to be your model in the next chunk of code
-design <-  coldata[[column]]
-names(design) <- coldata$name
 diffs <- makeGroups(design)
 dat <- assay(vsd_before)
 ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F)
@@ -173,17 +192,19 @@ formula <- as.formula(paste0("~ ",
                             collapse = " + "
                           ), " + ", column)
 )
+norm_matrix=ruvset$normalizedCounts
+pca2 <- degPCA(norm_matrix, new_cdata, 
+       condition = column) + ggtitle('After RUV')
+pca2 + scale_color_cb_friendly()
+
 ```
 
-```{r after_RUV}
+```{r after_RUV, eval=params$ruv}
 ## Check if sample name matches
 stopifnot(all(names(counts) == rownames(new_cdata)))
 
-dds_after <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
-vsd_after<- vst(dds_after, blind=FALSE)
-pca2 <- degPCA(ruvset$normalizedCounts, new_cdata, 
-       condition = column) + ggtitle('After RUV')
-pca2 + scale_color_cb_friendly()
+dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
+vsd_to_use<- vst(dds_to_use, blind=FALSE)
 
 ```
 
@@ -197,7 +218,7 @@ Before fitting the model, we often look at a metric called dispersion, which is
 We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. 
 
 ```{r DE}
-de <- DESeq(dds_after)
+de <- DESeq(dds_to_use)
 
 DESeq2::plotDispEsts(de)
 ```
@@ -207,7 +228,7 @@ Because it is difficult to accurately detect and quantify the expression of lowl
 ```{r lfc_shrink}
 # resultsNames(de) # check the order is right
 resLFC = results(de, contrast=contrasts)
-resLFCS <- lfcShrink(de, coef=resultsNames(de)[ncol(vars)+2], type="apeglm")
+resLFCS <- lfcShrink(de, coef=coef, type="apeglm")
 
 res <- as.data.frame(resLFCS) %>%
   rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% 
@@ -249,19 +270,24 @@ EnhancedVolcano(res_mod,
 res_sig %>% sanitize_datatable
 ```
 
+## Plot top 16 genes
+
 ```{r top n DEGs, fig.height = 6, fig.width = 8}
 n = 16
 top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
   dplyr::select(gene_name, gene_id)
-top_n_exp <- assays(vsd_after) %>% as.data.frame() %>% 
+top_n_exp <- norm_matrix %>% as.data.frame() %>% 
   rownames_to_column('gene_id') %>%
-  dplyr::select(-group, -group_name) %>% 
+  # dplyr::select(-group, -group_name) %>% 
   pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>%
-  right_join(top_n) %>%
+  right_join(top_n, relationship = "many-to-many") %>%
   left_join(coldata, by = c('sample' = 'description'))
 
-ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + geom_boxplot() + 
-  facet_wrap(~gene_name) + ggtitle(str_interp('Expression of Top ${n} DEGs'))
+ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) +
+  geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + 
+  geom_point() +
+  facet_wrap(~gene_name) + 
+  ggtitle(str_interp('Expression of Top ${n} DEGs'))
 
 ```
 
@@ -322,7 +348,7 @@ pathways_ora_all %>% sanitize_datatable()
 
 
 ```{r write-files}
-counts_norm=ruvset$normalizedCounts %>% as.data.frame() %>% 
+counts_norm=norm_matrix %>% as.data.frame() %>% 
   rownames_to_column("gene_id") %>% 
   mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))