Merge branch 'main' of github.com:bcbio/bcbioR

bcbio · Apr 23, 2024 · e39ea8f · e39ea8f
2 parents e1aa6f9 + 299e999
commit e39ea8f
Show file tree

Hide file tree

Showing 7 changed files with 141 additions and 86 deletions.
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/params_de.R b/inst/rmarkdown/templates/rnaseq/skeleton/params_de.R
@@ -6,10 +6,10 @@ aim = 'short description'
 analyst = 'person in the core'
 
 # project params
-root = "../"
+root = "../.."
 date = "YYYYMMDD"
-column = "treatment"
-subset_column = 'cell'
-metadata_fn = "../meta/samplesheet.csv"
-counts_fn = '../data/tximport-counts.csv'
-basedir <- 'reports'
+column = "sample_type"
+subset_column = NA
+metadata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv"
+counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv'
+basedir <- 'results'
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R b/inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R
@@ -0,0 +1,10 @@
+# info params
+project = "name_hbcXXXXX"
+PI = 'person name'
+experiment = 'short description'
+aim = 'short description'
+analyst = 'person in the core'
+
+
+metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv'
+se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds")
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd
@@ -17,17 +17,17 @@ output:
 editor_options: 
   chunk_output_type: console
 params:
-  numerator: foo
-  denominator: bar
-  subset_value: nah
-  params_file: params_de.R
+  numerator: tumor
+  denominator: normal
+  subset_value: NA
+  params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/params_de.R
 ---
 
-```{r echo = F}
+```{r load_params, echo = F}
 source(params$params_file)
 ```
 
-```{r, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,}
+```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,}
 library(rtracklayer)
 library(DESeq2)
 library(tidyverse)
@@ -39,6 +39,7 @@ library(msigdbr)
 library(fgsea)
 library(org.Hs.eg.db)
 library(knitr)
+library(EnhancedVolcano)
 
 ggplot2::theme_set(theme_light(base_size = 14))
 opts_chunk[["set"]](
@@ -55,7 +56,7 @@ opts_chunk[["set"]](
     fig.height = 4)
 ```
 
-```{r sanitize-datatable}
+```{r sanitize_datatable}
 sanitize_datatable = function(df, ...) {
  # remove dashes which cause wrapping
  DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
@@ -72,8 +73,14 @@ sanitize_datatable = function(df, ...) {
 -   Aim: `r aim`
 -   Comparison: `r paste0(params$subset_value, ': ', params$numerator, ' vs. ', params$denominator)`
 
-```{r create-filenames}
-filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
+```{r create_filenames}
+
+if (!is.na(subset_value)){
+  filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
+} else {
+    filenames = str_interp("${params$numerator}_vs_${params$denominator}")
+}
+
 contrasts = c(column,params$numerator,params$denominator)
 
 name_expression_fn=file.path(root,
@@ -88,12 +95,14 @@ name_pathways_fn=file.path(root,
 
 ```
 
-```{r read-counts-data}
-coldata=read.csv(metadata_fn, row.names = 1)
+```{r read_counts_data}
+coldata=read.csv(metadata_fn)
 stopifnot(column %in% names(coldata))
 
 # use only some samples, by default use all
-coldata <- coldata[coldata[[paste(subset_column)]] == params$subset_value, ]
+if (!is.na(subset_column)){
+  coldata <- coldata[coldata[[paste(subset_column)]] == params$subset_value, ]
+}
 coldata <- coldata[coldata[[paste(column)]] %in% c(params$numerator, params$denominator), ]
 rownames(coldata) <- coldata$description
 coldata[[column]] = relevel(as.factor(coldata[[column]]), params$denominator)
@@ -115,7 +124,7 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL
 
 When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data. 
 
-```{r before-RUV}
+```{r before_RUV}
 
 dds_before <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
 
@@ -126,7 +135,7 @@ degPCA(assay(vsd_before), colData(vsd_before),
 
 ```
 
-```{r do-RUV}
+```{r do_RUV}
 design <-  coldata[[column]]
 names(design) <- coldata$name
 diffs <- makeGroups(design)
@@ -144,7 +153,7 @@ formula <- as.formula(paste0("~ ",
 )
 ```
 
-```{r after-RUV}
+```{r after_RUV}
 ## Check if sample name matches
 stopifnot(all(names(counts) == rownames(new_cdata)))
 
@@ -158,18 +167,28 @@ degPCA(ruvset$normalizedCounts, new_cdata,
 
 # Differential Expression
 
-Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated. We correct for this so that gene LFC is not dependent overall on basal gene expression level.
+Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. 
+
+Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. 
+
+We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. 
 
 ```{r DE}
 de <- DESeq(dds_after)
 
+DESeq2::plotDispEsts(de)
+```
+
+Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.
+
+```{r lfc_shrink}
 # resultsNames(de) # check the order is right
 resLFC = results(de, contrast=contrasts)
 resLFCS <- lfcShrink(de, coef=resultsNames(de)[ncol(vars)+2], type="apeglm")
 
 res <- as.data.frame(resLFCS) %>%
   rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% 
-  relocate(gene_name) %>% rename(lfc = log2FoldChange) %>%
+  relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>%
   mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi)
 
 res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj)
@@ -180,14 +199,24 @@ show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
 degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking')
 ```
 
-```{r}
+```{r after_lfc_shrink}
 degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking')
 
 ```
 
-This volcano plot shows the genes that are significantly up- and down-regulated as a result of the difference in treatment. 
-```{r}
-degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
+This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen.
+
+```{r volcano_plot, fig.height=6}
+# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
+EnhancedVolcano(res_mod,
+                lab= res_mod$gene_name, 
+                pCutoff = 1.345719e-03, 
+                selectLab = c(res_sig$gene_name[1:15]),
+                FCcutoff = 0.5,
+                x = 'lfc',
+                y = 'padj', 
+                title="Volcano Tumor vs. Normal", 
+                subtitle = "", xlim=c(-5,5))
 
 ```
 
@@ -205,15 +234,6 @@ universe=res %>%
   filter(!is.na(padj)) %>% pull(gene_id)
 mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL')
 
-ranks_df <- res %>% 
-  filter(padj < 0.01, !is.na(padj)) %>% # only if enough DE genes  
-  arrange((lfc)) %>% 
-  left_join(mapping, by = c("gene_id"="ENSEMBL")) %>% 
-  rename(entrez_id=ENTREZID) %>% 
-  filter(!is.na(entrez_id) & !is.na(padj))
-ranks <- ranks_df$lfc
-names(ranks) <- ranks_df$entrez_id
-
 all_in_life=list(
   msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"),
   msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
@@ -263,19 +283,27 @@ pathways_ora_all %>% sanitize_datatable()
 
 ```{r write-files}
 counts_norm=ruvset$normalizedCounts %>% as.data.frame() %>% 
-  rownames_to_column("gene_id")
-write_csv(counts_norm %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_expression_fn)
-write_csv(res %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_deg_fn)
-write_csv(pathways_long %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_pathways_fn)
+  rownames_to_column("gene_id") %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+
+res_for_writing <- res %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+
+pathways_for_writing <- pathways_long %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+         
+if (!is.na(subset_value)){
+  counts_norm <- counts_norm %>% 
+    mutate(subset = params$subset_value)  
+  res_for_writing <- res_for_writing %>% 
+    mutate(subset = params$subset_value)
+  pathways_for_writing <- pathways_for_writing %>% 
+    mutate(subset = params$subset_value)
+}
+
+write_csv(counts_norm, name_expression_fn)
+write_csv(res_for_writing, name_deg_fn)
+write_csv(pathways_for_writing, name_pathways_fn)
 ```
 # R session
 

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/run_markdown.R
@@ -1,18 +1,24 @@
 library(rmarkdown)
 
-render_de <- function(subset_value, numerator, denominator){
-  rmarkdown::render(input = "DEG.Rmd",
-                    output_dir = "reports",
+render_de <- function(numerator, denominator, subset_value = NA, 
+                      params_file = '../../params_de.R'){
+
+  rmarkdown::render(input = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd",
+                    output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/",
                     output_format = "html_document",
-                    output_file = paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'),
+                    output_file = ifelse(!is.na(subset_value), 
+                                         paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'),
+                                         paste0('DE_', numerator, '_vs_', denominator, '.html')
+                                         ),
                     clean = TRUE,
                     envir = new.env(),
                     params = list(
                       subset_value = subset_value,
                       numerator = numerator,
-                      denominator = denominator
+                      denominator = denominator,
+                      params_file = params_file
                     )
   )
 }
 
-render_de('HDFn', "TNF", "untreated")
+render_de("tumor", "normal")