Skip to content

Commit

Permalink
Merge branch 'main' of github.com:bcbio/bcbioR
Browse files Browse the repository at this point in the history
  • Loading branch information
abartlett004 committed Apr 23, 2024
2 parents e1aa6f9 + 299e999 commit e39ea8f
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 86 deletions.
12 changes: 6 additions & 6 deletions inst/rmarkdown/templates/rnaseq/skeleton/params_de.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ aim = 'short description'
analyst = 'person in the core'

# project params
root = "../"
root = "../.."
date = "YYYYMMDD"
column = "treatment"
subset_column = 'cell'
metadata_fn = "../meta/samplesheet.csv"
counts_fn = '../data/tximport-counts.csv'
basedir <- 'reports'
column = "sample_type"
subset_column = NA
metadata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv"
counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv'
basedir <- 'results'
10 changes: 10 additions & 0 deletions inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# info params
project = "name_hbcXXXXX"
PI = 'person name'
experiment = 'short description'
aim = 'short description'
analyst = 'person in the core'


metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv'
se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds")
114 changes: 71 additions & 43 deletions inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@ output:
editor_options:
chunk_output_type: console
params:
numerator: foo
denominator: bar
subset_value: nah
params_file: params_de.R
numerator: tumor
denominator: normal
subset_value: NA
params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/params_de.R
---

```{r echo = F}
```{r load_params, echo = F}
source(params$params_file)
```

```{r, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,}
```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,}
library(rtracklayer)
library(DESeq2)
library(tidyverse)
Expand All @@ -39,6 +39,7 @@ library(msigdbr)
library(fgsea)
library(org.Hs.eg.db)
library(knitr)
library(EnhancedVolcano)
ggplot2::theme_set(theme_light(base_size = 14))
opts_chunk[["set"]](
Expand All @@ -55,7 +56,7 @@ opts_chunk[["set"]](
fig.height = 4)
```

```{r sanitize-datatable}
```{r sanitize_datatable}
sanitize_datatable = function(df, ...) {
# remove dashes which cause wrapping
DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
Expand All @@ -72,8 +73,14 @@ sanitize_datatable = function(df, ...) {
- Aim: `r aim`
- Comparison: `r paste0(params$subset_value, ': ', params$numerator, ' vs. ', params$denominator)`

```{r create-filenames}
filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
```{r create_filenames}
if (!is.na(subset_value)){
filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
} else {
filenames = str_interp("${params$numerator}_vs_${params$denominator}")
}
contrasts = c(column,params$numerator,params$denominator)
name_expression_fn=file.path(root,
Expand All @@ -88,12 +95,14 @@ name_pathways_fn=file.path(root,
```

```{r read-counts-data}
coldata=read.csv(metadata_fn, row.names = 1)
```{r read_counts_data}
coldata=read.csv(metadata_fn)
stopifnot(column %in% names(coldata))
# use only some samples, by default use all
coldata <- coldata[coldata[[paste(subset_column)]] == params$subset_value, ]
if (!is.na(subset_column)){
coldata <- coldata[coldata[[paste(subset_column)]] == params$subset_value, ]
}
coldata <- coldata[coldata[[paste(column)]] %in% c(params$numerator, params$denominator), ]
rownames(coldata) <- coldata$description
coldata[[column]] = relevel(as.factor(coldata[[column]]), params$denominator)
Expand All @@ -115,7 +124,7 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL

When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.

```{r before-RUV}
```{r before_RUV}
dds_before <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
Expand All @@ -126,7 +135,7 @@ degPCA(assay(vsd_before), colData(vsd_before),
```

```{r do-RUV}
```{r do_RUV}
design <- coldata[[column]]
names(design) <- coldata$name
diffs <- makeGroups(design)
Expand All @@ -144,7 +153,7 @@ formula <- as.formula(paste0("~ ",
)
```

```{r after-RUV}
```{r after_RUV}
## Check if sample name matches
stopifnot(all(names(counts) == rownames(new_cdata)))
Expand All @@ -158,18 +167,28 @@ degPCA(ruvset$normalizedCounts, new_cdata,

# Differential Expression

Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated. We correct for this so that gene LFC is not dependent overall on basal gene expression level.
Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model.

Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit.

We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.

```{r DE}
de <- DESeq(dds_after)
DESeq2::plotDispEsts(de)
```

Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.

```{r lfc_shrink}
# resultsNames(de) # check the order is right
resLFC = results(de, contrast=contrasts)
resLFCS <- lfcShrink(de, coef=resultsNames(de)[ncol(vars)+2], type="apeglm")
res <- as.data.frame(resLFCS) %>%
rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>%
relocate(gene_name) %>% rename(lfc = log2FoldChange) %>%
relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>%
mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi)
res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj)
Expand All @@ -180,14 +199,24 @@ show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking')
```

```{r}
```{r after_lfc_shrink}
degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking')
```

This volcano plot shows the genes that are significantly up- and down-regulated as a result of the difference in treatment.
```{r}
degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show)
This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen.

```{r volcano_plot, fig.height=6}
# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show)
EnhancedVolcano(res_mod,
lab= res_mod$gene_name,
pCutoff = 1.345719e-03,
selectLab = c(res_sig$gene_name[1:15]),
FCcutoff = 0.5,
x = 'lfc',
y = 'padj',
title="Volcano Tumor vs. Normal",
subtitle = "", xlim=c(-5,5))
```

Expand All @@ -205,15 +234,6 @@ universe=res %>%
filter(!is.na(padj)) %>% pull(gene_id)
mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL')
ranks_df <- res %>%
filter(padj < 0.01, !is.na(padj)) %>% # only if enough DE genes
arrange((lfc)) %>%
left_join(mapping, by = c("gene_id"="ENSEMBL")) %>%
rename(entrez_id=ENTREZID) %>%
filter(!is.na(entrez_id) & !is.na(padj))
ranks <- ranks_df$lfc
names(ranks) <- ranks_df$entrez_id
all_in_life=list(
msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"),
msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
Expand Down Expand Up @@ -263,19 +283,27 @@ pathways_ora_all %>% sanitize_datatable()

```{r write-files}
counts_norm=ruvset$normalizedCounts %>% as.data.frame() %>%
rownames_to_column("gene_id")
write_csv(counts_norm %>%
mutate(subset = params$subset_value,
comparison = str_interp("${params$numerator}_vs_${params$denominator}")),
name_expression_fn)
write_csv(res %>%
mutate(subset = params$subset_value,
comparison = str_interp("${params$numerator}_vs_${params$denominator}")),
name_deg_fn)
write_csv(pathways_long %>%
mutate(subset = params$subset_value,
comparison = str_interp("${params$numerator}_vs_${params$denominator}")),
name_pathways_fn)
rownames_to_column("gene_id") %>%
mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
res_for_writing <- res %>%
mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
pathways_for_writing <- pathways_long %>%
mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
if (!is.na(subset_value)){
counts_norm <- counts_norm %>%
mutate(subset = params$subset_value)
res_for_writing <- res_for_writing %>%
mutate(subset = params$subset_value)
pathways_for_writing <- pathways_for_writing %>%
mutate(subset = params$subset_value)
}
write_csv(counts_norm, name_expression_fn)
write_csv(res_for_writing, name_deg_fn)
write_csv(pathways_for_writing, name_pathways_fn)
```
# R session

Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
library(rmarkdown)

render_de <- function(subset_value, numerator, denominator){
rmarkdown::render(input = "DEG.Rmd",
output_dir = "reports",
render_de <- function(numerator, denominator, subset_value = NA,
params_file = '../../params_de.R'){

rmarkdown::render(input = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd",
output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/",
output_format = "html_document",
output_file = paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'),
output_file = ifelse(!is.na(subset_value),
paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'),
paste0('DE_', numerator, '_vs_', denominator, '.html')
),
clean = TRUE,
envir = new.env(),
params = list(
subset_value = subset_value,
numerator = numerator,
denominator = denominator
denominator = denominator,
params_file = params_file
)
)
}

render_de('HDFn', "TNF", "untreated")
render_de("tumor", "normal")
Loading

0 comments on commit e39ea8f

Please sign in to comment.