diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd index 436d3ba..83ef510 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd @@ -20,6 +20,7 @@ params: numerator: tumor denominator: normal subset_value: NA + ruv: false params_file: params_de.R project_file: ../information.R --- @@ -43,8 +44,9 @@ library(org.Hs.eg.db) library(knitr) library(EnhancedVolcano) library(bcbioR) +library(ggprism) -ggplot2::theme_set(theme_light(base_size = 14)) +ggplot2::theme_set(theme_prism(base_size = 14)) opts_chunk[["set"]]( cache = F, cache.lazy = FALSE, @@ -57,6 +59,9 @@ opts_chunk[["set"]]( warning = FALSE, echo = T, fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) ``` ```{r sanitize_datatable} @@ -85,6 +90,7 @@ if (!is.na(params$subset_value)){ } contrasts = c(column,params$numerator,params$denominator) +coef=paste0(column,"_",params$numerator,"_vs_",params$denominator) name_expression_fn=file.path( basedir, @@ -123,19 +129,16 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL ``` -# Remove Unwanted Variability - -When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data. - - ```{r setup_RUV} -dds_before <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) - -vsd_before <- vst(dds_before) +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) ``` +# Covariate analysis + ```{r covariates, fig.height = 6, fig.width = 10} se <- readRDS(se_object) #local @@ -143,23 +146,39 @@ metrics <- metadata(se)$metrics %>% mutate(sample = toupper(sample)) %>% left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') degCovariates( - assay(vsd_before), + norm_matrix, metrics ) ``` +# PCA analysis + ```{r before_RUV} -pca1 <- degPCA(assay(vsd_before), colData(vsd_before), +pca1 <- degPCA(norm_matrix, colData(dds_to_use), condition = column) + ggtitle('Before RUV') pca1 + scale_color_cb_friendly() ``` -```{r do_RUV} +```{r init_DESEQ} +formula <- as.formula(paste0("~ ", " + ", column)) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +``` + +```{eval=params$ruv, results='asis'} +# Remove Unwanted Variability + +When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data. + +``` + +```{r do_RUV, eval=params$ruv} # If you want to skip the code, just set up formula to be your model in the next chunk of code -design <- coldata[[column]] -names(design) <- coldata$name diffs <- makeGroups(design) dat <- assay(vsd_before) ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) @@ -173,17 +192,19 @@ formula <- as.formula(paste0("~ ", collapse = " + " ), " + ", column) ) +norm_matrix=ruvset$normalizedCounts +pca2 <- degPCA(norm_matrix, new_cdata, + condition = column) + ggtitle('After RUV') +pca2 + scale_color_cb_friendly() + ``` -```{r after_RUV} +```{r after_RUV, eval=params$ruv} ## Check if sample name matches stopifnot(all(names(counts) == rownames(new_cdata))) -dds_after <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) -vsd_after<- vst(dds_after, blind=FALSE) -pca2 <- degPCA(ruvset$normalizedCounts, new_cdata, - condition = column) + ggtitle('After RUV') -pca2 + scale_color_cb_friendly() +dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) +vsd_to_use<- vst(dds_to_use, blind=FALSE) ``` @@ -197,7 +218,7 @@ Before fitting the model, we often look at a metric called dispersion, which is We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. ```{r DE} -de <- DESeq(dds_after) +de <- DESeq(dds_to_use) DESeq2::plotDispEsts(de) ``` @@ -207,7 +228,7 @@ Because it is difficult to accurately detect and quantify the expression of lowl ```{r lfc_shrink} # resultsNames(de) # check the order is right resLFC = results(de, contrast=contrasts) -resLFCS <- lfcShrink(de, coef=resultsNames(de)[ncol(vars)+2], type="apeglm") +resLFCS <- lfcShrink(de, coef=coef, type="apeglm") res <- as.data.frame(resLFCS) %>% rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% @@ -249,19 +270,24 @@ EnhancedVolcano(res_mod, res_sig %>% sanitize_datatable ``` +## Plot top 16 genes + ```{r top n DEGs, fig.height = 6, fig.width = 8} n = 16 top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% dplyr::select(gene_name, gene_id) -top_n_exp <- assays(vsd_after) %>% as.data.frame() %>% +top_n_exp <- norm_matrix %>% as.data.frame() %>% rownames_to_column('gene_id') %>% - dplyr::select(-group, -group_name) %>% + # dplyr::select(-group, -group_name) %>% pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% - right_join(top_n) %>% + right_join(top_n, relationship = "many-to-many") %>% left_join(coldata, by = c('sample' = 'description')) -ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + geom_boxplot() + - facet_wrap(~gene_name) + ggtitle(str_interp('Expression of Top ${n} DEGs')) +ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) ``` @@ -322,7 +348,7 @@ pathways_ora_all %>% sanitize_datatable() ```{r write-files} -counts_norm=ruvset$normalizedCounts %>% as.data.frame() %>% +counts_norm=norm_matrix %>% as.data.frame() %>% rownames_to_column("gene_id") %>% mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))