From b70f9e8b23afc17a33cb9e737a5e954ee1f3c5e8 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Mon, 29 Apr 2024 10:04:21 -0400 Subject: [PATCH] qc for nf core --- .../templates/rnaseq/skeleton/QC/QC.Rmd | 4 +- .../rnaseq/skeleton/QC/QC_nf-core.Rmd | 439 ++++++++++++++++++ .../rnaseq/skeleton/QC/params_qc_nf-core.R | 5 + .../rnaseq/skeleton/QC/run_markdown.R | 6 +- 4 files changed, 449 insertions(+), 5 deletions(-) create mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd create mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd index 48afe76..14a5006 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd @@ -17,8 +17,8 @@ output: editor_options: chunk_output_type: console params: - params_file: params_qc.R - project_file: ../information.R + params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R + project_file: ./inst/rmarkdown/templates/rnaseq/skeleton/information.R --- diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd new file mode 100644 index 0000000..89b2a84 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd @@ -0,0 +1,439 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R + project_file: ./inst/rmarkdown/templates/rnaseq/skeleton/information.R +--- + + +```{r source_params, echo = F} +source(params$params_file) +source(params$project_file) +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(DESeq2) +library(DEGreport) +library(ggrepel) +library(pheatmap) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + +# Samples and metadata + +```{r load_metadata} + +meta_df=read_csv(metadata_fn) %>% mutate(sample = description) %>% + dplyr::select(-description) + +ggplot(meta_df, aes(sample_type, fill = sample_type)) + + geom_bar() + ylab("") + xlab("") + + scale_fill_cb_friendly() +``` + + +```{r prepare metrics} + +metrics <- read_tsv(paste0(multiqc_data_dir, 'multiqc_general_stats.txt')) +metrics_qualimap <- read_tsv(paste0(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) + +metrics <- metrics %>% full_join(metrics_qualimap) +metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + +total_reads <- metrics %>% + filter(grepl('_', sample)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + +metrics <- metrics %>% + filter(!grepl('_', sample)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(r_rna_rate = custom_content_biotype_counts_percent_r_rna) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + +metrics <- metrics %>% + full_join(meta_df , by = c("sample" = "sample")) +``` + +```{r show-metadata} +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads. + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample_type, + y = total_reads, + color = sample_type)) + + geom_point(alpha=0.5) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + + ggtitle("Total reads") + +``` + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +## Mapping rate + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). + +```{r plot_mapping_rate} +metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) +metrics %>% + ggplot(aes(x = sample_type, + y = mapped_reads_pct, + color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ylim(0, 100) + + ggtitle("Mapping rate") + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) +``` + + +## Number of genes detected + +The number of genes represented in every sample is expected to be consistent and over 20K (blue line). + +```{r plot_genes_detected} +se <- readRDS(se_object) +genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() +sample_names <- metrics[,c("sample"), drop=F] +genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) +genes_detected <- genes_detected %>% group_by(name) +genes_detected <- summarise(genes_detected, + n_genes = max(value)) + +metrics <- metrics %>% + left_join(genes_detected, by = c("sample" = "name")) +ggplot(metrics,aes(x = sample_type, + y = n_genes, color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ggtitle("Number of genes") + + ylab("Number of genes") + + xlab("") + + geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) +``` + + +## Gene detection saturation + +This plot shows how complex the samples are. We expect samples with more reads to detect more genes. + +```{r plot_gene_saturation} +metrics %>% + ggplot(aes(x = total_reads, + y = n_genes, + color = sample_type)) + + geom_point()+ + scale_x_log10() + + scale_color_cb_friendly() + + ggtitle("Gene saturation") + + ylab("Number of genes") +``` + +## Exonic mapping rate + +Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). + +```{r plot_exonic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = exonic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Exonic rate %") + + ggtitle("Exonic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=75, color = cb_friendly_cols('brown')) +``` + +## Intronic mapping rate + +Here, we expect a low intronic mapping rate (≤ 15% - 20%) + +```{r plot_intronic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intronic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intronic rate %") + + ggtitle("Intronic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=15, color = cb_friendly_cols('brown')) +``` + +## Intergenic mapping rate + +Here, we expect a low intergenic mapping rate, which is true for all samples. + +```{r plot_intergenic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intergenic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intergenic rate %") + + ggtitle("Intergenic mapping rate") + + coord_flip() + + scale_color_cb_friendly() + + ylim(c(0, 100)) +``` + +## rRNA mapping rate + +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% + +```{r plot_rrna_mapping_rate} +# for some bad samples it could be > 50% +rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 +metrics %>% + ggplot(aes(x = sample_type, + y = r_rna_rate * 100, + color = sample_type)) + + geom_point() + + ylab("rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("rRNA mapping rate") + + coord_flip() + + scale_color_cb_friendly() +``` + +## 5'->3' bias + +There should be little bias, i.e. the values should be close to 1, or at least consistent among samples + +```{r plot_53_bias} +metrics %>% + ggplot(aes(x = sample_type, + y = x5_3_bias, + color = sample_type)) + + geom_point() + + ggtitle("5'-3' bias") + + coord_flip() + + ylim(c(0.5,1.5)) + + scale_color_cb_friendly()+ + geom_hline(yintercept=1, color = cb_friendly_cols('blue')) +``` + +## Counts per gene - all genes + +We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar + +```{r plot_counts_per_gene} +metrics_small <- metrics %>% dplyr::select(sample, sample_type) +metrics_small <- left_join(sample_names, metrics_small) + +counts <- + assays(se)[["counts"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + gather(name, counts) + +counts <- left_join(counts, metrics, by = c("name" = "sample")) + +ggplot(counts, aes(sample_type, + log2(counts+1), + fill = sample_type)) + + geom_boxplot() + + scale_fill_cb_friendly() + + ggtitle("Counts per gene, all non-zero genes") + + scale_color_cb_friendly() +``` + + +# Sample similarity analysis + +In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). + +## Principal component analysis (PCA) {.tabset} + +Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). + + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} +raw_counts <- assays(se)[["counts"]] %>% round() %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + as.matrix() + +vst <- vst(raw_counts) + +#fix samples names +coldat_for_pca <- as.data.frame(metrics) +rownames(coldat_for_pca) <- coldat_for_pca$sample +coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +pca1 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T)[["plot"]] +pca2 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() +``` + + +```{r, eval=FALSE} +variables=degCovariates(vst, coldat_for_pca) +``` + + +```{r clustering fig, fig.width = 10, fig.asp = .62} +## Hierarchical clustering + +vst_cor <- cor(vst) + +annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) +names(annotation_cols) <- unique(coldat_for_pca$sample_type) + +p <- pheatmap(vst_cor, + annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15), + annotation_colors = list(sample_type = annotation_cols) +) +p + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R new file mode 100644 index 0000000..e9eb536 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R @@ -0,0 +1,5 @@ +# info params + +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/coldata.csv' +se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/salmon.merged.gene_counts.rds') +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/multiqc-report-data/' diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R index 1d61841..a2a2ec5 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R @@ -1,10 +1,10 @@ library(rmarkdown) -rmarkdown::render("QC.Rmd", - output_dir = "./", +rmarkdown::render("./inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd", + output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/QC/", clean = TRUE, output_format = "html_document", params = list( - params_file = 'params_qc.R', + params_file = 'params_qc_nf-core.R', project_file = '../information.R') )