diff --git a/DESCRIPTION b/DESCRIPTION index bee02ebc..6bf43e4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: MotrpacBicQC Type: Package Title: QC/QA functions for the MoTrPAC community -Version: 0.6.3 -Date: 2022-02-25 +Version: 0.6.4 +Date: 2022-02-27 Author: MoTrPAC Bioinformatics Center Maintainer: David Jimenez-Morales Description: R Package for the analysis of MoTrPAC datasets. diff --git a/NEWS.md b/NEWS.md index 4eb60e3e..30dc5c6c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# MotrpacBicQC 0.6.4 (2022-02-27) + +* Metabolomics: adjust metabolomic plots to deal with a large number of samples +* Metabolomics new plot: sum of intensity/concentration +* Metabolomics: detects negative values +* Metabolomics: update vignette + # MotrpacBicQC 0.6.3 (2022-02-25) * Proteomics: support for tmt16 diff --git a/R/metabolomics_plots.R b/R/metabolomics_plots.R index 278aba7c..6350c54d 100644 --- a/R/metabolomics_plots.R +++ b/R/metabolomics_plots.R @@ -5,78 +5,132 @@ #' @description Plot intensity distributions, number of unique ids, NA values #' per sample #' @param results (df) metabolomics results (both named and unnamed already merged, if untargeted) +#' @param results_long (df) metabolomics results (both named and unnamed already merged, if untargeted, in long format) #' @param m_s_n (df) metadata samples named #' @param out_qc_folder (char) output qc folder (it creates the folder if it doesn't exist) #' @param output_prefix (char) prefix for the file name output (pdf file) +#' @param untargeted (logical) `TRUE` if the dataset is untargeted (named + unnamed metabolites) #' @param printPDF (logical) `TRUE` (default) prints pdf file #' @param verbose (logical) `TRUE` (default) shows messages #' @export plot_basic_metabolomics_qc <- function(results, + results_long, m_s_n, out_qc_folder = NULL, output_prefix, printPDF = TRUE, + untargeted = TRUE, verbose = TRUE){ - metabolite_name = id_type = sample_id = sample_order = intensity = sample_type = NULL + metabolite_name = id_type = sample_id = sample_order = intensity = sample_type = sum_quant = NULL if(verbose) message(" + (+) QC PLOTS ------------------") - results_long <- results %>% tidyr::pivot_longer(cols = -c(metabolite_name, id_type), - names_to = "sample_id", - values_to = "intensity") - if(verbose) message(" - (p) Plot intensity distributions") - results_long <- merge(m_s_n, results_long, by = c("sample_id")) + # Get the total number of samples to customize the plots. If larger than 200, + # prepare for large plots + + sn <- length(unique(results_long$sample_id)) + # Set a limit for which remove labels + sn_limit <- 200 + + # Plot: sum of intensities------ + sum_int <- results_long %>% + group_by(sample_id, sample_type, sample_order) %>% + summarise(sum_quant = sum(intensity)) - results_long$sample_id <- as.character(results_long$sample_id) - results_long$sample_id <- as.factor(results_long$sample_id) - results_long$sample_type <- as.factor(results_long$sample_type) + psumint <- ggplot(sum_int, aes(x = reorder(sample_id, sample_order), y = sum_quant, fill = sample_type)) + + geom_bar(stat = "identity") + theme_classic() + + theme(axis.text.y = element_text(angle = 90), legend.position="top") + if(untargeted){ + psumint <- psumint + labs( + title = "Sum of Intensities by sample", + subtitle = paste(output_prefix), + x = "Samples", + y = "Sum of intensity" + ) + }else{ + psumint <- psumint + labs( + title = "Sum of concentrations by sample", + subtitle = paste(output_prefix), + x = "Samples", + y = "Sum of Concentrations" + ) + } + if(sn > sn_limit){ + psumint <- psumint + theme( + axis.text.x = element_blank()) + }else{ + psumint <- psumint + theme( + axis.text.x = element_text(angle = 90, + hjust = 1, + vjust = 0.5, + size = 6)) + } + # Boxplot distribution, as part of a grid---- piseso <- ggplot2::ggplot(results_long, aes( - # x = reorder(sample_id, log2(intensity), FUN = median, na.rm = TRUE), x = reorder(sample_id, sample_order), y = log2(intensity), fill = sample_type)) + geom_boxplot(na.rm = TRUE) + - theme_linedraw() + + theme_classic() + theme( - # axis.text.x = element_text(angle = 90, - # hjust = 1, - # vjust = 0.5, - # size = 8), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.text.y = element_text(angle = 90), - legend.position="top") + #legend.position = "none" - labs(title = "Intensity distribution / Unique IDs", - subtitle = paste(output_prefix), - # x = "sample_id", - # caption = "Sort by injection order", - x = "", - y = "log2(intensity)" - ) + legend.position="top") + + if(untargeted){ + labs( + title = "Intensity distribution / Unique IDs", + subtitle = paste(output_prefix), + y = "log2(intensity)" + ) + }else{ + labs( + title = "Concentration distribution / Unique IDs", + subtitle = paste(output_prefix), + y = "log2(Concentration)" + ) + } - - piseio <- ggplot2::ggplot(results_long, - aes(x = reorder(sample_id, sample_order), - # x = reorder(sample_id, log2(intensity), FUN = median, na.rm = TRUE), - y = log2(intensity), - fill = sample_type)) + + #Boxplots: standalone---- + piseio <- + ggplot2::ggplot(results_long, + aes(x = reorder(sample_id, sample_order), + y = log2(intensity), + fill = sample_type)) + geom_boxplot(na.rm = TRUE) + - theme_linedraw() + - theme(axis.text.x = element_text(angle = 90, - hjust = 1, - vjust = 0.5, - size = 8)) + #legend.position = "none" - labs(x = "sample_id", y = "log2(intensity)") + - labs(title = "Intensity distribution", - subtitle = paste(output_prefix), - caption = "Sort by injection order") + theme_classic() + + labs( + x = "sample_id", + subtitle = paste(output_prefix), + caption = "Sort by injection order" + ) + + theme(legend.position="top") + + if(sn > sn_limit){ + piseio <- piseio + theme( + axis.text.x = element_blank()) + }else{ + piseio <- piseio + theme( + axis.text.x = element_text(angle = 90, + hjust = 1, + vjust = 0.5, + size = 6)) + } + if(untargeted){ + piseio <- piseio + labs(title = "Sample Intensity distribution", + y = "log2(intensity)") + }else{ + piseio <- piseio + labs(title = "Sample Concentration distribution", + y = "log2(Concentration)") + } + - # Get data ready to count IDs------ + # Plot number fo IDs by identification------ if(verbose) message(" - (p) Plot ID counts") uid <- results_long %>% group_by(across(all_of(c("metabolite_name", "sample_id", "sample_type", "sample_order", "id_type")))) %>% @@ -84,6 +138,7 @@ plot_basic_metabolomics_qc <- function(results, uid2 <- uid[which(!is.na(uid$total_intensity)),] uid3 <- unique(uid2[c("metabolite_name", "sample_id", "sample_type", "sample_order", "id_type")]) %>% count(sample_id, sample_type, sample_order, id_type) + puid1 <- ggplot(uid3, aes(x = reorder(sample_id, sample_order), @@ -92,30 +147,29 @@ plot_basic_metabolomics_qc <- function(results, geom_bar(stat = "identity", position="stack", na.rm = TRUE) + - theme_linedraw() + - theme( - axis.text.x = element_text( - angle = 90, - hjust = 1, - vjust = 0.5, - size = 8 - ), - # legend.position = "none", - axis.text.y = element_text(angle = 90) - ) + - geom_text( - aes(label = n), - # vjust = -0.5, - hjust = 1, - size = 2.7, - angle = 90 - ) + + theme_classic() + labs(title = "Unique IDs in samples", subtitle = paste(output_prefix), caption = "Scales are different", - x = "Sample ID") + - facet_grid(rows = vars(id_type), scales = "free") + x = "Sample ID", y = "Number of identifications") + + facet_grid(rows = vars(id_type)) + + theme(legend.position = "top") + if(sn > sn_limit){ + puid1 <- puid1 + theme(axis.text.x = element_blank()) + }else{ + puid1 <- puid1 + theme( + axis.text.x = element_text(angle = 90, + hjust=0.95,vjust=0.2, + size = 6)) + + geom_text( + aes(label = n), + hjust = 1, + size = 2, + angle = 90 + ) + } + # Plot number of ids per sample----- uid4 <- unique(uid2[c("metabolite_name", "sample_id", "sample_type", "sample_order")]) %>% count(sample_id, sample_type, sample_order) @@ -123,32 +177,49 @@ plot_basic_metabolomics_qc <- function(results, aes(x = reorder(sample_id, sample_order), y = n, fill = sample_type)) + - geom_bar(stat = "identity", - na.rm = TRUE) + - theme_linedraw() + - theme( - axis.text.x = element_text( - angle = 90, + geom_bar(stat = "identity", + na.rm = TRUE) + + theme_classic() + + labs(caption = "Sort by injection order", + x = "Sample ID", + y = "") + + theme(legend.position = "none") + if(sn > sn_limit){ + puid2 <- puid2 + theme(axis.text.x = element_blank()) + }else{ + puid2 <- puid2 + theme( + axis.text.x = element_text(angle = 90, + hjust=0.95,vjust=0.2, + size = 6)) + + geom_text( + aes(label = n), hjust = 1, - vjust = 0.5, - size = 8 - ), - axis.text.y = element_text(angle = 90), - legend.position = "none" - ) + + size = 2, + angle = 90 + ) + ylab(NULL) + } + + # Plot proportion of ids------ + if(verbose) message(" - (p) Plot Sample counts and ID numbers/proportions") + + # Plot Total number of samples per sample_type + tns <- unique(uid2[c("sample_id", "sample_type")]) %>% group_by(sample_type) %>% count(sample_type) + ptns <- ggplot(tns, aes(x = sample_type, y = n, fill = sample_type)) + + geom_bar(stat = "identity") + + labs(title = "Number of samples by sample type", y = "", x = "") + + theme_minimal() + + theme(legend.position = "none", + axis.text.x = element_text(size = 14), + ) + geom_text( aes(label = n), - # vjust = -0.5, - hjust = 1, - size = 2.7, - angle = 90 + hjust = 0.5, + vjust = -0.2, + size = 4, + angle = 0 ) + - # ggtitle("Unique IDs in samples") + - labs(caption = "Sort by injection order") + - xlab("Sample IDs") + scale_fill_brewer(palette = "Dark2") - # Plot proportion of ids------ - if(verbose) message(" - (p) Plot ID proportions") ppids <- ggplot(results, aes(x = as.factor(id_type), fill = as.factor(id_type))) + geom_bar(aes(y = (..count..)/sum(..count..))) + @@ -156,7 +227,7 @@ plot_basic_metabolomics_qc <- function(results, label = scales::percent((..count..)/sum(..count..))), stat = "count", vjust = -0.25) + scale_y_continuous(labels = percent) + - labs(title = "Proportion of Features Identified", y = "", x = "") + + labs(title = "Proportion of Features Identified (named/unnamed)", y = "", x = "") + theme_light() + theme(legend.position = "none", axis.text.x = element_text(size = 18)) + @@ -165,28 +236,38 @@ plot_basic_metabolomics_qc <- function(results, pnids <- ggplot(results, aes(x = id_type, fill = id_type)) + geom_bar() + geom_text(stat = 'count',aes(label =..count.., vjust = -0.2)) + - labs(title = "Number of Features Identified", y = "", x = "") + + labs(title = "Total Number of Features Identified (named/unnamed)", y = "", x = "") + theme_light() + theme(legend.position = "none", axis.text.x = element_text(size = 18)) + scale_fill_brewer(palette = "Dark2") + # Plot NA values------ if(verbose) message(" - (p) Plot NA values") - p_na_peprii <- results %>% - inspectdf::inspect_na() %>% - dplyr::arrange(match(col_name, colnames(results))) %>% - inspectdf::show_plot() + - ylim(0, 100) + theme_linedraw() + - theme(axis.text.x = element_text(angle = 90, - hjust = 1, - vjust = 0.5, - size = 8)) + - labs(title = "Prevalence of NAs", - subtitle = paste(output_prefix)) - - + suppressWarnings( + p_na_peprii <- results %>% + inspectdf::inspect_na() %>% + dplyr::arrange(match(col_name, colnames(results))) %>% + inspectdf::show_plot(text_labels = FALSE) + ylim(0, 100) + + theme_classic() + + labs(title = "Prevalence of NAs", + subtitle = paste(output_prefix), + y = "% of NAs in each sample")) + + if(sn > sn_limit){ + p_na_peprii <- p_na_peprii + + theme(axis.text.x = element_blank()) + }else{ + p_na_peprii <- p_na_peprii + + theme(axis.text.x = element_text(angle = 90, + hjust = 1, + vjust = 0.5, + size = 8)) + } + + # Print out plots----- if(!is.null(out_qc_folder)){ if(!dir.exists(file.path(out_qc_folder))){ dir.create(file.path(out_qc_folder), recursive = TRUE) @@ -195,14 +276,27 @@ plot_basic_metabolomics_qc <- function(results, out_qc_folder <- getwd() } - out_plot_dist <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-basic-plots.pdf")) + out_plot_large <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-basic-large-plots.pdf")) + out_plot_summary <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-basic-summary-plots.pdf")) - if(printPDF) pdf(out_plot_dist, width = 12, height = 8) + if(printPDF){ + if(sn > 800){ + pdf(out_plot_large, width = 40, height = 8) + }else if(sn <= 800 & sn > 200){ + pdf(out_plot_large, width = 22, height = 8) + }else{ + pdf(out_plot_large, width = 14, height = 8) + } + } + print(psumint) print(puid1) print(piseio) gridExtra::grid.arrange(piseso, puid2, ncol = 1, heights = c(2, 1)) + if(printPDF) garbage <- dev.off() + + if(printPDF) pdf(out_plot_summary, width = 12, height = 6) + print(ptns) gridExtra::grid.arrange(ppids, pnids, ncol = 2) - print(p_na_peprii) if(printPDF) garbage <- dev.off() } diff --git a/R/metabolomics_qc.R b/R/metabolomics_qc.R index 34f20764..3721513f 100644 --- a/R/metabolomics_qc.R +++ b/R/metabolomics_qc.R @@ -72,7 +72,7 @@ check_metadata_metabolites <- function(df, if(verbose) message(" + (+) {refmet_name} unique values: OK") } - if(verbose) message(" + Validating {refmet_name}\n") + if(verbose) message(" + Validating {refmet_name}") nrnna <- validate_refmetname(dataf = df, verbose = verbose) if(nrnna > 0){ if(verbose) message(paste0("\n SUMMARY: ", nrnna, " {refmet_name} not found in RefMet Metabolomics Data Dictionary: FAIL")) @@ -543,6 +543,8 @@ validate_metabolomics <- function(input_results_folder, out_qc_folder = NULL, printPDF = TRUE){ + metabolite_name = id_type = NULL + # validate folder structure ----- validate_cas(cas = cas) processfolder <- validate_processFolder(input_results_folder) @@ -777,11 +779,32 @@ validate_metabolomics <- function(input_results_folder, results <- r_m_n[eresults_coln] } + # results: check for negative values + results_long <- results %>% tidyr::pivot_longer(cols = -c(metabolite_name, id_type), + names_to = "sample_id", + values_to = "intensity") + + results_long <- merge(m_s_n, results_long, by = c("sample_id")) + results_long$sample_id <- as.character(results_long$sample_id) + results_long$sample_id <- as.factor(results_long$sample_id) + results_long$sample_type <- as.factor(results_long$sample_type) + results_long <- results_long[which(results_long$intensity != 0),] + results_long <- results_long[!is.na(results_long$intensity),] + if(any(results_long$intensity < 0)){ + message(" - (-) !!!!!!!!!!!!!!!!!!!!!!!!!!") + message(" - (-) NEGATIVE VALUES DETECTED!! ") + message(" - (-) !!!!!!!!!!!!!!!!!!!!!!!!!!") + results_long <- results_long[which(results_long$intensity > 0),] + ic <- ic + 1 + } + plot_basic_metabolomics_qc(results = results, + results_long = results_long, m_s_n = m_s_n, out_qc_folder = out_qc_folder, output_prefix = output_prefix, printPDF = printPDF, + untargeted = untargeted, verbose = verbose) }else{ @@ -790,9 +813,10 @@ validate_metabolomics <- function(input_results_folder, if(f_rmn & f_mmn){ m_m_n <- filter_required_columns(df = m_m_n, - type = "m_m", - name_id = "named", - verbose = FALSE) + type = "m_m", + name_id = "named", + verbose = FALSE) + r_m_merge <- merge(r_m_n, m_m_n, by = "metabolite_name") } } diff --git a/docs/404.html b/docs/404.html index 7c1a50d8..a46d51bf 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ MotrpacBicQC - 0.6.3 + 0.6.4 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index b6431185..bacd4022 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ MotrpacBicQC - 0.6.3 + 0.6.4 diff --git a/docs/articles/index.html b/docs/articles/index.html index a012b26c..3bb8f816 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ MotrpacBicQC - 0.6.3 + 0.6.4 diff --git a/docs/articles/other_functions.html b/docs/articles/other_functions.html index 277a24e2..e63d40f2 100644 --- a/docs/articles/other_functions.html +++ b/docs/articles/other_functions.html @@ -4,7 +4,7 @@ - + MotrpacBicQC: Other Functions @@ -122,7 +122,7 @@

MotrpacBicQC: Other Functions

-

2022-02-25

+

2022-02-27

diff --git a/docs/articles/qc_metabolomics.html b/docs/articles/qc_metabolomics.html index 64a7636d..314390af 100644 --- a/docs/articles/qc_metabolomics.html +++ b/docs/articles/qc_metabolomics.html @@ -4,7 +4,7 @@ - + MotrpacBicQC: Metabolomics QC @@ -124,7 +124,7 @@

MotrpacBicQC: Metabolomics QC

-

2022-02-25

+

2022-02-27

@@ -199,6 +199,21 @@