diff --git a/DESCRIPTION b/DESCRIPTION index c5954f1..0a693e7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: MetaboCoreUtils Title: Core Utils for Metabolomics Data -Version: 1.11.1 +Version: 1.11.2 Description: MetaboCoreUtils defines metabolomics-related core functionality provided as low-level functions to allow a data structure-independent usage across various R packages. This includes functions to calculate between ion diff --git a/NAMESPACE b/NAMESPACE index 6c7bd8d..c00ca19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,12 @@ export(mclosest) export(multiplyElements) export(mz2mass) export(pasteElements) +export(percentMissing) +export(rowBlank) +export(rowDratio) +export(rowPercentMissing) +export(rowRsd) +export(rsd) export(standardizeFormula) export(subtractElements) importFrom(BiocParallel,SerialParam) @@ -36,7 +42,10 @@ importFrom(MsCoreUtils,ppm) importFrom(methods,is) importFrom(stats,approx) importFrom(stats,lm) +importFrom(stats,mad) +importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,predict) +importFrom(stats,sd) importFrom(stats,setNames) importFrom(utils,read.table) diff --git a/NEWS.md b/NEWS.md index 4d82b19..1ce8ea6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # MetaboCoreUtils 1.11 +## MetaboCoreUtils 1.11.2 + +- Add functions to compute quality check of the data (issue + [#77]((https://github.com/rformassspectrometry/MetaboCoreUtils/issues/77)) + ## MetaboCoreUtils 1.11.1 - Add functions to enable linear model-based adjustment of (LC-MS derived) diff --git a/R/mclosest.R b/R/mclosest.R index e016e8b..8531ca7 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -76,7 +76,6 @@ mclosest <- function(x, ppm <- rep(ppm[1], nc) if (length(tolerance) != nc) tolerance <- rep(tolerance[1], nc) - ## Initialize a vector to store closest row indices closest_indices <- rep(NA_integer_ , nrow(x)) for (i in seq_len(nrow(x))) { diff --git a/R/quality-assessment.R b/R/quality-assessment.R new file mode 100644 index 0000000..343559e --- /dev/null +++ b/R/quality-assessment.R @@ -0,0 +1,150 @@ +#' @title Basic quality assessment functions for metabolomics +#' +#' @description +#' +#' The following functions allow to calculate basic quality assessment estimates +#' typically employed in the analysis of metabolomics data. These functions are +#' designed to be applied to entire rows of data, where each row corresponds to +#' a feature. Subsequently, these estimates can serve as a foundation for +#' feature filtering. +#' +#' - `rsd` and `rowRsd` are convenience functions to calculate the relative +#' standard deviation (i.e. coefficient of variation) of a numerical vector +#' or for rows of a numerical matrix, respectively. +#' +#' - `rowDratio` computes the D-ratio or *dispersion ratio*, defined as the +#' standard deviation for QC (Quality Control) samples divided by the +#' standard deviation for biological test samples, for each feature (row) in +#' the matrix. +#' +#' - `percentMissing` and `rowPercentMissing` determine the percentage of +#' missing values in a vector or for each row of a matrix, respectively. +#' +#' - `rowBlank` identifies rows (i.e., features) where the mean of test samples +#' is lower than a specified multiple (defined by the `threshold` parameter) +#' of the mean of blank samples. This can be used to flag features that result +#' from contamination in the solvent of the samples. +#' +#' These functions are based on standard filtering methods described in the +#' literature, and they are implemented to assist in preprocessing metabolomics +#' data. +#' +#' @param x `numeric` For `rsd`, a numeric vector; +#' for `rowRsd`, `rowDratio`, `percentMissing` and `rowBlank`, a numeric +#' matrix representing the biological samples. +#' +#' @param y `numeric` For `rowDratio` and `rowBlank`, a numeric matrix +#' representing feature abundances in QC samples or blank samples, +#' respectively. +#' +#' @param na.rm `logical(1)` indicates whether missing values (`NA`) should be +#' removed prior to the calculations. +#' +#' @param mad `logical(1)` indicates whether the *Median Absolute Deviation* +#' (MAD) should be used instead of the standard deviation. This is suggested +#' for non-gaussian distributed data. +#' +#' @param threshold `numeric` For `rowBlank`, indicates the minimum difference +#' required between the mean of a feature in samples compared to the mean of +#' the same feature in blanks for it to not be considered a possible +#' contaminant. For example, the default threshold of 2 signifies that the mean +#' of the features in samples has to be at least twice the mean in blanks for +#' it not to be flagged as a possible contaminant. +#' +#' @note +#' For `rsd` and `rowRsd` the feature abundances are expected to be provided in +#' natural scale and not e.g. log2 scale as it may lead to incorrect +#' interpretations. +#' +#' @return See individual function description above for details. +#' +#' @author Philippine Louail, Johannes Rainer +#' +#' @md +#' +#' @importFrom stats sd mad median +#' +#' @name quality_assessment +#' +#' @references +#' +#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +#' Dunn WB. Guidelines and considerations for the use of system suitability +#' and quality control samples in mass spectrometry assays applied in +#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +#' PMCID: PMC5960010. +#' +#' @examples +#' +#' ## coefficient of variation +#' a <- c(4.3, 4.5, 3.6, 5.3) +#' rsd(a) +#' +#' A <- rbind(a, a, a) +#' rowRsd(A) +#' +#' ## Dratio +#' x <- c(4.3, 4.5, 3.6, 5.3) +#' X <- rbind(a, a, a) +#' rowDratio(X, X) +#' +#' #' ## Percent Missing +#' b <- c(1, NA, 3, 4, NA) +#' percentMissing(b) +#' +#' B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3) +#' rowPercentMissing(B) +#' +#' ## Blank Rows +#' test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2) +#' blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2) +#' rowBlank(test_samples, blank_samples) +#' +NULL + +#' @export +#' @rdname quality_assessment +rsd <- function(x, na.rm = TRUE, mad = FALSE) { + if (mad) + mad(x, na.rm = na.rm) / abs(median(x, na.rm = na.rm)) + else + sd(x, na.rm = na.rm) / abs(mean(x, na.rm = na.rm)) +} + +#' @export +#' @rdname quality_assessment +rowRsd <- function(x, na.rm = TRUE, mad = FALSE) + apply(x, MARGIN = 1, rsd, na.rm = na.rm, mad = mad) + +#' @export +#' @rdname quality_assessment +rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ + if (mad) + vec <- apply(y, 1, mad, na.rm = na.rm) / + apply(x, 1, mad, na.rm = na.rm) + else + vec <- apply(y, 1, sd, na.rm = na.rm) / + apply(x, 1, sd, na.rm = na.rm) +} + +#' @export +#' @rdname quality_assessment +percentMissing <- function(x){ + ((sum(is.na(x))) / length(x))*100 +} + +#' @export +#' @rdname quality_assessment +rowPercentMissing <- function(x){ + apply(x, MARGIN = 1, percentMissing) +} + +#' @export +#' @rdname quality_assessment + +rowBlank <- function(x, y, threshold = 2, na.rm = TRUE){ + m_samples <- apply(x, 1, mean, na.rm = na.rm) + m_blank <- apply(y, 1, mean, na.rm = na.rm) + m_samples < threshold * m_blank +} diff --git a/man/quality_assessment.Rd b/man/quality_assessment.Rd new file mode 100644 index 0000000..3de5570 --- /dev/null +++ b/man/quality_assessment.Rd @@ -0,0 +1,119 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quality-assessment.R +\name{quality_assessment} +\alias{quality_assessment} +\alias{rsd} +\alias{rowRsd} +\alias{rowDratio} +\alias{percentMissing} +\alias{rowPercentMissing} +\alias{rowBlank} +\title{Basic quality assessment functions for metabolomics} +\usage{ +rsd(x, na.rm = TRUE, mad = FALSE) + +rowRsd(x, na.rm = TRUE, mad = FALSE) + +rowDratio(x, y, na.rm = TRUE, mad = FALSE) + +percentMissing(x) + +rowPercentMissing(x) + +rowBlank(x, y, threshold = 2, na.rm = TRUE) +} +\arguments{ +\item{x}{\code{numeric} For \code{rsd}, a numeric vector; +for \code{rowRsd}, \code{rowDratio}, \code{percentMissing} and \code{rowBlank}, a numeric +matrix representing the biological samples.} + +\item{na.rm}{\code{logical(1)} indicates whether missing values (\code{NA}) should be +removed prior to the calculations.} + +\item{mad}{\code{logical(1)} indicates whether the \emph{Median Absolute Deviation} +(MAD) should be used instead of the standard deviation. This is suggested +for non-gaussian distributed data.} + +\item{y}{\code{numeric} For \code{rowDratio} and \code{rowBlank}, a numeric matrix +representing feature abundances in QC samples or blank samples, +respectively.} + +\item{threshold}{\code{numeric} For \code{rowBlank}, indicates the minimum difference +required between the mean of a feature in samples compared to the mean of +the same feature in blanks for it to not be considered a possible +contaminant. For example, the default threshold of 2 signifies that the mean +of the features in samples has to be at least twice the mean in blanks for +it not to be flagged as a possible contaminant.} +} +\value{ +See individual function description above for details. +} +\description{ +The following functions allow to calculate basic quality assessment estimates +typically employed in the analysis of metabolomics data. These functions are +designed to be applied to entire rows of data, where each row corresponds to +a feature. Subsequently, these estimates can serve as a foundation for +feature filtering. +\itemize{ +\item \code{rsd} and \code{rowRsd} are convenience functions to calculate the relative +standard deviation (i.e. coefficient of variation) of a numerical vector +or for rows of a numerical matrix, respectively. +\item \code{rowDratio} computes the D-ratio or \emph{dispersion ratio}, defined as the +standard deviation for QC (Quality Control) samples divided by the +standard deviation for biological test samples, for each feature (row) in +the matrix. +\item \code{percentMissing} and \code{rowPercentMissing} determine the percentage of +missing values in a vector or for each row of a matrix, respectively. +\item \code{rowBlank} identifies rows (i.e features) where the mean of test samples +is lower than twice the mean of blank samples. This can be used to flag +features that results from contamination in the solvent of the samples. +Returns a \code{logical} vector of length equal to the number of rows of \code{x}. +} + +These functions are based on standard filtering methods described in the +literature, and they are implemented to assist in preprocessing metabolomics +data. +} +\note{ +For \code{rsd} and \code{rowRsd} the feature abundances are expected to be provided in +natural scale and not e.g. log2 scale as it may lead to incorrect +interpretations. +} +\examples{ + +## coefficient of variation +a <- c(4.3, 4.5, 3.6, 5.3) +rsd(a) + +A <- rbind(a, a, a) +rowRsd(A) + +## Dratio +x <- c(4.3, 4.5, 3.6, 5.3) +X <- rbind(a, a, a) +rowDratio(X, X) + +#' ## Percent Missing +b <- c(1, NA, 3, 4, NA) +percentMissing(b) + +B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3) +rowPercentMissing(B) + +## Blank Rows +test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2) +blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2) +rowBlank(test_samples, blank_samples) + +} +\references{ +Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +Dunn WB. Guidelines and considerations for the use of system suitability +and quality control samples in mass spectrometry assays applied in +untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +PMCID: PMC5960010. +} +\author{ +Philippine Louail, Johannes Rainer +} diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R new file mode 100644 index 0000000..a771ccb --- /dev/null +++ b/tests/testthat/test_function-filtering.R @@ -0,0 +1,32 @@ +test_that("Metabolomics Filtering Functions", { + + # Define some sample data for testing + a <- c(3.2, 4.1, 3.9, 4.8) + A <- rbind(a, a, a) + b <- c(2, NA, 1, 3, NA) + B <- matrix(c(2, NA, 1, 3, NA, 6, 7, 8, 9, 12), nrow = 2) + test_samples <- matrix(c(13, 21, 1, 3, 5, 6), nrow = 3) + blank_samples <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3) + + # Test rsd function + expect_equal(rsd(a), sd(a) / mean(a)) + expect_equal(rowRsd(A), apply(A, 1, function(row) sd(row) / mean(row))) + expect_equal(rsd(a, mad = TRUE), mad(a, na.rm = TRUE) / + abs(median(a, na.rm = TRUE))) + expect_equal(rowRsd(A, mad = TRUE), + apply(A, 1, function(row) mad(row, na.rm = TRUE) / + abs(median(row, na.rm = TRUE)))) + + + # Test rowDratio function + expect_equal(as.numeric(rowDratio(A, A)), rep(1, nrow(A))) + expect_equal(as.numeric(rowDratio(A, A, mad = TRUE)), rep(1, nrow(A))) + + # Test percentMissing function + expect_equal(percentMissing(b), 40) + res <- c() + expect_equal(rowPercentMissing(B), rep(20, nrow(B))) + + # Test rowBlank function + expect_equal(rowBlank(test_samples, blank_samples), c(FALSE, FALSE, TRUE)) + }) diff --git a/vignettes/MetaboCoreUtils.Rmd b/vignettes/MetaboCoreUtils.Rmd index 348f25e..b8eec43 100644 --- a/vignettes/MetaboCoreUtils.Rmd +++ b/vignettes/MetaboCoreUtils.Rmd @@ -546,6 +546,102 @@ Generally, injecting study samples in random order can reduce (or even avoid) influence of any related technical bias in the downstream analysis and is highly suggested to improve and assure data quality. +## Basic quality assessment and pre-filtering of metabolomics data + +When dealing with metabolomics results, it is often necessary to filter +features based on certain criteria. These criteria are typically derived +from statistical formulas applied to full rows of data, where each row +represents a feature. In this tutorial, we'll explore a set of functions +designed designed to calculate basic quality assessment metrics on which +metabolomics data can subsequently be filtered. + +First, to get more information on the available function you can check the documentation + +```{r} +?quality_assessment +``` + +We will use a matrix representing metabolomics measurements from different +samples. Let's start by introducing the data: + +```{r} +# Define sample data for metabolomics analysis +set.seed(123) +metabolomics_data <- matrix(rnorm(100), nrow = 10) +colnames(metabolomics_data) <- paste0("Sample", 1:10) +rownames(metabolomics_data) <- paste0("Feature", 1:10) +``` + +We will begin by calculating the coefficient of variation (CV) for each feature. +This measure helps assess the relative variability of each metabolite across +different samples. + +```{r} +# Calculate and display the coefficient of variation +cv_result <- rowRsd(metabolomics_data) +print(cv_result) +``` + +Next, we will compute the D-ratio [@broadhurst_guidelines_2018], a measure of +dispersion, by comparing the standard deviation of QC samples to that of +biological test samples. + +```{r} +# Generate QC samples +qc_samples <- matrix(rnorm(40), nrow = 10) +colnames(qc_samples) <- paste0("QC", 1:4) + +# Calculate D-ratio and display the result +dratio_result <- rowDratio(metabolomics_data, qc_samples) +print(dratio_result) +``` + +Now, let's analyze the percentage of missing values for each metabolite. +This information is crucial for quality control and data preprocessing. + +```{r} +# Introduce missing values in the data +metabolomics_data[sample(1:100, 10)] <- NA + +# Calculate and display the percentage of missing values +missing_result <- rowPercentMissing(metabolomics_data) +print(missing_result) +``` + +Finally, we will identify features where the mean of test samples is lower +than twice the mean of blank samples. This can be indicative of significant +contamination in the solvent of the samples. + +```{r} +# Generate blank samples +blank_samples <- matrix(rnorm(30), nrow = 10) +colnames(blank_samples) <- paste0("Blank", 1:3) + +# Detect rows where mean(test) > 2 * mean(blank) +blank_detection_result <- rowBlank(metabolomics_data, blank_samples) +print(blank_detection_result) +``` + +All of these computations can then be used to easily filter our data and remove +the features that do not fit our quality criteria. Below we remove all features +that have a D-ratio and coefficeint of variation < 0.8 with no missing values +and is not flagged to be a possible solvent contaminant. + +```{r} +# Set filtering thresholds +cv_threshold <- 8 +dratio_threshold <- 0.8 + +# Apply filters +filtered_data <- metabolomics_data[ + cv_result <= cv_threshold & + dratio_result <= dratio_threshold & + missing_result <= 10 & + !blank_detection_result, , drop = FALSE] + +# Display the filtered data +print(filtered_data) +``` # Contributions diff --git a/vignettes/references.bib b/vignettes/references.bib index 884f9f5..cc55b22 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -29,3 +29,20 @@ @article{wehrens_improved_2016 keywords = {Methodology, Data analysis, Mass Spectrometry, Metabolomics, Normalisation}, pages = {88} } + +@article{broadhurst_guidelines_2018, + title = {Guidelines and considerations for the use of system suitability and quality control samples in mass spectrometry assays applied in untargeted clinical metabolomic studies}, + volume = {14}, + issn = {1573-3882, 1573-3890}, + url = {http://link.springer.com/10.1007/s11306-018-1367-3}, + doi = {10.1007/s11306-018-1367-3}, + language = {en}, + number = {6}, + urldate = {2024-01-10}, + journal = {Metabolomics}, + author = {Broadhurst, David and Goodacre, Royston and Reinke, Stacey N. and Kuligowski, Julia and Wilson, Ian D. and Lewis, Matthew R. and Dunn, Warwick B.}, + month = jun, + year = {2018}, + pages = {72}, + file = {Full Text:C\:\\Users\\plouail\\Zotero\\storage\\L9RKVUIY\\Broadhurst et al. - 2018 - Guidelines and considerations for the use of syste.pdf:application/pdf}, +}