From cb9aedc19bb5d0ddcded1c4e2f4dcfd113de0d9d Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Thu, 5 Oct 2023 14:46:17 +0200 Subject: [PATCH 01/13] Update mclosest.Rd --- man/mclosest.Rd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/mclosest.Rd b/man/mclosest.Rd index 1b971d7..3c4bd26 100644 --- a/man/mclosest.Rd +++ b/man/mclosest.Rd @@ -36,7 +36,7 @@ of \code{x} and \code{table}. It returns the index of the closest row in \code{t each row in \code{x}. } \details{ -If, for a row of \code{x}, 2 rows of \code{table} are closest only the index of first +If, for a row of \code{x}, two rows of \code{table} are closest only the index of first row will be returned. For both the \code{tolerance} and \code{ppm} arguments, if their length is different to @@ -47,8 +47,8 @@ replicated to match it. x <- data.frame(a = 1:5, b = 3:7) table <- data.frame(c = c(11, 23, 3, 5, 1), d = c(32:35, 45)) -## Get for each row of `x` the index of the row in `table` with the smallest difference -## of values (per column) +## Get for each row of `x` the index of the row in `table` with the smallest +difference of values (per column) mclosest(x, table) ## If the absolute difference is larger than `tolerance`, return `NA`. Note @@ -58,5 +58,5 @@ mclosest(x, table, tolerance = 25) } \author{ -Philippine Louail +Philippine Louail, Johannes Rainer } From 621d352913310c459af9f9e71fccf8237812e587 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Thu, 5 Oct 2023 14:56:37 +0200 Subject: [PATCH 02/13] Update mclosest.R --- R/mclosest.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/mclosest.R b/R/mclosest.R index 7ccf386..d0411d9 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -47,7 +47,7 @@ #' table <- data.frame(c = c(11, 23, 3, 5, 1), d = c(32:35, 45)) #' #' ## Get for each row of `x` the index of the row in `table` with the smallest -#' difference of values (per column) +#' ##difference of values (per column) #' mclosest(x, table) #' #' ## If the absolute difference is larger than `tolerance`, return `NA`. Note From abbad0d9d652194e06616cfbd8663a66fb7de8b2 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Fri, 6 Oct 2023 08:10:02 +0200 Subject: [PATCH 03/13] Update mclosest.R --- R/mclosest.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/mclosest.R b/R/mclosest.R index d0411d9..024315f 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -1,4 +1,4 @@ -#'@title Extract closest values in a pairwise manner between two matrices +#' @title Extract closest values in a pairwise manner between two matrices #' #' @description #' @@ -47,7 +47,7 @@ #' table <- data.frame(c = c(11, 23, 3, 5, 1), d = c(32:35, 45)) #' #' ## Get for each row of `x` the index of the row in `table` with the smallest -#' ##difference of values (per column) +#' ## difference of values (per column) #' mclosest(x, table) #' #' ## If the absolute difference is larger than `tolerance`, return `NA`. Note From 6da2f245822cc951b811f6eac2bd68805dba4838 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Mon, 16 Oct 2023 09:12:22 +0200 Subject: [PATCH 04/13] Update mclosest.R --- R/mclosest.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/mclosest.R b/R/mclosest.R index 024315f..3a40f8e 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -36,7 +36,7 @@ #' the number of columns of `x` and `table`, the input argument will be #' replicated to match it. #' -#' @author Philippine Louail, Johannes Rainer +#' @author Philippine Louail #' #' @importFrom MsCoreUtils ppm #' From 68f6b8780082b01238160950300e2b004982862c Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Mon, 16 Oct 2023 09:54:46 +0200 Subject: [PATCH 05/13] commit to get main branch --- R/mclosest.R | 121 +++++++++++++++++++++++------------------------- man/mclosest.Rd | 8 ++-- 2 files changed, 62 insertions(+), 67 deletions(-) diff --git a/R/mclosest.R b/R/mclosest.R index 3a40f8e..5798887 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -1,98 +1,93 @@ -#' @title Extract closest values in a pairwise manner between two matrices +#'@title Extract closest values in a pairwise manner between two matrices #' #' @description #' -#' The `mclosest` function calculates the closest rows between two matrices -#' (or data frames) considering pairwise differences between values in columns -#' of `x` and `table`. It returns the index of the closest row in `table` for -#' each row in `x`. +#' The `mclosest` function calculates the closest rows between two matrices +#' (or data frames) considering pairwise differences between values in columns +#' of `x` and `table`. It returns the index of the closest row in `table` for +#' each row in `x`. #' -#' @param x `numeric` matrix or data frame representing the query data. Each -#' row in `x` will be compared to every row in `table`. Both `x` and `table` are -#' expected to have the same number of columns, and the columns are expected to +#' @param x `numeric` matrix or data frame representing the query data. Each +#' row in `x` will be compared to every row in `table`. Both `x` and `table` are +#' expected to have the same number of columns, and the columns are expected to #' be in the same order. #' -#' @param table `numeric` matrix or data frame containing the reference data to -#' be matched with each row of `x`. Each row in `table` will be compared to -#' every row in `x`. Both `table` and `x` are expected to have the same number +#' @param table `numeric` matrix or data frame containing the reference data to +#' be matched with each row of `x`. Each row in `table` will be compared to +#' every row in `x`. Both `table` and `x` are expected to have the same number #' of columns, and the columns are expected to be in the same order. #' -#' @param ppm `numeric` representing a relative, value-specific -#' parts-per-million (PPM) tolerance that is added to tolerance (default is 0). +#' @param ppm `numeric` representing a relative, value-specific +#' parts-per-million (PPM) tolerance that is added to tolerance (default is 0). #' -#' @param tolerance `numeric` accepted tolerance. Defaults to `tolerance = Inf`, -#' thus for each row in x the closest row in table is reported, regardless of +#' @param tolerance `numeric` accepted tolerance. Defaults to `tolerance = Inf`, +#' thus for each row in x the closest row in table is reported, regardless of #' the magnitude of the (absolute) difference. #' #' @return `integer` vector of indices indicating the closest row of `table` for -#' each row of `x`. If no suitable match is found for a row in `x` based on the +#' each row of `x`. If no suitable match is found for a row in `x` based on the #' specified `tolerance` and `ppm`, the corresponding index is set to `NA`. -#' +#' #' @details -#' If, for a row of `x`, two rows of `table` are closest only the index of first -#' row will be returned. -#' -#' For both the `tolerance` and `ppm` arguments, if their length is different to -#' the number of columns of `x` and `table`, the input argument will be -#' replicated to match it. +#' If, for a row of `x`, two rows of `table` are closest only the index of first +#' row will be returned. +#' +#' For both the `tolerance` and `ppm` arguments, if their length is different to +#' the number of columns of `x` and `table`, the input argument will be +#' replicated to match it. #' #' @author Philippine Louail -#' +#' #' @importFrom MsCoreUtils ppm #' -#' @export +#' @export #' #' @examples #' x <- data.frame(a = 1:5, b = 3:7) #' table <- data.frame(c = c(11, 23, 3, 5, 1), d = c(32:35, 45)) #' -#' ## Get for each row of `x` the index of the row in `table` with the smallest +#' ## Get for each row of `x` the index of the row in `table` with the smallest #' ## difference of values (per column) #' mclosest(x, table) #' #' ## If the absolute difference is larger than `tolerance`, return `NA`. Note -#' ## that the tolerance value of `25` is used for difference for each pairwise +#' ## that the tolerance value of `25` is used for difference for each pairwise #' ## column in `x` and `table`. #' mclosest(x, table, tolerance = 25) #' - mclosest <- function(x, table, ppm = 0, tolerance = Inf) { - ## sanity checks - if (is.null(dim(x))) - stop("'x' needs to be an array") - if (is.null(dim(table))) - stop("'table' needs to be an array") - if (ncol(x) != ncol(table)) - stop("'x' and 'table' need to have same number of columns") - if (!is.matrix(x)) - x <- as.matrix(x) - if (!is.matrix(table)) - table <- as.matrix(table) - nc <- ncol(x) - nr <- nrow(table) - if (length(ppm) != nc) - ppm <- rep(ppm[1], nc) - if (length(tolerance) != nc) - tolerance <- rep(tolerance[1], nc) - - ## Initialize a vector to store closest row indices - closest_indices <- rep(NA_integer_ , nrow(x)) - for (i in seq_len(nrow(x))) { - abdiff <- abs(table - rep(x[i, ], each = nr)) - ## Remove differences lower than tolerance - abdiff[abdiff > rep((tolerance + ppm(x[i,], ppm)), each = nr)] <- NA - ranked <- apply(abdiff, 2, rank, na.last="keep") - rowProd <- apply(ranked, 1, prod) - res <- which.min(rowProd) - if (length(res)) - closest_indices[i] <- res - } - closest_indices -} - - - + ## sanity checks + if (is.null(dim(x))) + stop("'x' needs to be an array") + if (is.null(dim(table))) + stop("'table' needs to be an array") + if (ncol(x) != ncol(table)) + stop("'x' and 'table' need to have same number of columns") + if (!is.matrix(x)) + x <- as.matrix(x) + if (!is.matrix(table)) + table <- as.matrix(table) + nc <- ncol(x) + nr <- nrow(table) + if (length(ppm) != nc) + ppm <- rep(ppm[1], nc) + if (length(tolerance) != nc) + tolerance <- rep(tolerance[1], nc) + ## Initialize a vector to store closest row indices + closest_indices <- rep(NA_integer_ , nrow(x)) + for (i in seq_len(nrow(x))) { + abdiff <- abs(table - rep(x[i, ], each = nr)) + ## Remove differences lower than tolerance + abdiff[abdiff > rep((tolerance + ppm(x[i,], ppm)), each = nr)] <- NA + ranked <- apply(abdiff, 2, rank, na.last="keep") + rowProd <- apply(ranked, 1, prod) + res <- which.min(rowProd) + if (length(res)) + closest_indices[i] <- res + } + closest_indices +} diff --git a/man/mclosest.Rd b/man/mclosest.Rd index 3c4bd26..631710d 100644 --- a/man/mclosest.Rd +++ b/man/mclosest.Rd @@ -47,16 +47,16 @@ replicated to match it. x <- data.frame(a = 1:5, b = 3:7) table <- data.frame(c = c(11, 23, 3, 5, 1), d = c(32:35, 45)) -## Get for each row of `x` the index of the row in `table` with the smallest -difference of values (per column) +## Get for each row of `x` the index of the row in `table` with the smallest +## difference of values (per column) mclosest(x, table) ## If the absolute difference is larger than `tolerance`, return `NA`. Note -## that the tolerance value of `25` is used for difference for each pairwise +## that the tolerance value of `25` is used for difference for each pairwise ## column in `x` and `table`. mclosest(x, table, tolerance = 25) } \author{ -Philippine Louail, Johannes Rainer +Philippine Louail } From acf8b55e1993483bbaae9dfdbcc2a4537d7c6cb5 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Mon, 16 Oct 2023 10:02:36 +0200 Subject: [PATCH 06/13] Update mclosest.R --- R/mclosest.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/mclosest.R b/R/mclosest.R index 5798887..8531ca7 100644 --- a/R/mclosest.R +++ b/R/mclosest.R @@ -76,7 +76,6 @@ mclosest <- function(x, ppm <- rep(ppm[1], nc) if (length(tolerance) != nc) tolerance <- rep(tolerance[1], nc) - ## Initialize a vector to store closest row indices closest_indices <- rep(NA_integer_ , nrow(x)) for (i in seq_len(nrow(x))) { From 990fa69396db4560dd2ab3eaf4302e32dd45304c Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 10 Jan 2024 11:26:54 +0100 Subject: [PATCH 07/13] Addition of filtering functions --- NAMESPACE | 9 ++ R/function-filtering.R | 148 +++++++++++++++++++++++ man/filteringFunctions.Rd | 106 ++++++++++++++++ tests/testthat/test_function-filtering.R | 26 ++++ vignettes/MetaboCoreUtils.Rmd | 3 + 5 files changed, 292 insertions(+) create mode 100644 R/function-filtering.R create mode 100644 man/filteringFunctions.Rd create mode 100644 tests/testthat/test_function-filtering.R diff --git a/NAMESPACE b/NAMESPACE index 6c7bd8d..c00ca19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,12 @@ export(mclosest) export(multiplyElements) export(mz2mass) export(pasteElements) +export(percentMissing) +export(rowBlank) +export(rowDratio) +export(rowPercentMissing) +export(rowRsd) +export(rsd) export(standardizeFormula) export(subtractElements) importFrom(BiocParallel,SerialParam) @@ -36,7 +42,10 @@ importFrom(MsCoreUtils,ppm) importFrom(methods,is) importFrom(stats,approx) importFrom(stats,lm) +importFrom(stats,mad) +importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,predict) +importFrom(stats,sd) importFrom(stats,setNames) importFrom(utils,read.table) diff --git a/R/function-filtering.R b/R/function-filtering.R new file mode 100644 index 0000000..0a466fe --- /dev/null +++ b/R/function-filtering.R @@ -0,0 +1,148 @@ +#' @title Basic filtering functions for metabolomics +#' +#' @description +#' +#' When dealing with metabolomics results, it is often necessary to filter +#' features based on certain criteria. These criteria are typically derived +#' from statistical formulas applied to full rows of data, where each row +#' represents a feature. The following functions provide basic filtering +#' methods commonly used in the analysis of metabolomics data. +#' +#' - `rsd` and `rowRsd` are convenience functions to calculate the relative +#' standard deviation (i.e. coefficient of variation) of a numerical vector +#' or for rows of a numerical matrix, respectively. +#' +#' - `rowDratio` computes the D-ratio or "dispersion ratio," defined as the +#' standard deviation for QC (Quality Control) samples divided by the +#' standard deviation for biological test samples, for each feature (row) in +#' the matrix. +#' +#' - `percentMissing` and `rowPercentMissing` determine the percentage of +#' missing values in a vector or for each row of a matrix, respectively. +#' +#' - `rowBlank` identifies rows (i.e features) where the mean of test samples +#' is greater than twice the mean of blank samples. This can highlights +#' features that results from contamination in the solvent of the samples. +#' Return a `logical` vector +#' +#' These functions are based on standard filtering methods described in the +#' literature, and they are implemented to assist in preprocessing metabolomics +#' data. +#' +#' @param x `numeric` For `rsd`, a numeric vector; +#' for `rowRsd`, `rowDratio`, `percentMissing` and `rowBlank`, a numeric +#' matrix representing the biological samples. +#' +#' @param y `numeric` For `rowDratio` and `rowBlank`, a numeric matrix +#' representing the QC samples and blank samples, respectively. +#' +#' @param na.rm `logical(1)` indicate whether missing values (`NA`) should be +#' removed prior to the calculations. +#' +#' @param mad `logical(1)` indicate whether the *Median Absolute Deviation* +#' (MAD) should be used instead of the standard deviation. This is suggested +#' for non-gaussian distributed data. +#' +#' @return See individual function description above for details. +#' +#' @author Philippine Louail, Johannes Rainer +#' +#' @md +#' +#' @importFrom stats sd mad median +#' +#' @name filteringFunctions +#' +#' @references +#' +#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +#' Dunn WB. Guidelines and considerations for the use of system suitability +#' and quality control samples in mass spectrometry assays applied in +#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; +#' PMCID: PMC5960010. +#' +#' @examples +#' +#' ## coefficient of variation +#' a <- c(4.3, 4.5, 3.6, 5.3) +#' rsd(a) +#' +#' A <- rbind(a, a, a) +#' rowRsd(A) +#' +#' ## Dratio +#' x <- c(4.3, 4.5, 3.6, 5.3) +#' X <- rbind(a, a, a) +#' rowDratio(X, X) +#' +#' #' ## Percent Missing +#' b <- c(1, NA, 3, 4, NA) +#' percentMissing(b) +#' +#' B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3) +#' rowPercentMissing(B) +#' +#' ## Blank Rows +#' test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2) +#' blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2) +#' rowBlank(test_samples, blank_samples) +#' +NULL + +#' @export +#' @rdname filteringFunctions +#' + +rsd <- function(x, na.rm = TRUE, mad = FALSE) { + if (mad) + mad(x, na.rm = na.rm) / abs(median(x, na.rm = na.rm)) + else + sd(x, na.rm = na.rm) / abs(mean(x, na.rm = na.rm)) +} + +#' @rdname filteringFunctions +#' @export +rowRsd <- function(x, na.rm = TRUE, mad = FALSE) + apply(x, MARGIN = 1, rsd, na.rm = na.rm, mad = mad) + + +#' @export +#' @rdname filteringFunctions +#' + +rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ + if (mad) + vec <- apply(y, 1, mad, na.rm = na.rm) / + apply(x, 1, mad, na.rm = na.rm) + else + vec <- apply(y, 1, sd, na.rm = na.rm) / + apply(x, 1, sd, na.rm = na.rm) +} + + +#' @export +#' @rdname filteringFunctions + + +percentMissing <- function(x){ + ((sum(is.na(x))) / length(x))*100 +} + +#' @export +#' @rdname filteringFunctions +#' +rowPercentMissing <- function(x){ + apply(x, MARGIN = 1, percentMissing) +} + + +#' @export +#' @rdname filteringFunctions +#' + +rowBlank <- function(x, y, na.rm = TRUE){ + m_samples <- apply(x, 1, mean, na.rm = na.rm) + m_blank <- apply(y, 1, mean, na.rm = na.rm) + vec <- m_samples > 2 * m_blank +} diff --git a/man/filteringFunctions.Rd b/man/filteringFunctions.Rd new file mode 100644 index 0000000..c2a60d3 --- /dev/null +++ b/man/filteringFunctions.Rd @@ -0,0 +1,106 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/function-filtering.R +\name{filteringFunctions} +\alias{filteringFunctions} +\alias{rsd} +\alias{rowRsd} +\alias{rowDratio} +\alias{percentMissing} +\alias{rowPercentMissing} +\alias{rowBlank} +\title{Basic filtering functions for metabolomics} +\usage{ +rsd(x, na.rm = TRUE, mad = FALSE) + +rowRsd(x, na.rm = TRUE, mad = FALSE) + +rowDratio(x, y, na.rm = TRUE, mad = FALSE) + +percentMissing(x) + +rowPercentMissing(x) + +rowBlank(x, y, na.rm = TRUE) +} +\arguments{ +\item{x}{\code{numeric} For \code{rsd}, a numeric vector; +for \code{rowRsd}, \code{rowDratio}, \code{percentMissing} and \code{rowBlank}, a numeric +matrix representing the biological samples.} + +\item{na.rm}{\code{logical(1)} indicate whether missing values (\code{NA}) should be +removed prior to the calculations.} + +\item{mad}{\code{logical(1)} indicate whether the \emph{Median Absolute Deviation} +(MAD) should be used instead of the standard deviation. This is suggested +for non-gaussian distributed data.} + +\item{y}{\code{numeric} For \code{rowDratio} and \code{rowBlank}, a numeric matrix +representing the QC samples and blank samples, respectively.} +} +\value{ +See individual function description above for details. +} +\description{ +When dealing with metabolomics results, it is often necessary to filter +features based on certain criteria. These criteria are typically derived +from statistical formulas applied to full rows of data, where each row +represents a feature. The following functions provide basic filtering +methods commonly used in the analysis of metabolomics data. +\itemize{ +\item \code{rsd} and \code{rowRsd} are convenience functions to calculate the relative +standard deviation (i.e. coefficient of variation) of a numerical vector +or for rows of a numerical matrix, respectively. +\item \code{rowDratio} computes the D-ratio or "dispersion ratio," defined as the +standard deviation for QC (Quality Control) samples divided by the +standard deviation for biological test samples, for each feature (row) in +the matrix. +\item \code{percentMissing} and \code{rowPercentMissing} determine the percentage of +missing values in a vector or for each row of a matrix, respectively. +\item \code{rowBlank} identifies rows (i.e features) where the mean of test samples +is greater than twice the mean of blank samples. This can highlights +features that results from contamination in the solvent of the samples. +Return a \code{logical} vector +} + +These functions are based on standard filtering methods described in the +literature, and they are implemented to assist in preprocessing metabolomics +data. +} +\examples{ + +## coefficient of variation +a <- c(4.3, 4.5, 3.6, 5.3) +rsd(a) + +A <- rbind(a, a, a) +rowRsd(A) + +## Dratio +x <- c(4.3, 4.5, 3.6, 5.3) +X <- rbind(a, a, a) +rowDratio(X, X) + +#' ## Percent Missing +b <- c(1, NA, 3, 4, NA) +percentMissing(b) + +B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3) +rowPercentMissing(B) + +## Blank Rows +test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2) +blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2) +rowBlank(test_samples, blank_samples) + +} +\references{ +Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, +Dunn WB. Guidelines and considerations for the use of system suitability +and quality control samples in mass spectrometry assays applied in +untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. +doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; \ +PMCID: PMC5960010. +} +\author{ +Philippine Louail, Johannes Rainer +} diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R new file mode 100644 index 0000000..a90d7af --- /dev/null +++ b/tests/testthat/test_function-filtering.R @@ -0,0 +1,26 @@ +# Create a test context +test_that("Metabolomics Filtering Functions", { + + # Define some sample data for testing + a <- c(3.2, 4.1, 3.9, 4.8) + A <- rbind(a, a, a) + b <- c(2, NA, 1, 3, NA) + B <- matrix(c(2, NA, 1, 3, NA, 6, 7, 8, 9, 12), nrow = 2) + test_samples <- matrix(c(13, 21, 1, 3, 5, 6), nrow = 3) + blank_samples <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3) + + # Test rsd function + expect_equal(rsd(a), sd(a) / mean(a)) + expect_equal(rowRsd(A), apply(A, 1, function(row) sd(row) / mean(row))) + + # Test rowDratio function + expect_equal(as.numeric(rowDratio(A, A)), rep(1, nrow(A))) + + # Test percentMissing function + expect_equal(percentMissing(b), 40) + res <- c() + expect_equal(rowPercentMissing(B), rep(20, nrow(B))) + + # Test rowBlank function + expect_equal(rowBlank(test_samples, blank_samples), c(TRUE, TRUE, FALSE)) + }) diff --git a/vignettes/MetaboCoreUtils.Rmd b/vignettes/MetaboCoreUtils.Rmd index 348f25e..0c4f1de 100644 --- a/vignettes/MetaboCoreUtils.Rmd +++ b/vignettes/MetaboCoreUtils.Rmd @@ -546,6 +546,9 @@ Generally, injecting study samples in random order can reduce (or even avoid) influence of any related technical bias in the downstream analysis and is highly suggested to improve and assure data quality. +## Filtering data: Identifying measurement error + + # Contributions From 9185c0ce5e89f74b0eaa059662f37a5e70add3cd Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 10 Jan 2024 11:44:38 +0100 Subject: [PATCH 08/13] add NEWS and description info --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/function-filtering.R | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c5954f1..0a693e7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: MetaboCoreUtils Title: Core Utils for Metabolomics Data -Version: 1.11.1 +Version: 1.11.2 Description: MetaboCoreUtils defines metabolomics-related core functionality provided as low-level functions to allow a data structure-independent usage across various R packages. This includes functions to calculate between ion diff --git a/NEWS.md b/NEWS.md index 4d82b19..5767197 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # MetaboCoreUtils 1.11 +## MetaboCoreUtils 1.11.2 + +- Add function to compute quality check of the data (issue + [#77]((https://github.com/rformassspectrometry/MetaboCoreUtils/issues/77)) + ## MetaboCoreUtils 1.11.1 - Add functions to enable linear model-based adjustment of (LC-MS derived) diff --git a/R/function-filtering.R b/R/function-filtering.R index 0a466fe..d67a15d 100644 --- a/R/function-filtering.R +++ b/R/function-filtering.R @@ -124,7 +124,6 @@ rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ #' @export #' @rdname filteringFunctions - percentMissing <- function(x){ ((sum(is.na(x))) / length(x))*100 } From 1cd05ae99e29c87776af12811f1605600a91d04d Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 10 Jan 2024 11:44:59 +0100 Subject: [PATCH 09/13] Update NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5767197..1ce8ea6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ ## MetaboCoreUtils 1.11.2 -- Add function to compute quality check of the data (issue +- Add functions to compute quality check of the data (issue [#77]((https://github.com/rformassspectrometry/MetaboCoreUtils/issues/77)) ## MetaboCoreUtils 1.11.1 From dbf7c011bd1936ab6d06063acf65f58e62900a8f Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 10 Jan 2024 13:51:22 +0100 Subject: [PATCH 10/13] fix: johannes comment and add vignette --- R/function-filtering.R | 29 ++++---- man/filteringFunctions.Rd | 16 +++-- tests/testthat/test_function-filtering.R | 10 ++- vignettes/MetaboCoreUtils.Rmd | 91 ++++++++++++++++++++++++ vignettes/references.bib | 17 +++++ 5 files changed, 139 insertions(+), 24 deletions(-) diff --git a/R/function-filtering.R b/R/function-filtering.R index d67a15d..19038f1 100644 --- a/R/function-filtering.R +++ b/R/function-filtering.R @@ -12,7 +12,7 @@ #' standard deviation (i.e. coefficient of variation) of a numerical vector #' or for rows of a numerical matrix, respectively. #' -#' - `rowDratio` computes the D-ratio or "dispersion ratio," defined as the +#' - `rowDratio` computes the D-ratio or *dispersion ratio*, defined as the #' standard deviation for QC (Quality Control) samples divided by the #' standard deviation for biological test samples, for each feature (row) in #' the matrix. @@ -21,9 +21,9 @@ #' missing values in a vector or for each row of a matrix, respectively. #' #' - `rowBlank` identifies rows (i.e features) where the mean of test samples -#' is greater than twice the mean of blank samples. This can highlights +#' is lower than twice the mean of blank samples. This can be used to flag #' features that results from contamination in the solvent of the samples. -#' Return a `logical` vector +#' Returns a `logical` vector of length equal to the number of rows of `x`. #' #' These functions are based on standard filtering methods described in the #' literature, and they are implemented to assist in preprocessing metabolomics @@ -34,7 +34,8 @@ #' matrix representing the biological samples. #' #' @param y `numeric` For `rowDratio` and `rowBlank`, a numeric matrix -#' representing the QC samples and blank samples, respectively. +#' representing feature abundances in QC samples or blank samples, +#' respectively. #' #' @param na.rm `logical(1)` indicate whether missing values (`NA`) should be #' removed prior to the calculations. @@ -43,6 +44,11 @@ #' (MAD) should be used instead of the standard deviation. This is suggested #' for non-gaussian distributed data. #' +#' @note +#' For `rsd` and `rowRsd` the feature abundances are expected to be provided +#' in natural scale and not e.g. log2 scale as it may lead to incorrect +#' interpretations. +#' #' @return See individual function description above for details. #' #' @author Philippine Louail, Johannes Rainer @@ -92,8 +98,6 @@ NULL #' @export #' @rdname filteringFunctions -#' - rsd <- function(x, na.rm = TRUE, mad = FALSE) { if (mad) mad(x, na.rm = na.rm) / abs(median(x, na.rm = na.rm)) @@ -101,16 +105,13 @@ rsd <- function(x, na.rm = TRUE, mad = FALSE) { sd(x, na.rm = na.rm) / abs(mean(x, na.rm = na.rm)) } -#' @rdname filteringFunctions #' @export +#' @rdname filteringFunctions rowRsd <- function(x, na.rm = TRUE, mad = FALSE) apply(x, MARGIN = 1, rsd, na.rm = na.rm, mad = mad) - #' @export #' @rdname filteringFunctions -#' - rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ if (mad) vec <- apply(y, 1, mad, na.rm = na.rm) / @@ -120,28 +121,22 @@ rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ apply(x, 1, sd, na.rm = na.rm) } - #' @export #' @rdname filteringFunctions - percentMissing <- function(x){ ((sum(is.na(x))) / length(x))*100 } #' @export #' @rdname filteringFunctions -#' rowPercentMissing <- function(x){ apply(x, MARGIN = 1, percentMissing) } - #' @export #' @rdname filteringFunctions -#' - rowBlank <- function(x, y, na.rm = TRUE){ m_samples <- apply(x, 1, mean, na.rm = na.rm) m_blank <- apply(y, 1, mean, na.rm = na.rm) - vec <- m_samples > 2 * m_blank + m_samples < 2 * m_blank } diff --git a/man/filteringFunctions.Rd b/man/filteringFunctions.Rd index c2a60d3..38ee320 100644 --- a/man/filteringFunctions.Rd +++ b/man/filteringFunctions.Rd @@ -35,7 +35,8 @@ removed prior to the calculations.} for non-gaussian distributed data.} \item{y}{\code{numeric} For \code{rowDratio} and \code{rowBlank}, a numeric matrix -representing the QC samples and blank samples, respectively.} +representing feature abundances in QC samples or blank samples, +respectively.} } \value{ See individual function description above for details. @@ -50,22 +51,27 @@ methods commonly used in the analysis of metabolomics data. \item \code{rsd} and \code{rowRsd} are convenience functions to calculate the relative standard deviation (i.e. coefficient of variation) of a numerical vector or for rows of a numerical matrix, respectively. -\item \code{rowDratio} computes the D-ratio or "dispersion ratio," defined as the +\item \code{rowDratio} computes the D-ratio or \emph{dispersion ratio}, defined as the standard deviation for QC (Quality Control) samples divided by the standard deviation for biological test samples, for each feature (row) in the matrix. \item \code{percentMissing} and \code{rowPercentMissing} determine the percentage of missing values in a vector or for each row of a matrix, respectively. \item \code{rowBlank} identifies rows (i.e features) where the mean of test samples -is greater than twice the mean of blank samples. This can highlights +is lower than twice the mean of blank samples. This can be used to flag features that results from contamination in the solvent of the samples. -Return a \code{logical} vector +Returns a \code{logical} vector of length equal to the number of rows of \code{x}. } These functions are based on standard filtering methods described in the literature, and they are implemented to assist in preprocessing metabolomics data. } +\note{ +For \code{rsd} and \code{rowRsd} the feature abundances are expected to be provided +in natural scale and not e.g. log2 scale as it may lead to incorrect +interpretations. +} \examples{ ## coefficient of variation @@ -98,7 +104,7 @@ Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR, Dunn WB. Guidelines and considerations for the use of system suitability and quality control samples in mass spectrometry assays applied in untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72. -doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; \ +doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336; PMCID: PMC5960010. } \author{ diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R index a90d7af..a771ccb 100644 --- a/tests/testthat/test_function-filtering.R +++ b/tests/testthat/test_function-filtering.R @@ -1,4 +1,3 @@ -# Create a test context test_that("Metabolomics Filtering Functions", { # Define some sample data for testing @@ -12,9 +11,16 @@ test_that("Metabolomics Filtering Functions", { # Test rsd function expect_equal(rsd(a), sd(a) / mean(a)) expect_equal(rowRsd(A), apply(A, 1, function(row) sd(row) / mean(row))) + expect_equal(rsd(a, mad = TRUE), mad(a, na.rm = TRUE) / + abs(median(a, na.rm = TRUE))) + expect_equal(rowRsd(A, mad = TRUE), + apply(A, 1, function(row) mad(row, na.rm = TRUE) / + abs(median(row, na.rm = TRUE)))) + # Test rowDratio function expect_equal(as.numeric(rowDratio(A, A)), rep(1, nrow(A))) + expect_equal(as.numeric(rowDratio(A, A, mad = TRUE)), rep(1, nrow(A))) # Test percentMissing function expect_equal(percentMissing(b), 40) @@ -22,5 +28,5 @@ test_that("Metabolomics Filtering Functions", { expect_equal(rowPercentMissing(B), rep(20, nrow(B))) # Test rowBlank function - expect_equal(rowBlank(test_samples, blank_samples), c(TRUE, TRUE, FALSE)) + expect_equal(rowBlank(test_samples, blank_samples), c(FALSE, FALSE, TRUE)) }) diff --git a/vignettes/MetaboCoreUtils.Rmd b/vignettes/MetaboCoreUtils.Rmd index 0c4f1de..c06b829 100644 --- a/vignettes/MetaboCoreUtils.Rmd +++ b/vignettes/MetaboCoreUtils.Rmd @@ -548,7 +548,98 @@ suggested to improve and assure data quality. ## Filtering data: Identifying measurement error +When dealing with metabolomics results, it is often necessary to filter +features based on certain criteria. These criteria are typically derived +from statistical formulas applied to full rows of data, where each row +represents a feature. In this tutorial, we'll explore a set of functions +designed for filtering metabolomics data. +First, to get more information on the available function you can check the documentation + +```{r} +?filteringFunctions +``` + +We will use a matrix representing metabolomics measurements from different +samples. Let's start by introducing the data: + +```{r} +# Define sample data for metabolomics analysis +set.seed(123) +metabolomics_data <- matrix(rnorm(100), nrow = 10) +colnames(metabolomics_data) <- paste0("Sample", 1:10) +rownames(metabolomics_data) <- paste0("Feature", 1:10) +``` + +We will begin by calculating the coefficient of variation (CV) for each feature. +This measure helps assess the relative variability of each metabolite across +different samples. + +```{r} +# Calculate and display the coefficient of variation +cv_result <- rowRsd(metabolomics_data) +print(cv_result) +``` + +Next, we will compute the D-ratio, a measure of dispersion, by comparing the +standard deviation of QC samples to that of biological test samples. + +```{r} +# Generate QC samples +qc_samples <- matrix(rnorm(40), nrow = 10) +colnames(qc_samples) <- paste0("QC", 1:4) + +# Calculate D-ratio and display the result +dratio_result <- rowDratio(metabolomics_data, qc_samples) +print(dratio_result) +``` + +Now, let's analyze the percentage of missing values for each metabolite. +This information is crucial for quality control and data preprocessing. + +```{r} +# Introduce missing values in the data +metabolomics_data[sample(1:100, 10)] <- NA + +# Calculate and display the percentage of missing values +missing_result <- rowPercentMissing(metabolomics_data) +print(missing_result) +``` + +Finally, we will identify features where the mean of test samples is lower +than twice the mean of blank samples. This can be indicative of significant +contamination in the solvent of the samples. + +```{r} +# Generate blank samples +blank_samples <- matrix(rnorm(30), nrow = 10) +colnames(blank_samples) <- paste0("Blank", 1:3) + +# Detect rows where mean(test) > 2 * mean(blank) +blank_detection_result <- rowBlank(metabolomics_data, blank_samples) +print(blank_detection_result) +``` + +All of these computations can then be used to easily filter our data and remove +the features that do not fit our quality criteria. Below we remove all features +that have a D-ratio and coefficeint of variation < 0.8 with no missing values +and is not flagged to be a possible solvent contaminant. + +```{r} +# Set filtering thresholds +cv_threshold <- 8 +dratio_threshold <- 0.8 + +# Apply filters +filtered_data <- metabolomics_data[ + cv_result <= cv_threshold & + dratio_result <= dratio_threshold & + missing_result <= 10 & + !blank_detection_result, , drop = FALSE] + +# Display the filtered data +print(filtered_data) +``` # Contributions diff --git a/vignettes/references.bib b/vignettes/references.bib index 884f9f5..cc55b22 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -29,3 +29,20 @@ @article{wehrens_improved_2016 keywords = {Methodology, Data analysis, Mass Spectrometry, Metabolomics, Normalisation}, pages = {88} } + +@article{broadhurst_guidelines_2018, + title = {Guidelines and considerations for the use of system suitability and quality control samples in mass spectrometry assays applied in untargeted clinical metabolomic studies}, + volume = {14}, + issn = {1573-3882, 1573-3890}, + url = {http://link.springer.com/10.1007/s11306-018-1367-3}, + doi = {10.1007/s11306-018-1367-3}, + language = {en}, + number = {6}, + urldate = {2024-01-10}, + journal = {Metabolomics}, + author = {Broadhurst, David and Goodacre, Royston and Reinke, Stacey N. and Kuligowski, Julia and Wilson, Ian D. and Lewis, Matthew R. and Dunn, Warwick B.}, + month = jun, + year = {2018}, + pages = {72}, + file = {Full Text:C\:\\Users\\plouail\\Zotero\\storage\\L9RKVUIY\\Broadhurst et al. - 2018 - Guidelines and considerations for the use of syste.pdf:application/pdf}, +} From 443b0eca76b401c7e0ea1fa8544af70de6d8d911 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Thu, 11 Jan 2024 14:35:12 +0100 Subject: [PATCH 11/13] fix: naming system --- ...ction-filtering.R => quality-assessment.R} | 33 ++++++++++--------- ...ringFunctions.Rd => quality_assessment.Rd} | 22 ++++++------- vignettes/MetaboCoreUtils.Rmd | 12 ++++--- 3 files changed, 35 insertions(+), 32 deletions(-) rename R/{function-filtering.R => quality-assessment.R} (84%) rename man/{filteringFunctions.Rd => quality_assessment.Rd} (84%) diff --git a/R/function-filtering.R b/R/quality-assessment.R similarity index 84% rename from R/function-filtering.R rename to R/quality-assessment.R index 19038f1..4cbaa2d 100644 --- a/R/function-filtering.R +++ b/R/quality-assessment.R @@ -1,12 +1,12 @@ -#' @title Basic filtering functions for metabolomics +#' @title Basic quality assessment functions for metabolomics #' #' @description #' -#' When dealing with metabolomics results, it is often necessary to filter -#' features based on certain criteria. These criteria are typically derived -#' from statistical formulas applied to full rows of data, where each row -#' represents a feature. The following functions provide basic filtering -#' methods commonly used in the analysis of metabolomics data. +#' The following functions allow to calculate basic quality assessment estimates +#' typically employed in the analysis of metabolomics data. These functions are +#' designed to be applied to entire rows of data, where each row corresponds to +#' a feature. Subsequently, these estimates can serve as a foundation for +#' feature filtering. #' #' - `rsd` and `rowRsd` are convenience functions to calculate the relative #' standard deviation (i.e. coefficient of variation) of a numerical vector @@ -45,9 +45,9 @@ #' for non-gaussian distributed data. #' #' @note -#' For `rsd` and `rowRsd` the feature abundances are expected to be provided -#' in natural scale and not e.g. log2 scale as it may lead to incorrect -#' interpretations. +#' For `rsd` and `rowRsd` the feature abundances are expected to be provided in +#' natural scale and not e.g. log2 scale as it may lead to incorrect +#' interpretations. #' #' @return See individual function description above for details. #' @@ -57,7 +57,7 @@ #' #' @importFrom stats sd mad median #' -#' @name filteringFunctions +#' @name quality_assessment #' #' @references #' @@ -97,7 +97,7 @@ NULL #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment rsd <- function(x, na.rm = TRUE, mad = FALSE) { if (mad) mad(x, na.rm = na.rm) / abs(median(x, na.rm = na.rm)) @@ -106,12 +106,12 @@ rsd <- function(x, na.rm = TRUE, mad = FALSE) { } #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment rowRsd <- function(x, na.rm = TRUE, mad = FALSE) apply(x, MARGIN = 1, rsd, na.rm = na.rm, mad = mad) #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ if (mad) vec <- apply(y, 1, mad, na.rm = na.rm) / @@ -122,19 +122,20 @@ rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){ } #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment percentMissing <- function(x){ ((sum(is.na(x))) / length(x))*100 } #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment rowPercentMissing <- function(x){ apply(x, MARGIN = 1, percentMissing) } #' @export -#' @rdname filteringFunctions +#' @rdname quality_assessment + rowBlank <- function(x, y, na.rm = TRUE){ m_samples <- apply(x, 1, mean, na.rm = na.rm) m_blank <- apply(y, 1, mean, na.rm = na.rm) diff --git a/man/filteringFunctions.Rd b/man/quality_assessment.Rd similarity index 84% rename from man/filteringFunctions.Rd rename to man/quality_assessment.Rd index 38ee320..c706dcf 100644 --- a/man/filteringFunctions.Rd +++ b/man/quality_assessment.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/function-filtering.R -\name{filteringFunctions} -\alias{filteringFunctions} +% Please edit documentation in R/quality-assessment.R +\name{quality_assessment} +\alias{quality_assessment} \alias{rsd} \alias{rowRsd} \alias{rowDratio} \alias{percentMissing} \alias{rowPercentMissing} \alias{rowBlank} -\title{Basic filtering functions for metabolomics} +\title{Basic quality assessment functions for metabolomics} \usage{ rsd(x, na.rm = TRUE, mad = FALSE) @@ -42,11 +42,11 @@ respectively.} See individual function description above for details. } \description{ -When dealing with metabolomics results, it is often necessary to filter -features based on certain criteria. These criteria are typically derived -from statistical formulas applied to full rows of data, where each row -represents a feature. The following functions provide basic filtering -methods commonly used in the analysis of metabolomics data. +The following functions allow to calculate basic quality assessment estimates +typically employed in the analysis of metabolomics data. These functions are +designed to be applied to entire rows of data, where each row corresponds to +a feature. Subsequently, these estimates can serve as a foundation for +feature filtering. \itemize{ \item \code{rsd} and \code{rowRsd} are convenience functions to calculate the relative standard deviation (i.e. coefficient of variation) of a numerical vector @@ -68,8 +68,8 @@ literature, and they are implemented to assist in preprocessing metabolomics data. } \note{ -For \code{rsd} and \code{rowRsd} the feature abundances are expected to be provided -in natural scale and not e.g. log2 scale as it may lead to incorrect +For \code{rsd} and \code{rowRsd} the feature abundances are expected to be provided in +natural scale and not e.g. log2 scale as it may lead to incorrect interpretations. } \examples{ diff --git a/vignettes/MetaboCoreUtils.Rmd b/vignettes/MetaboCoreUtils.Rmd index c06b829..b8eec43 100644 --- a/vignettes/MetaboCoreUtils.Rmd +++ b/vignettes/MetaboCoreUtils.Rmd @@ -546,18 +546,19 @@ Generally, injecting study samples in random order can reduce (or even avoid) influence of any related technical bias in the downstream analysis and is highly suggested to improve and assure data quality. -## Filtering data: Identifying measurement error +## Basic quality assessment and pre-filtering of metabolomics data When dealing with metabolomics results, it is often necessary to filter features based on certain criteria. These criteria are typically derived from statistical formulas applied to full rows of data, where each row represents a feature. In this tutorial, we'll explore a set of functions -designed for filtering metabolomics data. +designed designed to calculate basic quality assessment metrics on which +metabolomics data can subsequently be filtered. First, to get more information on the available function you can check the documentation ```{r} -?filteringFunctions +?quality_assessment ``` We will use a matrix representing metabolomics measurements from different @@ -581,8 +582,9 @@ cv_result <- rowRsd(metabolomics_data) print(cv_result) ``` -Next, we will compute the D-ratio, a measure of dispersion, by comparing the -standard deviation of QC samples to that of biological test samples. +Next, we will compute the D-ratio [@broadhurst_guidelines_2018], a measure of +dispersion, by comparing the standard deviation of QC samples to that of +biological test samples. ```{r} # Generate QC samples From 623572b0afea6a31de12457f7ee06af6b9d230a2 Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 17 Jan 2024 08:05:26 +0100 Subject: [PATCH 12/13] update parameter --- R/quality-assessment.R | 15 +++++++++++---- man/quality_assessment.Rd | 13 ++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/R/quality-assessment.R b/R/quality-assessment.R index 4cbaa2d..9a38099 100644 --- a/R/quality-assessment.R +++ b/R/quality-assessment.R @@ -37,13 +37,20 @@ #' representing feature abundances in QC samples or blank samples, #' respectively. #' -#' @param na.rm `logical(1)` indicate whether missing values (`NA`) should be +#' @param na.rm `logical(1)` indicates whether missing values (`NA`) should be #' removed prior to the calculations. #' -#' @param mad `logical(1)` indicate whether the *Median Absolute Deviation* +#' @param mad `logical(1)` indicates whether the *Median Absolute Deviation* #' (MAD) should be used instead of the standard deviation. This is suggested #' for non-gaussian distributed data. #' +#' @param threshold `numeric` For `rowBlank`, indicates the minimum difference +#' required between the mean of a feature in samples compared to the mean of +#' the same feature in blanks for it to not be considered a possible +#' contaminant. For example, the default threshold of 2 signifies that the mean +#' of the features in samples has to be at least twice the mean in blanks for +#' it not to be flagged as a possible contaminant. +#' #' @note #' For `rsd` and `rowRsd` the feature abundances are expected to be provided in #' natural scale and not e.g. log2 scale as it may lead to incorrect @@ -136,8 +143,8 @@ rowPercentMissing <- function(x){ #' @export #' @rdname quality_assessment -rowBlank <- function(x, y, na.rm = TRUE){ +rowBlank <- function(x, y, threshold = 2, na.rm = TRUE){ m_samples <- apply(x, 1, mean, na.rm = na.rm) m_blank <- apply(y, 1, mean, na.rm = na.rm) - m_samples < 2 * m_blank + m_samples < threshold * m_blank } diff --git a/man/quality_assessment.Rd b/man/quality_assessment.Rd index c706dcf..3de5570 100644 --- a/man/quality_assessment.Rd +++ b/man/quality_assessment.Rd @@ -20,23 +20,30 @@ percentMissing(x) rowPercentMissing(x) -rowBlank(x, y, na.rm = TRUE) +rowBlank(x, y, threshold = 2, na.rm = TRUE) } \arguments{ \item{x}{\code{numeric} For \code{rsd}, a numeric vector; for \code{rowRsd}, \code{rowDratio}, \code{percentMissing} and \code{rowBlank}, a numeric matrix representing the biological samples.} -\item{na.rm}{\code{logical(1)} indicate whether missing values (\code{NA}) should be +\item{na.rm}{\code{logical(1)} indicates whether missing values (\code{NA}) should be removed prior to the calculations.} -\item{mad}{\code{logical(1)} indicate whether the \emph{Median Absolute Deviation} +\item{mad}{\code{logical(1)} indicates whether the \emph{Median Absolute Deviation} (MAD) should be used instead of the standard deviation. This is suggested for non-gaussian distributed data.} \item{y}{\code{numeric} For \code{rowDratio} and \code{rowBlank}, a numeric matrix representing feature abundances in QC samples or blank samples, respectively.} + +\item{threshold}{\code{numeric} For \code{rowBlank}, indicates the minimum difference +required between the mean of a feature in samples compared to the mean of +the same feature in blanks for it to not be considered a possible +contaminant. For example, the default threshold of 2 signifies that the mean +of the features in samples has to be at least twice the mean in blanks for +it not to be flagged as a possible contaminant.} } \value{ See individual function description above for details. From 3250c63a153dc8ea5d5290f45cae88030efc72ef Mon Sep 17 00:00:00 2001 From: Philippine Louail Date: Wed, 17 Jan 2024 09:03:22 +0100 Subject: [PATCH 13/13] Update quality-assessment.R --- R/quality-assessment.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/quality-assessment.R b/R/quality-assessment.R index 9a38099..343559e 100644 --- a/R/quality-assessment.R +++ b/R/quality-assessment.R @@ -20,10 +20,10 @@ #' - `percentMissing` and `rowPercentMissing` determine the percentage of #' missing values in a vector or for each row of a matrix, respectively. #' -#' - `rowBlank` identifies rows (i.e features) where the mean of test samples -#' is lower than twice the mean of blank samples. This can be used to flag -#' features that results from contamination in the solvent of the samples. -#' Returns a `logical` vector of length equal to the number of rows of `x`. +#' - `rowBlank` identifies rows (i.e., features) where the mean of test samples +#' is lower than a specified multiple (defined by the `threshold` parameter) +#' of the mean of blank samples. This can be used to flag features that result +#' from contamination in the solvent of the samples. #' #' These functions are based on standard filtering methods described in the #' literature, and they are implemented to assist in preprocessing metabolomics