rformassspectrometry · jorainer · Jan 19, 2024 · Oct 5, 2023 · Oct 5, 2023 · Oct 6, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MetaboCoreUtils
 Title: Core Utils for Metabolomics Data
-Version: 1.11.1
+Version: 1.11.2
 Description: MetaboCoreUtils defines metabolomics-related core functionality
     provided as low-level functions to allow a data structure-independent usage
     across various R packages. This includes functions to calculate between ion

diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,12 @@ export(mclosest)
 export(multiplyElements)
 export(mz2mass)
 export(pasteElements)
+export(percentMissing)
+export(rowBlank)
+export(rowDratio)
+export(rowPercentMissing)
+export(rowRsd)
+export(rsd)
 export(standardizeFormula)
 export(subtractElements)
 importFrom(BiocParallel,SerialParam)
@@ -36,7 +42,10 @@ importFrom(MsCoreUtils,ppm)
 importFrom(methods,is)
 importFrom(stats,approx)
 importFrom(stats,lm)
+importFrom(stats,mad)
+importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,predict)
+importFrom(stats,sd)
 importFrom(stats,setNames)
 importFrom(utils,read.table)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # MetaboCoreUtils 1.11
 
+## MetaboCoreUtils 1.11.2
+
+- Add functions to compute quality check of the data (issue
+  [#77]((https://github.com/rformassspectrometry/MetaboCoreUtils/issues/77))
+
 ## MetaboCoreUtils 1.11.1
 
 - Add functions to enable linear model-based adjustment of (LC-MS derived)

diff --git a/R/function-filtering.R b/R/function-filtering.R
@@ -0,0 +1,142 @@
+#' @title Basic filtering functions for metabolomics
+#'
+#' @description
+#'
+#' When dealing with metabolomics results, it is often necessary to filter
+#' features based on certain criteria. These criteria are typically derived
+#' from statistical formulas applied to full rows of data, where each row
+#' represents a feature. The following functions provide basic filtering
+#' methods commonly used in the analysis of metabolomics data.
+#'
+#' - `rsd` and `rowRsd` are convenience functions to calculate the relative
+#'  standard deviation (i.e. coefficient of variation) of a numerical vector
+#'  or for rows of a numerical matrix, respectively.
+#'
+#' - `rowDratio` computes the D-ratio or *dispersion ratio*, defined as the
+#'  standard deviation for QC (Quality Control) samples divided by the
+#'  standard deviation for biological test samples, for each feature (row) in
+#'  the matrix.
+#'
+#' - `percentMissing` and `rowPercentMissing` determine the percentage of
+#'  missing values in a vector or for each row of a matrix, respectively.
+#'
+#' - `rowBlank` identifies rows (i.e features) where the mean of test samples
+#'  is lower than twice the mean of blank samples. This can be used to flag
+#'  features that results from contamination in the solvent of the samples.
+#'  Returns a `logical` vector of length equal to the number of rows of `x`.
+#'
+#' These functions are based on standard filtering methods described in the
+#' literature, and they are implemented to assist in preprocessing metabolomics
+#' data.
+#'
+#' @param x `numeric` For `rsd`, a numeric vector;
+#'  for `rowRsd`, `rowDratio`, `percentMissing` and `rowBlank`, a numeric
+#'  matrix representing the biological samples.
+#'
+#' @param y `numeric` For `rowDratio` and `rowBlank`, a numeric matrix
+#'  representing feature abundances in QC samples or blank samples,
+#'  respectively.
+#'
+#' @param na.rm `logical(1)` indicate whether missing values (`NA`) should be
+#'  removed prior to the calculations.
+#'
+#' @param mad `logical(1)` indicate whether the *Median Absolute Deviation*
+#'  (MAD) should be used instead of the standard deviation. This is suggested
+#'  for non-gaussian distributed data.
+#'
+#' @note
+#' For `rsd` and `rowRsd` the feature abundances are expected to be provided
+#' in natural scale and not e.g. log2 scale as it may lead to incorrect
+#'  interpretations.
+#'
+#' @return  See individual function description above for details.
+#'
+#' @author Philippine Louail, Johannes Rainer
+#'
+#' @md
+#'
+#' @importFrom stats sd mad median
+#'
+#' @name filteringFunctions
+#'
+#' @references
+#'
+#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR,
+#' Dunn WB. Guidelines and considerations for the use of system suitability
+#' and quality control samples in mass spectrometry assays applied in
+#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72.
+#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336;
+#' PMCID: PMC5960010.
+#'
+#' @examples
+#'
+#' ## coefficient of variation
+#' a <- c(4.3, 4.5, 3.6, 5.3)
+#' rsd(a)
+#'
+#' A <- rbind(a, a, a)
+#' rowRsd(A)
+#'
+#' ## Dratio
+#' x <- c(4.3, 4.5, 3.6, 5.3)
+#' X <- rbind(a, a, a)
+#' rowDratio(X, X)
+#'
+#' #' ## Percent Missing
+#' b <- c(1, NA, 3, 4, NA)
+#' percentMissing(b)
+#'
+#' B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3)
+#' rowPercentMissing(B)
+#'
+#' ## Blank Rows
+#' test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2)
+#' blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2)
+#' rowBlank(test_samples, blank_samples)
+#'
+NULL
+
+#' @export
+#' @rdname filteringFunctions
+rsd <- function(x, na.rm = TRUE, mad = FALSE) {
+    if (mad)
+        mad(x, na.rm = na.rm) / abs(median(x, na.rm = na.rm))
+    else
+        sd(x, na.rm = na.rm) / abs(mean(x, na.rm = na.rm))
+}
+
+#' @export
+#' @rdname filteringFunctions
+rowRsd <- function(x, na.rm = TRUE, mad = FALSE)
+    apply(x, MARGIN = 1, rsd, na.rm = na.rm, mad = mad)
+
+#' @export
+#' @rdname filteringFunctions
+rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE){
+    if (mad)
+        vec <- apply(y, 1, mad, na.rm = na.rm) /
+            apply(x, 1, mad, na.rm = na.rm)
+    else
+        vec <- apply(y, 1, sd, na.rm = na.rm) /
+        apply(x, 1, sd, na.rm = na.rm)
+}
+
+#' @export
+#' @rdname filteringFunctions
+percentMissing <- function(x){
+   ((sum(is.na(x))) / length(x))*100
+}
+
+#' @export
+#' @rdname filteringFunctions
+rowPercentMissing <- function(x){
+    apply(x, MARGIN = 1, percentMissing)
+}
+
+#' @export
+#' @rdname filteringFunctions
+rowBlank <- function(x, y, na.rm = TRUE){
+    m_samples <- apply(x, 1, mean, na.rm = na.rm)
+    m_blank <- apply(y, 1, mean, na.rm = na.rm)
+    m_samples < 2 * m_blank
+}
diff --git a/R/mclosest.R b/R/mclosest.R
@@ -76,7 +76,6 @@ mclosest <- function(x,
         ppm <- rep(ppm[1], nc)
     if (length(tolerance) != nc)
         tolerance <- rep(tolerance[1], nc)
-
     ## Initialize a vector to store closest row indices
     closest_indices <- rep(NA_integer_ , nrow(x))
     for (i in seq_len(nrow(x))) {

diff --git a/man/filteringFunctions.Rd b/man/filteringFunctions.Rd
diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R
@@ -0,0 +1,32 @@
+test_that("Metabolomics Filtering Functions", {
+
+    # Define some sample data for testing
+    a <- c(3.2, 4.1, 3.9, 4.8)
+    A <- rbind(a, a, a)
+    b <- c(2, NA, 1, 3, NA)
+    B <- matrix(c(2, NA, 1, 3, NA, 6, 7, 8, 9, 12), nrow = 2)
+    test_samples <- matrix(c(13, 21, 1, 3, 5, 6), nrow = 3)
+    blank_samples <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
+
+    # Test rsd function
+    expect_equal(rsd(a), sd(a) / mean(a))
+    expect_equal(rowRsd(A), apply(A, 1, function(row) sd(row) / mean(row)))
+    expect_equal(rsd(a, mad = TRUE), mad(a, na.rm = TRUE) /
+                     abs(median(a, na.rm = TRUE)))
+    expect_equal(rowRsd(A, mad = TRUE),
+                 apply(A, 1, function(row) mad(row, na.rm = TRUE) /
+                           abs(median(row, na.rm = TRUE))))
+
+
+    # Test rowDratio function
+    expect_equal(as.numeric(rowDratio(A, A)), rep(1, nrow(A)))
+    expect_equal(as.numeric(rowDratio(A, A, mad = TRUE)), rep(1, nrow(A)))
+
+    # Test percentMissing function
+    expect_equal(percentMissing(b), 40)
+    res <- c()
+    expect_equal(rowPercentMissing(B), rep(20, nrow(B)))
+
+    # Test rowBlank function
+    expect_equal(rowBlank(test_samples, blank_samples), c(FALSE, FALSE, TRUE))
+    })