Addition of filtering functions

rformassspectrometry · Jan 10, 2024 · 990fa69 · 990fa69
1 parent 81a8f8f
commit 990fa69
Show file tree

Hide file tree

Showing 5 changed files with 292 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,12 @@ export(mclosest)
 export(multiplyElements)
 export(mz2mass)
 export(pasteElements)
+export(percentMissing)
+export(rowBlank)
+export(rowDratio)
+export(rowPercentMissing)
+export(rowRsd)
+export(rsd)
 export(standardizeFormula)
 export(subtractElements)
 importFrom(BiocParallel,SerialParam)
@@ -36,7 +42,10 @@ importFrom(MsCoreUtils,ppm)
 importFrom(methods,is)
 importFrom(stats,approx)
 importFrom(stats,lm)
+importFrom(stats,mad)
+importFrom(stats,median)
 importFrom(stats,na.omit)
 importFrom(stats,predict)
+importFrom(stats,sd)
 importFrom(stats,setNames)
 importFrom(utils,read.table)
diff --git a/R/function-filtering.R b/R/function-filtering.R
@@ -0,0 +1,148 @@
+#' @title Basic filtering functions for metabolomics
+#'
+#' @description
+#'
+#' When dealing with metabolomics results, it is often necessary to filter
+#' features based on certain criteria. These criteria are typically derived
+#' from statistical formulas applied to full rows of data, where each row
+#' represents a feature. The following functions provide basic filtering
+#' methods commonly used in the analysis of metabolomics data.
+#'
+#' - `rsd` and `rowRsd` are convenience functions to calculate the relative
+#'  standard deviation (i.e. coefficient of variation) of a numerical vector
+#'  or for rows of a numerical matrix, respectively.
+#'
+#' - `rowDratio` computes the D-ratio or "dispersion ratio," defined as the
+#'  standard deviation for QC (Quality Control) samples divided by the
+#'  standard deviation for biological test samples, for each feature (row) in
+#'  the matrix.
+#'
+#' - `percentMissing` and `rowPercentMissing` determine the percentage of
+#'  missing values in a vector or for each row of a matrix, respectively.
+#'
+#' - `rowBlank` identifies rows (i.e. features) where the mean of test samples
+#'  is greater than twice the mean of blank samples. This can help to identify
+#'  features that result from contamination in the solvent of the samples.
+#'  Returns a `logical` vector with one element per row.
+#'
+#' These functions are based on standard filtering methods described in the
+#' literature, and they are implemented to assist in preprocessing metabolomics
+#' data.
+#'
+#' @param x `numeric` For `rsd` and `percentMissing`, a numeric vector;
+#'  for `rowRsd`, `rowDratio`, `rowPercentMissing` and `rowBlank`, a numeric
+#'  matrix representing the biological samples.
+#'
+#' @param y `numeric` For `rowDratio` and `rowBlank`, a numeric matrix
+#'  representing the QC samples and blank samples, respectively.
+#'
+#' @param na.rm `logical(1)` indicate whether missing values (`NA`) should be
+#'  removed prior to the calculations.
+#'
+#' @param mad `logical(1)` indicating whether the *Median Absolute Deviation*
+#'  (MAD) should be used instead of the standard deviation. This is suggested
+#'  for non-Gaussian distributed data.
+#'
+#' @return  See individual function description above for details.
+#'
+#' @author Philippine Louail, Johannes Rainer
+#'
+#' @md
+#'
+#' @importFrom stats sd mad median
+#'
+#' @name filteringFunctions
+#'
+#' @references
+#'
+#' Broadhurst D, Goodacre R, Reinke SN, Kuligowski J, Wilson ID, Lewis MR,
+#' Dunn WB. Guidelines and considerations for the use of system suitability
+#' and quality control samples in mass spectrometry assays applied in
+#' untargeted clinical metabolomic studies. Metabolomics. 2018;14(6):72.
+#' doi: 10.1007/s11306-018-1367-3. Epub 2018 May 18. PMID: 29805336;
+#' PMCID: PMC5960010.
+#'
+#' @examples
+#'
+#' ## coefficient of variation
+#' a <- c(4.3, 4.5, 3.6, 5.3)
+#' rsd(a)
+#'
+#' A <- rbind(a, a, a)
+#' rowRsd(A)
+#'
+#' ## Dratio
+#' x <- c(4.3, 4.5, 3.6, 5.3)
+#' X <- rbind(x, x, x)
+#' rowDratio(X, X)
+#'
+#' ## Percent Missing
+#' b <- c(1, NA, 3, 4, NA)
+#' percentMissing(b)
+#'
+#' B <- matrix(c(1, 2, 3, NA, 5, 6, 7, 8, 9), nrow = 3)
+#' rowPercentMissing(B)
+#'
+#' ## Blank Rows
+#' test_samples <- matrix(c(13, 21, 3, 4, 5, 6), nrow = 2)
+#' blank_samples <- matrix(c(0, 1, 2, 3, 4, 5), nrow = 2)
+#' rowBlank(test_samples, blank_samples)
+#'
+NULL
+
+#' @export
+#' @rdname filteringFunctions
+#'
+
+rsd <- function(x, na.rm = TRUE, mad = FALSE) {
+    ## Relative dispersion of `x`: a spread estimate divided by the absolute
+    ## value of a location estimate. `mad = TRUE` selects the robust pair
+    ## (MAD / median), otherwise the classical pair (sd / mean) is used.
+    ## The call `mad(...)` below resolves to the function and not to the
+    ## logical argument of the same name, because R skips non-function
+    ## bindings when looking up a name in call position.
+    if (mad) {
+        spread <- mad(x, na.rm = na.rm)
+        center <- median(x, na.rm = na.rm)
+    } else {
+        spread <- sd(x, na.rm = na.rm)
+        center <- mean(x, na.rm = na.rm)
+    }
+    spread / abs(center)
+}
+
+#' @rdname filteringFunctions
+#' @export
+rowRsd <- function(x, na.rm = TRUE, mad = FALSE) {
+    ## Evaluate `rsd` on every row of `x`, handing the caller's `na.rm` and
+    ## `mad` settings through unchanged.
+    apply(x, 1, function(z) rsd(z, na.rm = na.rm, mad = mad))
+}
+
+
+#' @export
+#' @rdname filteringFunctions
+#'
+
+rowDratio <- function(x, y, na.rm = TRUE, mad = FALSE) {
+    ## Row-wise dispersion of `y` divided by the row-wise dispersion of `x`.
+    ## Rows of `x` and `y` are paired by position (element-wise division of
+    ## the two per-row vectors).
+    ##
+    ## x:     numeric matrix, one feature per row (denominator).
+    ## y:     numeric matrix, one feature per row (numerator).
+    ## na.rm: passed on to the dispersion estimator.
+    ## mad:   if TRUE use the median absolute deviation, otherwise the
+    ##        standard deviation.
+    ##
+    ## The estimator is taken explicitly from the stats namespace: inside this
+    ## function the bare name `mad` refers to the logical argument, so passing
+    ## it to `apply` would only work through `match.fun`'s symbol fallback.
+    dispersion <- if (mad) stats::mad else stats::sd
+    ## Return the ratio directly (not as the value of an assignment) so that
+    ## the result is visible, i.e. auto-printed at the console.
+    apply(y, 1, dispersion, na.rm = na.rm) /
+        apply(x, 1, dispersion, na.rm = na.rm)
+}
+
+
+#' @export
+#' @rdname filteringFunctions
+
+
+percentMissing <- function(x) {
+    ## Share of `NA` entries among all elements of `x`, on a 0-100 scale.
+    n_missing <- sum(is.na(x))
+    n_missing / length(x) * 100
+}
+
+#' @export
+#' @rdname filteringFunctions
+#'
+rowPercentMissing <- function(x) {
+    ## One `percentMissing` value per row of `x`.
+    apply(x, 1, function(z) percentMissing(z))
+}
+
+
+#' @param threshold `numeric(1)` for `rowBlank`: factor by which the mean of
+#'  the blank samples is multiplied before it is compared to the mean of the
+#'  test samples. Defaults to `2`.
+#'
+#' @export
+#' @rdname filteringFunctions
+rowBlank <- function(x, y, na.rm = TRUE, threshold = 2) {
+    ## Flag rows whose test-sample mean exceeds `threshold` times the
+    ## blank-sample mean.
+    ##
+    ## x:         numeric matrix with the test samples, one feature per row.
+    ## y:         numeric matrix with the blank samples, one feature per row.
+    ## na.rm:     passed on to `mean` for both matrices.
+    ## threshold: single numeric multiplier applied to the blank mean.
+    ##
+    ## Returns a logical vector with one element per row; an element is `NA`
+    ## when either row mean is `NA`/`NaN`.
+    stopifnot(is.numeric(threshold), length(threshold) == 1L)
+    m_samples <- apply(x, 1, mean, na.rm = na.rm)
+    m_blank <- apply(y, 1, mean, na.rm = na.rm)
+    ## Return the comparison directly (not as the value of an assignment) so
+    ## that the result is visible, i.e. auto-printed at the console.
+    m_samples > threshold * m_blank
+}
diff --git a/man/filteringFunctions.Rd b/man/filteringFunctions.Rd
diff --git a/tests/testthat/test_function-filtering.R b/tests/testthat/test_function-filtering.R
@@ -0,0 +1,26 @@
+# Unit tests for the row-wise filtering helpers (rsd, rowRsd, rowDratio,
+# percentMissing, rowPercentMissing, rowBlank). One test per behaviour, each
+# with its own small fixture.
+test_that("rsd and rowRsd compute the relative standard deviation", {
+    a <- c(3.2, 4.1, 3.9, 4.8)
+    A <- rbind(a, a, a)
+
+    expect_equal(rsd(a), sd(a) / mean(a))
+    expect_equal(rowRsd(A), apply(A, 1, function(row) sd(row) / mean(row)))
+
+    # MAD-based variant
+    expect_equal(rsd(a, mad = TRUE), mad(a) / median(a))
+    expect_equal(rowRsd(A, mad = TRUE),
+                 apply(A, 1, function(row) mad(row) / median(row)))
+
+    # Missing values are dropped by default and propagate otherwise
+    expect_equal(rsd(c(a, NA)), rsd(a))
+    expect_true(is.na(rsd(c(a, NA), na.rm = FALSE)))
+})
+
+test_that("rowDratio divides the dispersion of y by the dispersion of x", {
+    a <- c(3.2, 4.1, 3.9, 4.8)
+    A <- rbind(a, a, a)
+
+    expect_equal(as.numeric(rowDratio(A, A)), rep(1, nrow(A)))
+    expect_equal(as.numeric(rowDratio(A, 2 * A)), rep(2, nrow(A)))
+    expect_equal(as.numeric(rowDratio(A, A, mad = TRUE)), rep(1, nrow(A)))
+})
+
+test_that("percentMissing and rowPercentMissing report the share of NA", {
+    b <- c(2, NA, 1, 3, NA)
+    B <- matrix(c(2, NA, 1, 3, NA, 6, 7, 8, 9, 12), nrow = 2)
+
+    expect_equal(percentMissing(b), 40)
+    expect_equal(percentMissing(c(1, 2, 3)), 0)
+    expect_equal(rowPercentMissing(B), rep(20, nrow(B)))
+})
+
+test_that("rowBlank flags rows whose mean exceeds twice the blank mean", {
+    test_samples <- matrix(c(13, 21, 1, 3, 5, 6), nrow = 3)
+    blank_samples <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3)
+
+    expect_equal(rowBlank(test_samples, blank_samples), c(TRUE, TRUE, FALSE))
+})
diff --git a/vignettes/MetaboCoreUtils.Rmd b/vignettes/MetaboCoreUtils.Rmd
@@ -546,6 +546,9 @@ Generally, injecting study samples in random order can reduce (or even avoid)
 influence of any related technical bias in the downstream analysis and is highly
 suggested to improve and assure data quality.
 
+## Filtering data: Identifying measurement error 
+
+
 
 # Contributions