feat: add functions to perform lm-based normalization

- Add functions to perform linear model-based normalization, such as adjustment for an injection index dependent signal drift (issue #75).
rformassspectrometry · Dec 14, 2023 · 45cacb0 · 45cacb0
1 parent 9904d1f
commit 45cacb0
Show file tree

Hide file tree

Showing 8 changed files with 689 additions and 47 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -44,14 +44,16 @@ Depends:
     R (>= 4.0)
 Imports:
     utils,
-    MsCoreUtils
+    MsCoreUtils,
+    BiocParallel,
+    methods,
+    stats
 Suggests:
     BiocStyle,
     testthat,
     knitr,
     rmarkdown,
-    robustbase,
-    BiocParallel
+    robustbase
 License: Artistic-2.0
 LazyData: no
 VignetteBuilder: knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export(addElements)
 export(adductFormula)
 export(adductNames)
 export(adducts)
+export(adjust_lm)
 export(calculateKm)
 export(calculateKmd)
 export(calculateMass)
@@ -27,10 +28,15 @@ export(mz2mass)
 export(pasteElements)
 export(standardizeFormula)
 export(subtractElements)
+importFrom(BiocParallel,SerialParam)
+importFrom(BiocParallel,bplapply)
 importFrom(MsCoreUtils,closest)
 importFrom(MsCoreUtils,group)
 importFrom(MsCoreUtils,ppm)
+importFrom(methods,is)
 importFrom(stats,approx)
+importFrom(stats,lm)
 importFrom(stats,na.omit)
+importFrom(stats,predict)
 importFrom(stats,setNames)
 importFrom(utils,read.table)
diff --git a/NEWS.md b/NEWS.md
@@ -3,7 +3,8 @@
 ## MetaboCoreUtils 1.11.1
 
 - Add functions to enable linear model-based adjustment of (LC-MS derived)
-  abundance matrices.
+  abundance matrices (issue
+  [#75](https://github.com/rformassspectrometry/MetaboCoreUtils/issues/75)).
 
 # MetaboCoreUtils 1.9
 

diff --git a/R/normalization.R b/R/normalization.R
@@ -10,23 +10,25 @@ NULL
 #' The `fit_lm` and `adjust_lm` functions facilitate linear model-based
 #' normalization of abundance matrices. The expected noise in a numeric
 #' data matrix can be modeled with a linear regression model using the
-#' `fit_lm` and the data can subsequently be adjusted (i.e., the modeled
-#' noise can be removed from abundance values) using the `adjust_lm`
-#' function. A typical use case would be to remove injection index
-#' dependent signal drifts in a LC-MS derived metabolomics data set:
-#' a linear model of the form `y ~ injection_index` would model the
-#' measured abundances of each feature (each row in a data matrix) as a
+#' `fit_lm` function and the data can subsequently be adjusted using the
+#' `adjust_lm` function (i.e., the modeled noise will be removed from the
+#' abundance values). A typical use case would be to remove injection index
+#' dependent signal drifts in a LC-MS derived metabolomics data:
+#' a linear model of the form `y ~ injection_index` could be used to model
+#' the measured abundances of each feature (each row in a data matrix) as a
 #' function of the injection index in which a specific sample was measured
-#' during a LC-MS measurement run. The fitted models can then be used to
-#' adjust the abundance matrix by removing these dependencies from the
-#' feature abundances using the `adjust_lm` function.
+#' during the LC-MS measurement run. The fitted linear regression models
+#' can subsequently be used to adjust the abundance matrix by removing
+#' these dependencies from the data. This allows to perform signal
+#' adjustments as described in (Wehrens et al. 2016).
 #'
-#' The two functions are desribed in more details below:
+#' The two functions are described in more details below:
 #'
 #' `fit_lm` allows to fit a linear regression model (defined with parameter
 #' `formula`) to each row of the numeric data matrix submitted with parameter
-#' `y`. Additional covariates used in each model are expected to be provided
-#' as a `data.frame` with parameter `data`.
+#' `y`. Additional covariates of the linear model defined in `formula` are
+#' expected to be provided as columns in a `data.frame` provided with
+#' parameter `data`.
 #'
 #' The linear model is expected to be defined by a formula starting with
 #' `y ~ `. To model for example an injection index dependency of values in
@@ -39,15 +41,87 @@ NULL
 #' [lm()] by setting `method = "lm"` (the default), or with the more robust
 #' methods from the *robustbase* package with `method = "lmrob"`.
 #'
-#' @return A `list` with linear models (either of type *lm* or *lmrob*) or
-#'     length equal to the number of rows of `y`. `NA` is reported for rows
-#'     with too few non-missing data points (depending on parameter
-#'     `minValues`).
+#' `adjust_lm` can be used to adjust abundances in a data matrix `y` based
+#' on linear regression models provided with parameter `lm`. Parameter `lm`
+#' is expected to be a `list` of length equal to the number of rows of `y`,
+#' each element being a linear model (i.e., a results from `lm` or `lmrob`).
+#' Covariates for the model need to be provided as columns in a `data.frame`
+#' provided with parameter `data`. The number of rows of that `data.frame`
+#' need to match the number of columns of `y`. The function returns the
+#' input matrix `y` with values in rows adjusted with the linear models
+#' provided by `lm`. No adjustment is performed for rows for which the
+#' respective element in `lm` is `NA`. See examples below for details or the
+#' vignette for more examples, descriptions and information.
+#'
+#' @param BPPARAM parallel processing setup. See [bpparam()] for more
+#'     information. Parallel processing can improve performance especially
+#'     for `method = "lmrob"`.
+#'
+#' @param control a `list` speficying control parameters for `lmrob`. Only
+#'     used if `method = "lmrob"`. See help of `lmrob.control` in the
+#'     `robustbase` package for details. By default `control = NULL` the
+#'     *KS2014* settings are used and scale-finding iterations are increased
+#'     to 10000.
+#'
+#' @param data `data.frame` containing the covariates for the linear model
+#'     defined by `formula` (for `fit_lm`) or used in `lm` (for `adjust_lm`).
+#'     The number of rows has to match the number of columns of `y`.
+#'
+#' @param formula `formula` defining the model that should be fitted to the
+#'     data. See also [lm()] for more information. Formulas should begin
+#'     with `y ~ ` as values in rows of `y` will be defined as *y*. See
+#'     description of the `fit_lm` function for more information.
+#'
+#' @param lm `list` of linear models (as returned by `lm` or `lmrob`) such
+#'     as returned by the `fit_lm` function. The length of the list is
+#'     expected to match the number of rows of `y`, i.e., each element
+#'     should be a linear model to adjust the specific row, or `NA` to skip
+#'     adjustment for that particular row in `y`.
+#'
+#' @param method `character(1)` defining the linear regression function that
+#'     should be used for model fitting. Can be either `method = "lm"` (the
+#'     default) for standard least squares model fitting or `method = "lmrob"`
+#'     for a robust alternative defined in the *robustbase* package.
+#'
+#' @param model `logical(1)` whether the model frame are included in the
+#'     returned linear models. Passed to the `lm` or `lmrob` functions.
+#'
+#' @param minVals `numeric(1)` defining the minimum number of non-missing
+#'     values (per feature/row) required to perform the model fitting. For
+#'     rows in `y` for which fewer non-`NA` values are available no model
+#'     will be fitted and a `NA` will be reported instead.
+#'
+#' @param y for `fit_lm`: `matrix` of abundances on which the linear model
+#'     pefined with `formula` should be fitted. For `adjust_lm`: `matrix`
+#'     of abundances that should be adjusted using the models provided with
+#'     parameter `lm`.
+#'
+#' @param ... for `fit_lm`: additional parameters to be passed to the
+#'     downstream calls to `lm` or `lmrob`. For `adjust_lm`: ignored.
+#'
+#' @return
+#'     For `fit_lm`: a `list` with linear models (either of type *lm* or
+#'     *lmrob*) or length equal to the number of rows of `y`. `NA` is
+#'     reported for rows with too few non-missing data points (depending
+#'     on parameter `minValues`).
+#'     For `adjust_lm`: a numeric matrix (same dimensions as input matrix
+#'     `y`) with the values adjusted with the provided linear models.
 #'
 #' @export
 #'
 #' @author Johannes Rainer
 #'
+#' @importFrom BiocParallel SerialParam bplapply
+#'
+#' @importFrom methods is
+#'
+#' @references
+#'
+#' Wehrens R, Hageman JA, van Eeuwijk F, Kooke R, Flood PJ, Wijnker E,
+#' Keurentjes JJ, Lommen A, van Eekelen HD, Hall RD Mumm R and de Vos RC.
+#' Improved batch correction in untargeted MS-based metabolomics.
+#' *Metabolomics* 2016; 12:88.
+#'
 #' @examples
 #'
 #' ## See also the vignette for more details and examples.
@@ -60,29 +134,56 @@ NULL
 #' ## Define a data.frame with the covariates to be used to model the noise
 #' sdata <- data.frame(injection_index = seq_len(ncol(vals)))
 #'
-#' ## Fit a simple linear model describing the feature abundances as a
+#' ## Fit a linear model describing the feature abundances as a
 #' ## function of the index in which samples were injected during the LC-MS
-#' ## run. Note that such a model should **only** be fitted if the samples
+#' ## run. We're fitting the model to log2 transformed data.
+#' ## Note that such a model should **only** be fitted if the samples
 #' ## were randomized, i.e. the injection index is independent of any
-#' ## experimental covariate.
-#' ii_lm <- fit_lm(y ~ injection_index, data = sdata, y = vals)
+#' ## experimental covariate. Alternatively, the injection order dependent
+#' ## signal drift could be estimated using QC samples (if they were
+#' ## repeatedly injected) - see vignette for more details.
+#' ii_lm <- fit_lm(y ~ injection_index, data = sdata, y = log2(vals))
 #'
 #' ## The result is a list of linear models
 #' ii_lm[[1]]
 #'
 #' ## Plotting the data for one feature:
-#' plot(x = sdata$injection_index, y = vals[2, ])
+#' plot(x = sdata$injection_index, y = log2(vals[2, ]),
+#'     ylab = expression(log[2]~abundance), xlab = "injection index")
 #' grid()
 #' ## plot also the fitted model
 #' abline(ii_lm[[2]], lty = 2)
 #'
 #' ## For this feature (row) a decreasing signal intensity with injection
 #' ## index was observed (and modeled).
 #'
-#' ## See the vignette for more details, examples and explanations.
+#' ## For another feature an increasing intensity can be observed.
+#' plot(x = sdata$injection_index, y = log2(vals[3, ]),
+#'     ylab = expression(log[2]~abundance), xlab = "injection index")
+#' grid()
+#' ## plot also the fitted model
+#' abline(ii_lm[[3]], lty = 2)
+#'
+#' ## This trend can be removed from the data using the `adjust_lm` function
+#' ## by providing the linear models descring the drift. Note that, because
+#' ## we're adjusting log2 transformed data, the resulting abundances are
+#' ## also in log2 scale.
+#' vals_adj <- adjust_lm(log2(vals), data = sdata, lm = ii_lm)
+#'
+#' ## Plotting the data before (open circles) and after adjustment (filled
+#' ## points)
+#' plot(x = sdata$injection_index, y = log2(vals[2, ]),
+#'     ylab = expression(log[2]~abundance), xlab = "injection index")
+#' points(x = sdata$injection_index, y = vals_adj[2, ], pch = 16)
+#' grid()
+#' ## Adding the line fitted through the raw data
+#' abline(ii_lm[[2]], lty = 2)
+#' ## Adding a line fitted through the adjusted data
+#' abline(lm(vals_adj[2, ] ~ sdata$injection_index), lty = 1)
+#' ## After adjustment there is no more dependency on injection index.
 fit_lm <- function(formula, data, y, method = c("lm", "lmrob"), control = NULL,
                    minVals = ceiling(nrow(data) * 0.75), model = TRUE, ...,
-                   BPPARAM = BiocParallel::SerialParam()) {
+                   BPPARAM = SerialParam()) {
     requireNamespace("BiocParallel", quietly = TRUE)
     method <- match.arg(method)
     if (missing(formula) || !is(formula, "formula"))
@@ -108,19 +209,21 @@ fit_lm <- function(formula, data, y, method = c("lm", "lmrob"), control = NULL,
             control$k.max <- 10000
             control$refine.tol <- 1e-7
         }
-        res <- BiocParallel::bplapply(y, FUN = .fit_lmrob, formula = formula,
-                                      data = data, minVals = minVals,
-                                      model = model, BPPARAM = BPPARAM,
-                                      control = control, ...)
+        res <- bplapply(y, FUN = .fit_lmrob, formula = formula,
+                        data = data, minVals = minVals,
+                        model = model, BPPARAM = BPPARAM,
+                        control = control, ...)
     } else
-        res <- BiocParallel::bplapply(y, FUN = .fit_lm, formula = formula,
-                                      data = data, minVals = minVals,
-                                      model = model, BPPARAM = BPPARAM, ...)
+        res <- bplapply(y, FUN = .fit_lm, formula = formula,
+                        data = data, minVals = minVals,
+                        model = model, BPPARAM = BPPARAM, ...)
     res
 }
 
 #' Fit the model for a single row of data.
 #'
+#' @importFrom stats lm
+#'
 #' @noRd
 .fit_lm <- function(y, formula, data, minVals, model = TRUE, ...) {
     nna <- sum(!is.na(y))
@@ -146,11 +249,26 @@ fit_lm <- function(formula, data, y, method = c("lm", "lmrob"), control = NULL,
         stop("All variables defined in 'formula' need to be present in 'data'")
 }
 
-adjust_lm <- function() {
-    ## Check
-    ## - all covariates available?
-
-    ## Need to handle negative values? Maybe not directly in here?
+#' @rdname fit_lm
+#'
+#' @export
+adjust_lm <- function(y = matrix(), data = data.frame(), lm = list(), ...) {
+    if (length(lm)) {
+        if (!is.matrix(y)) stop("'y' is expected to be a numeric matrix")
+        if (!is.data.frame(data)) stop("'data' is expected to be a data.frame")
+        if (is(lm, "lm") || is(lm, "lmrob"))
+            lm <- list(lm)
+        if (length(lm) != nrow(y))
+            stop("Length of parameter 'lm' has to match number of rows of 'y'")
+        if (ncol(y) != nrow(data))
+            stop("Number of rows of 'data' and number of columns of 'y' have ",
+                 "to match.")
+        ## Note: no big gain by parallel processing here.
+        for (i in which(!is.na(lm))) {
+            y[i, ] <- .adjust_with_lm(y = y[i, ], data, lm[[i]])
+        }
+    }
+    y
 }
 
 #' Adjust values `y` based on the linear model provided with `lm`
@@ -168,8 +286,11 @@ adjust_lm <- function() {
 #' @author Johannes Rainer based on original code from Ron Wehrens from
 #'         https://github.com/rwehrens/BatchCorrMetabolomics
 #'
+#' @importFrom stats predict
+#'
 #' @noRd
 .adjust_with_lm <- function(y, data, lm) {
+    if (length(lm) <= 1L) return(y)
     data$y <- y
     pred <- predict(lm, newdata = data)
     y - pred + mean(lm$fitted.values + lm$residuals)