sneumann · jorainer · Sep 16, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: xcms
-Version: 4.3.3
+Version: 4.3.4
 Title: LC-MS and GC-MS Data Analysis
 Description: Framework for processing and visualization of chromatographically
     separated and single-spectra mass spectral data. Imports from AIA/ANDI NetCDF,

diff --git a/NAMESPACE b/NAMESPACE
@@ -461,7 +461,8 @@ export("CentWaveParam",
        "CleanPeaksParam",
        "MergeNeighboringPeaksParam",
        "FilterIntensityParam",
-       "ChromPeakAreaParam")
+       "ChromPeakAreaParam",
+       "BetaDistributionParam")
 ## Param class methods.
 
 ## New Classes
@@ -530,7 +531,8 @@ exportMethods("hasChromPeaks",
               "featureSpectra",
               "chromPeakSpectra",
               "chromPeakChromatograms",
-              "featureChromatograms"
+              "featureChromatograms",
+              "chromPeakSummary"
               )
 
 ## feature grouping functions and methods.

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,17 @@
 # xcms 4.3
 
+## Changes in version 4.3.4
+
+- Address issue #765: peak detection on chromatographic data: report a
+  chromatogram's `"mz"`, `"mzmin"` and `"mzmax"` as the mean m/z and lower and
+  upper m/z in the `chromPeaks()` matrix.
+- Fix calculation of the correlation coefficient for peak shape similarity with
+  an idealized bell shape (*beta*) during gap filling for centWave-based
+  chromatographic peak detection with parameter `verboseBetaColumns = TRUE`.
+- Add `chromPeakSummary` generic (issue #705).
+- Add `chromPeakSummary()` method to calculate the *beta* quality metrics.
+
+
 ## Changes in version 4.3.3
 
 - Fix issue #755: `chromatogram()` with `msLevel = 2` fails to extract

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -535,6 +535,65 @@ setGeneric("chromPeakData<-", function(object, value)
 setGeneric("chromPeakSpectra", function(object, ...)
     standardGeneric("chromPeakSpectra"))
 
+#' @title Chromatographic peak summaries
+#'
+#' @name chromPeakSummary
+#'
+#' @description
+#'
+#' The `chromPeakSummary()` method calculates summary statistics or other
+#' metrics for each of the identified chromatographic peaks in an *xcms* result
+#' object, such as the [XcmsExperiment()]. Different metrics can be calculated,
+#' depending upon (and configured by) using dedicated *parameter* classes. As a
+#' result, the method returns a `matrix` or `data.frame` with one row per
+#' chromatographic peak. Each column contains calculated values, depending on
+#' the used method/parameter class.
+#'
+#' Currently implemented methods/parameter classes are:
+#'
+#' - `BetaDistributionParam`: calculates the *beta_cor* and *beta_snr* quality
+#'   metrics as described in Kumler 2023 representing the result from a
+#'   (correlation) test of similarity (using Pearson's correlation coefficient)
+#'   to a bell curve and the signal-to-noise ratio calculated on the residuals
+#'   of this test.
+#'
+#' @param BPPARAM Parallel processing setup. See [bpparam()] for details.
+#'
+#' @param chunkSize `integer(1)` defining the number of samples from which data
+#'     should be loaded and processed at a time.
+#'
+#' @param msLevel `integer(1)` with the MS level of the chromatographic peaks
+#'     on which the metric should be calculated.
+#'
+#' @param object an *xcms* result object containing information on
+#'     identified chromatographic peaks.
+#'
+#' @param param a parameter object defining the method/summaries that should
+#'     be calculated (see description above for supported parameter classes).
+#'
+#' @param ... additional arguments passed to the method implementation.
+#'
+#' @return
+#'
+#' A `matrix` or `data.frame` with the same number of rows as there are
+#' chromatographic peaks. Columns contain the calculated values. The number of
+#' columns, their names and content depend on the used parameter object. See
+#' the respective documentation above for more details.
+#'
+#' @author Pablo Vangeenderhuysen, Johannes Rainer, William Kumler
+#'
+#' @md
+#'
+#' @references
+#'
+#' Kumler W, Hazelton B J and Ingalls A E (2023) "Picky with peakpicking:
+#' assessing chromatographic peak quality with simple metrics in metabolomics"
+#' *BMC Bioinformatics* 24(1):404. doi: 10.1186/s12859-023-05533-4
+#'
+#' @export
+setGeneric("chromPeakSummary", function(object, param, ...)
+           standardGeneric("chromPeakSummary"))
+
 setGeneric("collect", function(object, ...) standardGeneric("collect"))
 setGeneric("consecMissedLimit", function(object, ...)
     standardGeneric("consecMissedLimit"))

diff --git a/R/DataClasses.R b/R/DataClasses.R
@@ -2182,3 +2182,8 @@ setClass("FilterIntensityParam",
                  msg
              else TRUE
          })
+
+setClass("BetaDistributionParam",
+         contains = "Param"
+         )
+
diff --git a/R/XcmsExperiment-functions.R b/R/XcmsExperiment-functions.R
@@ -520,8 +520,9 @@
             ## consider adding 0 or NA intensity for those.
             mat <- do.call(rbind, xsub)
             if (nrow(mat)) {
+                nr <- vapply(xsub, nrow, NA_integer_)
                 ## can have 0, 1 or x values per rt; repeat rt accordingly
-                rts <- rep(rt[keep], vapply(xsub, nrow, integer(1L)))
+                rts <- rep(rt[keep], nr)
                 maxi <- which.max(mat[, 2L])[1L]
                 mmz <- do.call(mzCenterFun, list(mat[, 1L], mat[, 2L]))
                 if (is.na(mmz)) mmz <- mat[maxi, 1L]
@@ -530,15 +531,54 @@
                     sum(mat[, 2L], na.rm = TRUE) *
                     ((rtr[2L] - rtr[1L]) / max(1L, (length(keep) - 1L)))
                 )
-                if ("beta_cor" %in% cn)
+                if ("beta_cor" %in% cn) {
                     res[i, c("beta_cor", "beta_snr")] <- .get_beta_values(
-                        mat[, 2L], rts)
+                        vapply(xsub[nr > 0], function(z) sum(z[, "intensity"]), 
+                               NA_real_),
+                        rt[keep][nr > 0])
+                }
             }
         }
     }
     res[!is.na(res[, "maxo"]), , drop = FALSE]
 }
 
+
+#' Calculates quality metrics for a chromatographic peak. 
+#'
+#' @param x `list` of peak matrices (from a single MS level and from a single
+#'     file/sample).
+#'
+#' @param rt retention time for each peak matrix.
+#'
+#' @param peakArea `matrix` defining the chrom peak area.
+#'
+#' @author Pablo Vangeenderhuysen
+#'
+#' @noRd
+.chrom_peak_beta_metrics <- function(x, rt, peakArea, ...) {
+  res <- matrix(NA_real_, ncol = 2L, nrow = nrow(peakArea))
+  rownames(res) <- rownames(peakArea)
+  colnames(res) <- c("beta_cor","beta_snr")
+  for (i in seq_len(nrow(res))) {
+    rtr <- peakArea[i, c("rtmin", "rtmax")]
+    keep <- which(between(rt, rtr))
+    if (length(keep)) {
+      xsub <- lapply(x[keep], .pmat_filter_mz,
+                     mzr = peakArea[i, c("mzmin", "mzmax")])
+      nr <- vapply(xsub, nrow, NA_integer_)
+      res[i, c("beta_cor", "beta_snr")] <- .get_beta_values(
+          vapply(xsub[nr > 0], function(z) sum(z[, "intensity"]), NA_real_),
+          rt[keep][nr > 0])
+      }
+    }
+  res
+}
+
+
+
+
+
 #' Difference to the original code is that the weighted mean is also calculated
 #' if some of the peak intensities in the profile matrix are 0
 #'

diff --git a/R/XcmsExperiment.R b/R/XcmsExperiment.R
@@ -2013,3 +2013,39 @@ setMethod(
         object[i = sort(unique(file)), keepAdjustedRtime = keepAdjustedRtime,
                keepFeatures = keepFeatures, ...]
     })
+
+#' @rdname chromPeakSummary
+setMethod(
+  "chromPeakSummary",
+  signature(object = "XcmsExperiment", param = "BetaDistributionParam"),
+  function(object, param, msLevel = 1L, chunkSize = 2L, BPPARAM = bpparam()) {
+    if (length(msLevel) != 1)
+      stop("Can only perform peak metrics for one MS level at a time.")
+    if (!hasChromPeaks(object, msLevel = msLevel))
+      stop("No ChromPeaks definitions for MS level ", msLevel, " present.")
+    ## Define region to calculate metrics from for each file
+    cp <- chromPeaks(object, msLevel = msLevel)
+    f <- factor(cp[,"sample"], seq_along(object))
+    pal <- split.data.frame(cp[, c("mzmin", "mzmax", "rtmin", "rtmax")], f)
+    names(pal) <- seq_along(pal)
+    ## Manual chunk processi ng because we have to split `object` and `pal`
+    idx <- seq_along(object)
+    chunks <- split(idx, ceiling(idx / chunkSize))
+    pb <- progress_bar$new(format = paste0("[:bar] :current/:",
+                                           "total (:percent) in ",
+                                           ":elapsed"),
+                           total = length(chunks) + 1L, clear = FALSE)
+    pb$tick(0)
+    # mzf <- "wMean"
+    res <- lapply(chunks, function(z, ...) {
+      pb$tick()
+      .xmse_integrate_chrom_peaks(
+        .subset_xcms_experiment(object, i = z, keepAdjustedRtime = TRUE,
+                                ignoreHistory = TRUE),
+        pal = pal[z], intFun = .chrom_peak_beta_metrics,
+        msLevel = msLevel, BPPARAM = BPPARAM)
+    })
+    res <- do.call(rbind, res)
+    pb$tick()
+    res
+  })
diff --git a/R/do_findChromPeaks-functions.R b/R/do_findChromPeaks-functions.R
@@ -3720,14 +3720,20 @@ peaksWithCentWave <- function(int, rt,
 #' requires at least 5 scans or it will return NA for both parameters.
 #'
 #' @param intensity A numeric vector corresponding to the peak intensities
+#'
 #' @param rtime A numeric vector corresponding to the retention times of each
-#' intensity. If not provided, intensities will be assumed to be equally spaced.
+#'     intensity. Retention times are expected to be in increasing order
+#'     without duplicates. If not provided, intensities will be assumed to be
+#'     equally spaced.
+#'
 #' @param skews A numeric vector of the skews to try, corresponding to the
-#' shape1 of dbeta with a shape2 of 5. Values less than 5 will be increasingly
-#' right-skewed, while values greater than 5 will be left-skewed.
+#'     shape1 of dbeta with a shape2 of 5. Values less than 5 will be
+#'     increasingly right-skewed, while values greater than 5 will be
+#'     left-skewed.
+#'
 #' @param zero.rm Boolean value controlling whether "missing" scans are dropped
-#' prior to curve fitting. The default, TRUE, will remove intensities of zero
-#' or NA
+#'     prior to curve fitting. The default, TRUE, will remove intensities of
+#'     zero or NA
 #'
 #' @author William Kumler
 #'
@@ -3740,21 +3746,22 @@ peaksWithCentWave <- function(int, rt,
     rtime <- rtime[keep]
     intensity <- intensity[keep]
   }
-  if(length(intensity)<5){
+  if (length(intensity) < 5) {
     best_cor <- NA
     beta_snr <- NA
   } else {
-    beta_sequence <- rep(.scale_zero_one(rtime), each=length(skews))
+    beta_sequence <- rep(.scale_zero_one(rtime), each = length(skews))
     beta_vals <- t(matrix(dbeta(beta_sequence, shape1 = skews, shape2 = 5),
                           nrow = length(skews)))
     # matplot(beta_vals)
     beta_cors <- cor(intensity, beta_vals)
     best_cor <- max(beta_cors)
     best_curve <- beta_vals[, which.max(beta_cors)]
-    noise_level <- sd(diff(.scale_zero_one(best_curve)-.scale_zero_one(intensity)))
-    beta_snr <- log10(max(intensity)/noise_level)
+    noise_level <- sd(diff(.scale_zero_one(best_curve) -
+                           .scale_zero_one(intensity)))
+    beta_snr <- log10(max(intensity) / noise_level)
   }
-  c(best_cor=best_cor, beta_snr=beta_snr)
+  c(best_cor = best_cor, beta_snr = beta_snr)
 }
 
 
@@ -3769,5 +3776,5 @@ peaksWithCentWave <- function(int, rt,
 #'
 #' @noRd
 .scale_zero_one <- function(num_vec){
-  (num_vec-min(num_vec))/(max(num_vec)-min(num_vec))
+  (num_vec-min(num_vec)) / (max(num_vec) - min(num_vec))
 }
diff --git a/R/functions-Params.R b/R/functions-Params.R
@@ -397,3 +397,8 @@ FilterIntensityParam <- function(threshold = 0, nValues = 1L, value = "maxo") {
     new("FilterIntensityParam", threshold = as.numeric(threshold),
         nValues = as.integer(nValues), value = value)
 }
+
+#' @rdname chromPeakSummary
+BetaDistributionParam <- function() {
+  new("BetaDistributionParam")
+}
diff --git a/R/functions-XCMSnExp.R b/R/functions-XCMSnExp.R
@@ -279,6 +279,7 @@ dropGenericProcessHistory <- function(x, fun) {
                        valsPerSpect = valsPerSpect, rtrange = rtr,
                        mzrange = mzr)
         if (length(mtx)) {
+            ## mtx: time, mz, intensity
             if (any(!is.na(mtx[, 3]))) {
                 ## How to calculate the area: (1)sum of all intensities / (2)by
                 ## the number of data points (REAL ones, considering also NAs)
@@ -290,21 +291,20 @@ dropGenericProcessHistory <- function(x, fun) {
                 ## as e.g. centWave. Using max(1, ... to avoid getting Inf in
                 ## case the signal is based on a single data point.
                 ## (3) rtr[2] - rtr[1]
-                res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE) *
+                res[i, "into"] <- sum(mtx[, 3L], na.rm = TRUE) *
                     ((rtr[2] - rtr[1]) /
                      max(1, (sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1)))
-                maxi <- which.max(mtx[, 3])
+                maxi <- which.max(mtx[, 3L])
                 res[i, c("rt", "maxo")] <- mtx[maxi[1], c(1, 3)]
                 res[i, c("rtmin", "rtmax")] <- rtr
                 ## Calculate the intensity weighted mean mz
                 meanMz <- do.call(mzCenterFun, list(mtx[, 2], mtx[, 3]))
                 if (is.na(meanMz)) meanMz <- mtx[maxi[1], 2]
                 res[i, "mz"] <- meanMz
-
-                if ("beta_cor" %in% cn) {
+                if ("beta_cor" %in% cn)
                     res[i, c("beta_cor", "beta_snr")] <- .get_beta_values(
-                        mtx[, 3L], mtx[, 1L])
-                }
+                        vapply(split(mtx[, 3L], mtx[, 1L]), sum, NA_real_),
+                        unique(mtx[, 1L]))
             } else {
                 res[i, ] <- rep(NA_real_, ncols)
             }

diff --git a/R/methods-Chromatogram.R b/R/methods-Chromatogram.R
@@ -31,8 +31,10 @@
 #' @return
 #'
 #' If called on a `Chromatogram` object, the method returns an [XChromatogram]
-#' object with the identified peaks. See [peaksWithCentWave()] for details on
-#' the peak matrix content.
+#' object with the identified peaks. Columns `"mz"`, `"mzmin"` and `"mzmax"` in
+#' the `chromPeaks()` peak matrix provide the mean m/z and the maximum and
+#' minimum m/z value of the `Chromatogram` object. See [peaksWithCentWave()]
+#' for details on the remaining columns.
 #'
 #' @seealso [peaksWithCentWave()] for the downstream function and [centWave]
 #'     for details on the method.
@@ -70,10 +72,18 @@ setMethod("findChromPeaks", signature(object = "Chromatogram",
                                            rt = rtime(object)),
                                       as(param, "list")))
               object <- as(object, "XChromatogram")
-              chromPeaks(object) <- res
+              chromPeaks(object) <- .add_mz(res, object@mz)
               object
           })
 
+.add_mz <- function(x, mz = c(NA_real_, NA_real_)) {
+    nx <- nrow(x)
+    tmp <- cbind(mz = rep(mean(mz), nx),
+                 mzmin = rep(mz[1L], nx),
+                 mzmax = rep(mz[2L], nx))
+    cbind(tmp, x)
+}
+
 #' @title matchedFilter-based peak detection in purely chromatographic data
 #'
 #' @description
@@ -97,8 +107,10 @@ setMethod("findChromPeaks", signature(object = "Chromatogram",
 #' @return
 #'
 #' If called on a `Chromatogram` object, the method returns a `matrix` with
-#' the identified peaks. See [peaksWithMatchedFilter()] for details on the
-#' matrix content.
+#' the identified peaks. Columns `"mz"`, `"mzmin"` and `"mzmax"` in
+#' the `chromPeaks()` peak matrix provide the mean m/z and the maximum and
+#' minimum m/z value of the `Chromatogram` object. See
+#' [peaksWithMatchedFilter()] for details on the remaining columns.
 #'
 #' @seealso [peaksWithMatchedFilter()] for the downstream function and
 #'     [matchedFilter] for details on the method.
@@ -134,7 +146,7 @@ setMethod("findChromPeaks", signature(object = "Chromatogram",
                                            rt = rtime(object)),
                                       as(param, "list")))
               object <- as(object, "XChromatogram")
-              chromPeaks(object) <- res
+              chromPeaks(object) <- .add_mz(res, object@mz)
               object
           })