diff --git a/DESCRIPTION b/DESCRIPTION index a9c46263..6fd20074 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: xcms -Version: 4.3.3 +Version: 4.3.4 Title: LC-MS and GC-MS Data Analysis Description: Framework for processing and visualization of chromatographically separated and single-spectra mass spectral data. Imports from AIA/ANDI NetCDF, diff --git a/NAMESPACE b/NAMESPACE index 24fb6605..41082821 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -461,7 +461,8 @@ export("CentWaveParam", "CleanPeaksParam", "MergeNeighboringPeaksParam", "FilterIntensityParam", - "ChromPeakAreaParam") + "ChromPeakAreaParam", + "BetaDistributionParam") ## Param class methods. ## New Classes @@ -530,7 +531,8 @@ exportMethods("hasChromPeaks", "featureSpectra", "chromPeakSpectra", "chromPeakChromatograms", - "featureChromatograms" + "featureChromatograms", + "chromPeakSummary" ) ## feature grouping functions and methods. diff --git a/NEWS.md b/NEWS.md index 1a47aa96..f3f1dec3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,17 @@ # xcms 4.3 +## Changes in version 4.3.4 + +- Address issue #765: peak detection on chromatographic data: report a + chromatogram's `"mz"`, `"mzmin"` and `"mzmax"` as the mean m/z and lower and + upper m/z in the `chromPeaks()` matrix. +- Fix calculation of the correlation coefficient for peak shape similarity with + an idealized bell shape (*beta*) during gap filling for centWave-based + chromatographic peak detection with parameter `verboseBetaColumns = TRUE`. +- Add `chromPeakSummary` generic (issue #705). +- Add `chromPeakSummary()` method to calculate the *beta* quality metrics. + + ## Changes in version 4.3.3 - Fix issue #755: `chromatogram()` with `msLevel = 2` fails to extract diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 72a0aeb9..82a23e6b 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -535,6 +535,65 @@ setGeneric("chromPeakData<-", function(object, value) setGeneric("chromPeakSpectra", function(object, ...) standardGeneric("chromPeakSpectra")) +#' @title Chromatographic peak summaries +#' +#' @name chromPeakSummary +#' +#' @description +#' +#' The `chromPeakSummary()` method calculates summary statistics or other +#' metrics for each of the identified chromatographic peaks in an *xcms* result +#' object, such as the [XcmsExperiment()]. Different metrics can be calculated, +#' depending upon (and configured by) using dedicated *parameter* classes. As a +#' result, the method returns a `matrix` or `data.frame` with one row per +#' chromatographic peak. Each column contains calculated values, depending on +#' the used method/parameter class. +#' +#' Currently implemented methods/parameter classes are: +#' +#' - `BetaDistributionParam`: calculates the *beta_cor* and *beta_snr* quality +#' metrics as described in Kumler 2023 representing the result from a +#' (correlation) test of similarity (using Pearson's correlation coefficient) +#' to a bell curve and the signal-to-noise ratio calculated on the residuals +#' of this test. +#' +#' @param BPPARAM Parallel processing setup. See [bpparam()] for details. +#' +#' @param chunkSize `integer(1)` defining the number of samples from which data +#' should be loaded and processed at a time. +#' +#' @param msLevel `integer(1)` with the MS level of the chromatographic peaks +#' on which the metric should be calculated. +#' +#' @param object an *xcms* result object containing information on +#' identified chromatographic peaks. +#' +#' @param param a parameter object defining the method/summaries that should +#' be calculated (see description above for supported parameter classes). +#' +#' @param ... additional arguments passed to the method implementation. +#' +#' @return +#' +#' A `matrix` or `data.frame` with the same number of rows as there are +#' chromatographic peaks. Columns contain the calculated values. The number of +#' columns, their names and content depend on the used parameter object. See +#' the respective documentation above for more details. +#' +#' @author Pablo Vangeenderhuysen, Johannes Rainer, William Kumler +#' +#' @md +#' +#' @references +#' +#' Kumler W, Hazelton B J and Ingalls A E (2023) "Picky with peakpicking: +#' assessing chromatographic peak quality with simple metrics in metabolomics" +#' *BMC Bioinformatics* 24(1):404. doi: 10.1186/s12859-023-05533-4 +#' +#' @export +setGeneric("chromPeakSummary", function(object, param, ...) + standardGeneric("chromPeakSummary")) + setGeneric("collect", function(object, ...) standardGeneric("collect")) setGeneric("consecMissedLimit", function(object, ...) standardGeneric("consecMissedLimit")) diff --git a/R/DataClasses.R b/R/DataClasses.R index d32073e3..c3fd0198 100644 --- a/R/DataClasses.R +++ b/R/DataClasses.R @@ -2182,3 +2182,8 @@ setClass("FilterIntensityParam", msg else TRUE }) + +setClass("BetaDistributionParam", + contains = "Param" + ) + \ No newline at end of file diff --git a/R/XcmsExperiment-functions.R b/R/XcmsExperiment-functions.R index 931bcdb7..e5c9b8fe 100644 --- a/R/XcmsExperiment-functions.R +++ b/R/XcmsExperiment-functions.R @@ -520,8 +520,9 @@ ## consider adding 0 or NA intensity for those. mat <- do.call(rbind, xsub) if (nrow(mat)) { + nr <- vapply(xsub, nrow, NA_integer_) ## can have 0, 1 or x values per rt; repeat rt accordingly - rts <- rep(rt[keep], vapply(xsub, nrow, integer(1L))) + rts <- rep(rt[keep], nr) maxi <- which.max(mat[, 2L])[1L] mmz <- do.call(mzCenterFun, list(mat[, 1L], mat[, 2L])) if (is.na(mmz)) mmz <- mat[maxi, 1L] @@ -530,15 +531,54 @@ sum(mat[, 2L], na.rm = TRUE) * ((rtr[2L] - rtr[1L]) / max(1L, (length(keep) - 1L))) ) - if ("beta_cor" %in% cn) + if ("beta_cor" %in% cn) { res[i, c("beta_cor", "beta_snr")] <- .get_beta_values( - mat[, 2L], rts) + vapply(xsub[nr > 0], function(z) sum(z[, "intensity"]), + NA_real_), + rt[keep][nr > 0]) + } } } } res[!is.na(res[, "maxo"]), , drop = FALSE] } + +#' Calculates quality metrics for a chromatographic peak. +#' +#' @param x `list` of peak matrices (from a single MS level and from a single +#' file/sample). +#' +#' @param rt retention time for each peak matrix. +#' +#' @param peakArea `matrix` defining the chrom peak area. +#' +#' @author Pablo Vangeenderhuysen +#' +#' @noRd +.chrom_peak_beta_metrics <- function(x, rt, peakArea, ...) { + res <- matrix(NA_real_, ncol = 2L, nrow = nrow(peakArea)) + rownames(res) <- rownames(peakArea) + colnames(res) <- c("beta_cor","beta_snr") + for (i in seq_len(nrow(res))) { + rtr <- peakArea[i, c("rtmin", "rtmax")] + keep <- which(between(rt, rtr)) + if (length(keep)) { + xsub <- lapply(x[keep], .pmat_filter_mz, + mzr = peakArea[i, c("mzmin", "mzmax")]) + nr <- vapply(xsub, nrow, NA_integer_) + res[i, c("beta_cor", "beta_snr")] <- .get_beta_values( + vapply(xsub[nr > 0], function(z) sum(z[, "intensity"]), NA_real_), + rt[keep][nr > 0]) + } + } + res +} + + + + + #' Difference to the original code is that the weighted mean is also calculated #' if some of the peak intensities in the profile matrix are 0 #' diff --git a/R/XcmsExperiment.R b/R/XcmsExperiment.R index db3efa05..5b76de69 100644 --- a/R/XcmsExperiment.R +++ b/R/XcmsExperiment.R @@ -2013,3 +2013,39 @@ setMethod( object[i = sort(unique(file)), keepAdjustedRtime = keepAdjustedRtime, keepFeatures = keepFeatures, ...] }) + +#' @rdname chromPeakSummary +setMethod( + "chromPeakSummary", + signature(object = "XcmsExperiment", param = "BetaDistributionParam"), + function(object, param, msLevel = 1L, chunkSize = 2L, BPPARAM = bpparam()) { + if (length(msLevel) != 1) + stop("Can only perform peak metrics for one MS level at a time.") + if (!hasChromPeaks(object, msLevel = msLevel)) + stop("No ChromPeaks definitions for MS level ", msLevel, " present.") + ## Define region to calculate metrics from for each file + cp <- chromPeaks(object, msLevel = msLevel) + f <- factor(cp[,"sample"], seq_along(object)) + pal <- split.data.frame(cp[, c("mzmin", "mzmax", "rtmin", "rtmax")], f) + names(pal) <- seq_along(pal) + ## Manual chunk processi ng because we have to split `object` and `pal` + idx <- seq_along(object) + chunks <- split(idx, ceiling(idx / chunkSize)) + pb <- progress_bar$new(format = paste0("[:bar] :current/:", + "total (:percent) in ", + ":elapsed"), + total = length(chunks) + 1L, clear = FALSE) + pb$tick(0) + # mzf <- "wMean" + res <- lapply(chunks, function(z, ...) { + pb$tick() + .xmse_integrate_chrom_peaks( + .subset_xcms_experiment(object, i = z, keepAdjustedRtime = TRUE, + ignoreHistory = TRUE), + pal = pal[z], intFun = .chrom_peak_beta_metrics, + msLevel = msLevel, BPPARAM = BPPARAM) + }) + res <- do.call(rbind, res) + pb$tick() + res + }) diff --git a/R/do_findChromPeaks-functions.R b/R/do_findChromPeaks-functions.R index 9767c8f0..11b59e9b 100644 --- a/R/do_findChromPeaks-functions.R +++ b/R/do_findChromPeaks-functions.R @@ -3720,14 +3720,20 @@ peaksWithCentWave <- function(int, rt, #' requires at least 5 scans or it will return NA for both parameters. #' #' @param intensity A numeric vector corresponding to the peak intensities +#' #' @param rtime A numeric vector corresponding to the retention times of each -#' intensity. If not provided, intensities will be assumed to be equally spaced. +#' intensity. Retention times are expected to be in increasing order +#' without duplicates. If not provided, intensities will be assumed to be +#' equally spaced. +#' #' @param skews A numeric vector of the skews to try, corresponding to the -#' shape1 of dbeta with a shape2 of 5. Values less than 5 will be increasingly -#' right-skewed, while values greater than 5 will be left-skewed. +#' shape1 of dbeta with a shape2 of 5. Values less than 5 will be +#' increasingly right-skewed, while values greater than 5 will be +#' left-skewed. +#' #' @param zero.rm Boolean value controlling whether "missing" scans are dropped -#' prior to curve fitting. The default, TRUE, will remove intensities of zero -#' or NA +#' prior to curve fitting. The default, TRUE, will remove intensities of +#' zero or NA #' #' @author William Kumler #' @@ -3740,21 +3746,22 @@ peaksWithCentWave <- function(int, rt, rtime <- rtime[keep] intensity <- intensity[keep] } - if(length(intensity)<5){ + if (length(intensity) < 5) { best_cor <- NA beta_snr <- NA } else { - beta_sequence <- rep(.scale_zero_one(rtime), each=length(skews)) + beta_sequence <- rep(.scale_zero_one(rtime), each = length(skews)) beta_vals <- t(matrix(dbeta(beta_sequence, shape1 = skews, shape2 = 5), nrow = length(skews))) # matplot(beta_vals) beta_cors <- cor(intensity, beta_vals) best_cor <- max(beta_cors) best_curve <- beta_vals[, which.max(beta_cors)] - noise_level <- sd(diff(.scale_zero_one(best_curve)-.scale_zero_one(intensity))) - beta_snr <- log10(max(intensity)/noise_level) + noise_level <- sd(diff(.scale_zero_one(best_curve) - + .scale_zero_one(intensity))) + beta_snr <- log10(max(intensity) / noise_level) } - c(best_cor=best_cor, beta_snr=beta_snr) + c(best_cor = best_cor, beta_snr = beta_snr) } @@ -3769,5 +3776,5 @@ peaksWithCentWave <- function(int, rt, #' #' @noRd .scale_zero_one <- function(num_vec){ - (num_vec-min(num_vec))/(max(num_vec)-min(num_vec)) + (num_vec-min(num_vec)) / (max(num_vec) - min(num_vec)) } diff --git a/R/functions-Params.R b/R/functions-Params.R index d790162c..b79a57b3 100644 --- a/R/functions-Params.R +++ b/R/functions-Params.R @@ -397,3 +397,8 @@ FilterIntensityParam <- function(threshold = 0, nValues = 1L, value = "maxo") { new("FilterIntensityParam", threshold = as.numeric(threshold), nValues = as.integer(nValues), value = value) } + +#' @rdname chromPeakSummary +BetaDistributionParam <- function() { + new("BetaDistributionParam") +} diff --git a/R/functions-XCMSnExp.R b/R/functions-XCMSnExp.R index 2bd44572..e2baa6d8 100644 --- a/R/functions-XCMSnExp.R +++ b/R/functions-XCMSnExp.R @@ -279,6 +279,7 @@ dropGenericProcessHistory <- function(x, fun) { valsPerSpect = valsPerSpect, rtrange = rtr, mzrange = mzr) if (length(mtx)) { + ## mtx: time, mz, intensity if (any(!is.na(mtx[, 3]))) { ## How to calculate the area: (1)sum of all intensities / (2)by ## the number of data points (REAL ones, considering also NAs) @@ -290,21 +291,20 @@ dropGenericProcessHistory <- function(x, fun) { ## as e.g. centWave. Using max(1, ... to avoid getting Inf in ## case the signal is based on a single data point. ## (3) rtr[2] - rtr[1] - res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE) * + res[i, "into"] <- sum(mtx[, 3L], na.rm = TRUE) * ((rtr[2] - rtr[1]) / max(1, (sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1))) - maxi <- which.max(mtx[, 3]) + maxi <- which.max(mtx[, 3L]) res[i, c("rt", "maxo")] <- mtx[maxi[1], c(1, 3)] res[i, c("rtmin", "rtmax")] <- rtr ## Calculate the intensity weighted mean mz meanMz <- do.call(mzCenterFun, list(mtx[, 2], mtx[, 3])) if (is.na(meanMz)) meanMz <- mtx[maxi[1], 2] res[i, "mz"] <- meanMz - - if ("beta_cor" %in% cn) { + if ("beta_cor" %in% cn) res[i, c("beta_cor", "beta_snr")] <- .get_beta_values( - mtx[, 3L], mtx[, 1L]) - } + vapply(split(mtx[, 3L], mtx[, 1L]), sum, NA_real_), + unique(mtx[, 1L])) } else { res[i, ] <- rep(NA_real_, ncols) } diff --git a/R/methods-Chromatogram.R b/R/methods-Chromatogram.R index cdff7f10..7fe59738 100644 --- a/R/methods-Chromatogram.R +++ b/R/methods-Chromatogram.R @@ -31,8 +31,10 @@ #' @return #' #' If called on a `Chromatogram` object, the method returns an [XChromatogram] -#' object with the identified peaks. See [peaksWithCentWave()] for details on -#' the peak matrix content. +#' object with the identified peaks. Columns `"mz"`, `"mzmin"` and `"mzmax"` in +#' the `chromPeaks()` peak matrix provide the mean m/z and the maximum and +#' minimum m/z value of the `Chromatogram` object. See [peaksWithCentWave()] +#' for details on the remaining columns. #' #' @seealso [peaksWithCentWave()] for the downstream function and [centWave] #' for details on the method. @@ -70,10 +72,18 @@ setMethod("findChromPeaks", signature(object = "Chromatogram", rt = rtime(object)), as(param, "list"))) object <- as(object, "XChromatogram") - chromPeaks(object) <- res + chromPeaks(object) <- .add_mz(res, object@mz) object }) +.add_mz <- function(x, mz = c(NA_real_, NA_real_)) { + nx <- nrow(x) + tmp <- cbind(mz = rep(mean(mz), nx), + mzmin = rep(mz[1L], nx), + mzmax = rep(mz[2L], nx)) + cbind(tmp, x) +} + #' @title matchedFilter-based peak detection in purely chromatographic data #' #' @description @@ -97,8 +107,10 @@ setMethod("findChromPeaks", signature(object = "Chromatogram", #' @return #' #' If called on a `Chromatogram` object, the method returns a `matrix` with -#' the identified peaks. See [peaksWithMatchedFilter()] for details on the -#' matrix content. +#' the identified peaks. Columns `"mz"`, `"mzmin"` and `"mzmax"` in +#' the `chromPeaks()` peak matrix provide the mean m/z and the maximum and +#' minimum m/z value of the `Chromatogram` object. See +#' [peaksWithMatchedFilter()] for details on the remaining columns. #' #' @seealso [peaksWithMatchedFilter()] for the downstream function and #' [matchedFilter] for details on the method. @@ -134,7 +146,7 @@ setMethod("findChromPeaks", signature(object = "Chromatogram", rt = rtime(object)), as(param, "list"))) object <- as(object, "XChromatogram") - chromPeaks(object) <- res + chromPeaks(object) <- .add_mz(res, object@mz) object }) diff --git a/R/methods-XChromatogram.R b/R/methods-XChromatogram.R index c7a24c1c..4cab15fc 100644 --- a/R/methods-XChromatogram.R +++ b/R/methods-XChromatogram.R @@ -28,10 +28,13 @@ setMethod("show", "XChromatogram", function(object) { #' retention time range for which peaks should be returned along with #' parameter `type` that defines how *overlapping* is defined (parameter #' description for details). For `XChromatogram` objects the function returns -#' a `matrix` with columns `"rt"` (retention time of the peak apex), -#' `"rtmin"` (the lower peak boundary), `"rtmax"` (the upper peak boundary), -#' `"into"` (the ingegrated peak signal/area of the peak), `"maxo"` (the -#' maximum instensity of the peak and `"sn"` (the signal to noise ratio). +#' a `matrix` with columns `"mz"` (mean m/z value), `"mzmin"` (minimal m/z +#' value) and `"mzmax"` (maximal m/z value), `"rt"` (retention time of the +#' peak apex), `"rtmin"` (the lower peak boundary in retention time +#' dimension), `"rtmax"` (the upper peak boundary in retention time +#' dimension), `"into"` (the integrated peak signal/area of the peak), +#' `"maxo"` (the maximum instensity of the peak and `"sn"` (the signal to +#' noise ratio). #' Note that, depending on the peak detection algorithm, the matrix may #' contain additional columns. #' For `XChromatograms` objects the `matrix` contains also columns `"row"` diff --git a/man/XChromatogram.Rd b/man/XChromatogram.Rd index 8b233081..9e21ad0f 100644 --- a/man/XChromatogram.Rd +++ b/man/XChromatogram.Rd @@ -451,10 +451,13 @@ chromatographic peak definitions. Parameter \code{rt} allows to specify a retention time range for which peaks should be returned along with parameter \code{type} that defines how \emph{overlapping} is defined (parameter description for details). For \code{XChromatogram} objects the function returns -a \code{matrix} with columns \code{"rt"} (retention time of the peak apex), -\code{"rtmin"} (the lower peak boundary), \code{"rtmax"} (the upper peak boundary), -\code{"into"} (the ingegrated peak signal/area of the peak), \code{"maxo"} (the -maximum instensity of the peak and \code{"sn"} (the signal to noise ratio). +a \code{matrix} with columns \code{"mz"} (mean m/z value), \code{"mzmin"} (minimal m/z +value) and \code{"mzmax"} (maximal m/z value), \code{"rt"} (retention time of the +peak apex), \code{"rtmin"} (the lower peak boundary in retention time +dimension), \code{"rtmax"} (the upper peak boundary in retention time +dimension), \code{"into"} (the integrated peak signal/area of the peak), +\code{"maxo"} (the maximum instensity of the peak and \code{"sn"} (the signal to +noise ratio). Note that, depending on the peak detection algorithm, the matrix may contain additional columns. For \code{XChromatograms} objects the \code{matrix} contains also columns \code{"row"} diff --git a/man/chromPeakSummary.Rd b/man/chromPeakSummary.Rd new file mode 100644 index 00000000..9fe313d6 --- /dev/null +++ b/man/chromPeakSummary.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllGenerics.R, R/XcmsExperiment.R, +% R/functions-Params.R +\name{chromPeakSummary} +\alias{chromPeakSummary} +\alias{chromPeakSummary,XcmsExperiment,BetaDistributionParam-method} +\alias{BetaDistributionParam} +\title{Chromatographic peak summaries} +\usage{ +chromPeakSummary(object, param, ...) + +\S4method{chromPeakSummary}{XcmsExperiment,BetaDistributionParam}( + object, + param, + msLevel = 1L, + chunkSize = 2L, + BPPARAM = bpparam() +) + +BetaDistributionParam() +} +\arguments{ +\item{object}{an \emph{xcms} result object containing information on +identified chromatographic peaks.} + +\item{param}{a parameter object defining the method/summaries that should +be calculated (see description above for supported parameter classes).} + +\item{...}{additional arguments passed to the method implementation.} + +\item{msLevel}{\code{integer(1)} with the MS level of the chromatographic peaks +on which the metric should be calculated.} + +\item{chunkSize}{\code{integer(1)} defining the number of samples from which data +should be loaded and processed at a time.} + +\item{BPPARAM}{Parallel processing setup. See \code{\link[=bpparam]{bpparam()}} for details.} +} +\value{ +A \code{matrix} or \code{data.frame} with the same number of rows as there are +chromatographic peaks. Columns contain the calculated values. The number of +columns, their names and content depend on the used parameter object. See +the respective documentation above for more details. +} +\description{ +The \code{chromPeakSummary()} method calculates summary statistics or other +metrics for each of the identified chromatographic peaks in an \emph{xcms} result +object, such as the \code{\link[=XcmsExperiment]{XcmsExperiment()}}. Different metrics can be calculated, +depending upon (and configured by) using dedicated \emph{parameter} classes. As a +result, the method returns a \code{matrix} or \code{data.frame} with one row per +chromatographic peak. Each column contains calculated values, depending on +the used method/parameter class. + +Currently implemented methods/parameter classes are: +\itemize{ +\item \code{BetaDistributionParam}: calculates the \emph{beta_cor} and \emph{beta_snr} quality +metrics as described in Kumler 2023 representing the result from a +(correlation) test of similarity (using Pearson's correlation coefficient) +to a bell curve and the signal-to-noise ratio calculated on the residuals +of this test. +} +} +\references{ +Kumler W, Hazelton B J and Ingalls A E (2023) "Picky with peakpicking: +assessing chromatographic peak quality with simple metrics in metabolomics" +\emph{BMC Bioinformatics} 24(1):404. doi: 10.1186/s12859-023-05533-4 +} +\author{ +Pablo Vangeenderhuysen, Johannes Rainer, William Kumler +} diff --git a/man/findChromPeaks-Chromatogram-CentWaveParam.Rd b/man/findChromPeaks-Chromatogram-CentWaveParam.Rd index 9f342440..04d90536 100644 --- a/man/findChromPeaks-Chromatogram-CentWaveParam.Rd +++ b/man/findChromPeaks-Chromatogram-CentWaveParam.Rd @@ -29,8 +29,10 @@ should be performed (only for \code{XChromatograms} objects). It defaults to } \value{ If called on a \code{Chromatogram} object, the method returns an \link{XChromatogram} -object with the identified peaks. See \code{\link[=peaksWithCentWave]{peaksWithCentWave()}} for details on -the peak matrix content. +object with the identified peaks. Columns \code{"mz"}, \code{"mzmin"} and \code{"mzmax"} in +the \code{chromPeaks()} peak matrix provide the mean m/z and the maximum and +minimum m/z value of the \code{Chromatogram} object. See \code{\link[=peaksWithCentWave]{peaksWithCentWave()}} +for details on the remaining columns. } \description{ \code{findChromPeaks} on a \link{Chromatogram} or \link{MChromatograms} object with a diff --git a/man/findChromPeaks-Chromatogram-MatchedFilter.Rd b/man/findChromPeaks-Chromatogram-MatchedFilter.Rd index 15444da3..31956c8c 100644 --- a/man/findChromPeaks-Chromatogram-MatchedFilter.Rd +++ b/man/findChromPeaks-Chromatogram-MatchedFilter.Rd @@ -17,8 +17,10 @@ arguments used for peak detection.} } \value{ If called on a \code{Chromatogram} object, the method returns a \code{matrix} with -the identified peaks. See \code{\link[=peaksWithMatchedFilter]{peaksWithMatchedFilter()}} for details on the -matrix content. +the identified peaks. Columns \code{"mz"}, \code{"mzmin"} and \code{"mzmax"} in +the \code{chromPeaks()} peak matrix provide the mean m/z and the maximum and +minimum m/z value of the \code{Chromatogram} object. See +\code{\link[=peaksWithMatchedFilter]{peaksWithMatchedFilter()}} for details on the remaining columns. } \description{ \code{findChromPeaks} on a \link{Chromatogram} or \link{MChromatograms} object with a diff --git a/tests/testthat/test_Param_classes.R b/tests/testthat/test_Param_classes.R index 4281d56f..4f1754cb 100644 --- a/tests/testthat/test_Param_classes.R +++ b/tests/testthat/test_Param_classes.R @@ -1002,3 +1002,9 @@ test_that("FilterIntensityParam works", { res@threshold <- c(10, 20) expect_error(validObject(res), "length 1") }) + + +test_that("BetaDistributionParam works", { + res <- BetaDistributionParam() + expect_true(is(res, "BetaDistributionParam")) +}) diff --git a/tests/testthat/test_XcmsExperiment.R b/tests/testthat/test_XcmsExperiment.R index 272a58db..e6ddb8a6 100644 --- a/tests/testthat/test_XcmsExperiment.R +++ b/tests/testthat/test_XcmsExperiment.R @@ -825,6 +825,19 @@ test_that(".chrom_peak_intensity_centWave works", { ## pks[11, ]. }) + +## That's from XcmsExperiment-functions.R +test_that(".chrom_peak_beta_metrics works", { + x <- Spectra::peaksData(spectra(xmse[2L])) + rt <- rtime(spectra(xmse[2L])) + pks <- chromPeaks(xmse)[chromPeaks(xmse)[, "sample"] == 2L, ] + + res <- .chrom_peak_beta_metrics(x, rt, pks, sampleIndex = 2L, + cn = colnames(pks)) + expect_equal(nrow(res), nrow(pks)) + +}) + ## That's from XcmsExperiment-functions.R test_that(".chrom_peak_intensity_matchedFilter works", { x <- Spectra::peaksData(spectra(xmse[2L])) @@ -1439,3 +1452,12 @@ test_that("fillChromPeaks,XcmsExperiment works with verboseBetaColumns", { pks_fil <- chromPeaks(res)[chromPeakData(res)$is_filled, ] expect_true(sum(is.na(pks_fil[, "beta_cor"])) < 4) }) + +test_that("chromPeakSummary,XcmsExperiment works", { + p <- CentWaveParam(noise = 10000, snthresh = 40, prefilter = c(3, 10000), + verboseBetaColumns = FALSE) + xmse <- findChromPeaks(mse, param = p) + mat <- chromPeakSummary(xmse,BetaDistributionParam()) + expect_true(all(c("beta_cor", "beta_snr") %in% colnames(mat))) + expect_true(is.numeric(mat)) +}) diff --git a/tests/testthat/test_do_findChromPeaks-functions.R b/tests/testthat/test_do_findChromPeaks-functions.R index 32592fdc..d855dca3 100644 --- a/tests/testthat/test_do_findChromPeaks-functions.R +++ b/tests/testthat/test_do_findChromPeaks-functions.R @@ -46,29 +46,29 @@ test_that("beta calculation returns expected values", { expect_lt(.get_beta_values(1:10, zero.rm = FALSE)["beta_snr"], 2) expect_lt(.get_beta_values(1:10)["best_cor"], 0.0001) expect_lt(.get_beta_values(1:10)["beta_snr"], 2) - + ideal_beta <- dbeta(seq(0, 1, length.out=10), 5, 5) expect_gte(.get_beta_values(ideal_beta, zero.rm = FALSE)["best_cor"], 1) expect_gte(.get_beta_values(ideal_beta, zero.rm = FALSE)["beta_snr"], 16) expect_gte(.get_beta_values(ideal_beta)["best_cor"], 0.97) expect_gte(.get_beta_values(ideal_beta)["beta_snr"], 1) - + skew_beta <- dbeta(seq(0, 1, length.out=10), 3, 5) expect_gte(.get_beta_values(ideal_beta, zero.rm = FALSE)["best_cor"], 1) expect_gte(.get_beta_values(ideal_beta, zero.rm = FALSE)["beta_snr"], 16) expect_gte(.get_beta_values(ideal_beta)["best_cor"], 0.97) expect_gte(.get_beta_values(ideal_beta)["beta_snr"], 1) - + rightskew_beta <- dbeta(seq(0, 1, length.out=10), 7, 5) expect_gt(.get_beta_values(rightskew_beta, skews = c(3,5,7))["best_cor"], 0.95) - + noise_beta <- dbeta(seq(0, 1, length.out=21), 5, 5)*10+runif(21) expect_gt(.get_beta_values(noise_beta)["best_cor"], 0.9) - + expect_no_error(.get_beta_values(runif(1))) expect_no_error(.get_beta_values(runif(10))) expect_no_error(.get_beta_values(runif(100))) - + expect_length(.get_beta_values(1), 2) expect_true(is.na(.get_beta_values(1)["best_cor"])) expect_true(is.na(.get_beta_values(1)["beta_snr"])) @@ -78,31 +78,31 @@ test_that("New beta columns perform as expected", { # faahko_xod comes from testthat.R # faahko_xod <- findChromPeaks( # faahko_od, param = CentWaveParam(noise = 10000, snthresh = 40, - # prefilter = c(3, 10000))) + # prefilter = c(3, 10000))) # Same params as before but with verboseBetaColumns = TRUE - beta_cwp <- CentWaveParam(noise = 10000, snthresh = 40, - prefilter = c(3, 10000), + beta_cwp <- CentWaveParam(noise = 10000, snthresh = 40, + prefilter = c(3, 10000), verboseBetaColumns = TRUE) faahko_xod_beta <- findChromPeaks(faahko_od, beta_cwp) - + # Unit test - check that the new object contains expected columns - expect_contains(colnames(chromPeaks(faahko_xod_beta)), + expect_contains(colnames(chromPeaks(faahko_xod_beta)), c("beta_cor", "beta_snr")) - + # Unit test - check that everything else in the object is the same orig_chrompeaks <- chromPeaks(faahko_xod) beta_chrompeaks <- chromPeaks(faahko_xod_beta) expect_identical(orig_chrompeaks, beta_chrompeaks[,colnames(orig_chrompeaks)]) - + # Object will contain NAs because there are peaks <5 scans wide expect_true(any(is.na(beta_chrompeaks[,"beta_snr"]))) beta_chrompeaks <- beta_chrompeaks[!is.na(beta_chrompeaks[,"beta_cor"]),] - + # Unit test - check that beta values make sense expect_true(all(beta_chrompeaks[,"beta_cor"]<=1)) expect_true(all(beta_chrompeaks[,"beta_cor"]>=-1)) expect_true(all(beta_chrompeaks[,"beta_snr"]>=0)) - + # Unit test - finds a single good peak (beta_cor>0.8, beta_snr>7) # Skinny peak copied from below peaksWithCentWave tests skinny_peak <- c(9107, 3326, 9523, 3245, 3429, 9394, 1123, 935, 5128, 8576, @@ -113,33 +113,33 @@ test_that("New beta columns perform as expected", { 8283, 3410, 5935, 3332, 7041, 3284, 7478, 76, 3739, 2158, 5507) skinny_peak_rt <- seq_along(skinny_peak)+100 cw_output_beta <- .centWave_orig(int = skinny_peak, scantime = skinny_peak_rt, - mz=sort(rnorm(60)/1000+100), + mz=sort(rnorm(60)/1000+100), valsPerSpect = rep(1, length(skinny_peak)), - peakwidth = c(20, 80), extendLengthMSW = TRUE, + peakwidth = c(20, 80), extendLengthMSW = TRUE, verboseBetaColumns = TRUE, snthresh = 0) expect_equal(nrow(cw_output_beta), 1) # Known values to ensure performance doesn't degrade unexpectedly expect_gt(cw_output_beta[,"beta_cor"], 0.8) expect_gt(cw_output_beta[,"beta_snr"], 7) - + # Unit test - finds a single noise peak (beta_cor < 0.5, beta_snr < 6) # set.seed(123) # noise_peak <- round(runif(100), 3) - noise_peak <- c(0.288, 0.788, 0.409, 0.883, 0.94, 0.046, 0.528, 0.892, 0.551, - 0.457, 0.957, 0.453, 0.678, 0.573, 0.103, 0.9, 0.246, 0.042, - 0.328, 0.955, 0.89, 0.693, 0.641, 0.994, 0.656, 0.709, 0.544, - 0.594, 0.289, 0.147, 0.963, 0.902, 0.691, 0.795, 0.025, 0.478, - 0.758, 0.216, 0.318, 0.232, 0.143, 0.415, 0.414, 0.369, 0.152, - 0.139, 0.233, 0.466, 0.266, 0.858, 0.046, 0.442, 0.799, 0.122, - 0.561, 0.207, 0.128, 0.753, 0.895, 0.374, 0.665, 0.095, 0.384, - 0.274, 0.815, 0.449, 0.81, 0.812, 0.794, 0.44, 0.754, 0.629, - 0.71, 0.001, 0.475, 0.22, 0.38, 0.613, 0.352, 0.111, 0.244, - 0.668, - 0.418, 0.788, 0.103, 0.435, 0.985, 0.893, 0.886, 0.175, 0.131, + noise_peak <- c(0.288, 0.788, 0.409, 0.883, 0.94, 0.046, 0.528, 0.892, 0.551, + 0.457, 0.957, 0.453, 0.678, 0.573, 0.103, 0.9, 0.246, 0.042, + 0.328, 0.955, 0.89, 0.693, 0.641, 0.994, 0.656, 0.709, 0.544, + 0.594, 0.289, 0.147, 0.963, 0.902, 0.691, 0.795, 0.025, 0.478, + 0.758, 0.216, 0.318, 0.232, 0.143, 0.415, 0.414, 0.369, 0.152, + 0.139, 0.233, 0.466, 0.266, 0.858, 0.046, 0.442, 0.799, 0.122, + 0.561, 0.207, 0.128, 0.753, 0.895, 0.374, 0.665, 0.095, 0.384, + 0.274, 0.815, 0.449, 0.81, 0.812, 0.794, 0.44, 0.754, 0.629, + 0.71, 0.001, 0.475, 0.22, 0.38, 0.613, 0.352, 0.111, 0.244, + 0.668, + 0.418, 0.788, 0.103, 0.435, 0.985, 0.893, 0.886, 0.175, 0.131, 0.653, 0.344, 0.657, 0.32, 0.188, 0.782, 0.094, 0.467, 0.512) - cw_output_beta <- .centWave_orig(int = noise_peak*100000, + cw_output_beta <- .centWave_orig(int = noise_peak*100000, scantime = seq_along(noise_peak), - mz=rep(530.1, length(noise_peak)), + mz=rep(530.1, length(noise_peak)), valsPerSpect = rep(1, length(noise_peak)), peakwidth = c(20, 80), extendLengthMSW = TRUE, verboseBetaColumns = TRUE, snthresh = 0) @@ -149,6 +149,21 @@ test_that("New beta columns perform as expected", { expect_lt(cw_output_beta[,"beta_snr"], 6) }) +test_that(".get_beta_values works with chromatographic and MS data", { + vals <- c(2, 3, 4, 6, 8, 10, 7, 6, 4, 3, 2) + res <- .get_beta_values(vals) + ## intensity values with duplicated retention times + vals <- c(2, 3, 4, 6, 2, 8, 10, 7, 6, 4, 3, 2) + rts <- c(1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10) + res_a <- .get_beta_values(vals) + expect_true(res_a[1L] < res[1L]) + ## WARNING: does not work if we have duplicated retentio times! + expect_warning( + res_b <- .get_beta_values(vals, rts) + ) + expect_true(is.na(res_b[1L])) +}) + test_that("do_findChromPeaks_centWaveWithPredIsoROIs works", { skip_on_os(os = "windows", arch = "i386") diff --git a/tests/testthat/test_functions-Chromatogram.R b/tests/testthat/test_functions-Chromatogram.R index 6616d20b..10e78ac4 100644 --- a/tests/testthat/test_functions-Chromatogram.R +++ b/tests/testthat/test_functions-Chromatogram.R @@ -52,13 +52,15 @@ test_that(".chrom_merge_neighboring_peaks works", { expect_equal(res$chromPeakData$index, 3L) ## Check "into" calculation. - pks <- rbind(pks[-c(1, 2), ], - c(18, pks[2, "rtmin"], pks[2, "rtmin"] + 4, NA_real_, - NA_real_, 3, 3), - c(20, pks[2, "rtmax"] - 5, pks[2, "rtmax"], NA_real_, - NA_real_, 9, 8)) + pks2 <- rbind(pks[-c(1, 2), ], + c(NA_real_, NA_real_, NA_real_, 18, pks[2, "rtmin"], + pks[2, "rtmin"] + 4, NA_real_, + NA_real_, 3, 3), + c(NA_real_, NA_real_, NA_real_, 20, pks[2, "rtmax"] - 5, + pks[2, "rtmax"], NA_real_, + NA_real_, 9, 8)) res <- .chrom_merge_neighboring_peaks( - chr, pks, pkd, diffRt = 5, minProp = 0.75) + chr, pks2, pkd, diffRt = 5, minProp = 0.75) expect_equal(unname(res$chromPeaks[1, "into"]), unname(chromPeaks(xchr)[2, "into"])) }) diff --git a/tests/testthat/test_methods-Chromatogram.R b/tests/testthat/test_methods-Chromatogram.R index 2746b6ef..a2884fae 100644 --- a/tests/testthat/test_methods-Chromatogram.R +++ b/tests/testthat/test_methods-Chromatogram.R @@ -55,3 +55,22 @@ test_that("removeIntensity,Chromatogram works", { expect_equal(intensity(res), c(NA_real_, NA_real_, NA_real_, 22, 34, NA_real_, NA_real_)) }) + +test_that(".add_mz works", { + a <- matrix(nrow = 0, ncol = 3) + colnames(a) <- c("rt", "rtmin", "rtmax") + res <- .add_mz(a, c(1, 2)) + expect_equal(ncol(res), 6) + expect_equal(colnames(res), c("mz", "mzmin", "mzmax", + "rt", "rtmin", "rtmax")) + expect_true(nrow(res) == 0) + a <- matrix(nrow = 2, ncol = 3) + colnames(a) <- c("rt", "rtmin", "rtmax") + res <- .add_mz(a, c(1, 2)) + expect_equal(ncol(res), 6) + expect_equal(colnames(res), c("mz", "mzmin", "mzmax", + "rt", "rtmin", "rtmax")) + expect_equal(res[, "mzmin"], c(1, 1)) + expect_equal(res[, "mzmax"], c(2, 2)) + expect_equal(res[, "mz"], c(1.5, 1.5)) +}) diff --git a/vignettes/references.bib b/vignettes/references.bib index 78d8c894..bf6a2aed 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -122,3 +122,58 @@ @article{gattoMSnbaseEfficientElegant2020a language = {eng}, pmid = {32902283} } + +@article{Kumler2023, + title = "Picky with peakpicking: assessing chromatographic peak quality + with simple metrics in metabolomics", + author = "Kumler, William and Hazelton, Bryna J and Ingalls, Anitra E", + abstract = "BACKGROUND: Chromatographic peakpicking continues to represent a + significant bottleneck in automated LC-MS workflows. + Uncontrolled false discovery rates and the lack of + manually-calibrated quality metrics require researchers to + visually evaluate individual peaks, requiring large amounts of + time and breaking replicability. This problem is exacerbated in + noisy environmental datasets and for novel separation methods + such as hydrophilic interaction columns in metabolomics, + creating a demand for a simple, intuitive, and robust metric of + peak quality. RESULTS: Here, we manually labeled four HILIC + oceanographic particulate metabolite datasets to assess the + performance of individual peak quality metrics. We used these + datasets to construct a predictive model calibrated to the + likelihood that visual inspection by an MS expert would include + a given mass feature in the downstream analysis. We implemented + two novel peak quality metrics, a custom signal-to-noise metric + and a test of similarity to a bell curve, both calculated from + the raw data in the extracted ion chromatogram, and found that + these outperformed existing measurements of peak quality. A + simple logistic regression model built on two metrics reduced + the fraction of false positives in the analysis from 70-80\% + down to 1-5\% and showed minimal overfitting when applied to + novel datasets. We then explored the implications of this + quality thresholding on the conclusions obtained by the + downstream analysis and found that while only 10\% of the + variance in the dataset could be explained by depth in the + default output from the peakpicker, approximately 40\% of the + variance was explained when restricted to high-quality peaks + alone. CONCLUSIONS: We conclude that the poor performance of + peakpicking algorithms significantly reduces the power of both + univariate and multivariate statistical analyses to detect + environmental differences. We demonstrate that simple models + built on intuitive metrics and derived from the raw data are + more robust and can outperform more complex models when applied + to new data. Finally, we show that in properly curated datasets, + depth is a major driver of variability in the marine microbial + metabolome and identify several interesting metabolite trends + for future investigation.", + journal = "BMC Bioinformatics", + publisher = "Springer Science and Business Media LLC", + volume = 24, + number = 1, + pages = "404", + month = oct, + year = 2023, + keywords = "Ground Truth Dataset; Marine environment; Mass-spectrometry; + Metabolomics; Peakpicking; XCMS", + copyright = "https://creativecommons.org/licenses/by/4.0", + language = "en" +} diff --git a/vignettes/xcms.Rmd b/vignettes/xcms.Rmd index 3f59a2e9..ba16810b 100644 --- a/vignettes/xcms.Rmd +++ b/vignettes/xcms.Rmd @@ -431,6 +431,56 @@ numeric). chromPeakData(faahko) ``` +### Chromatographic peak quality + +Based on the publication by Kumler et al. published in 2023 [@Kumler2023], new +quality metrics (*beta_cor* and *beta_snr*) were added to *xcms*. *beta_cor* +indicates how bell-shaped the chromatographic peak is and *beta_snr* is +estimating the signal-to-noise ratio using the residuals from this fit. These +metrics can be calculated during peak picking by setting `verboseBetaColumns = +TRUE` in the `CentWaveParam` object, or they can be calculated afterwards by +using the `chromPeakSummary()` function with the `XcmsExperiment` object and +the `BetaDistributionParam` parameter object as input: + +```{r peak-detection-chromPeakSummary} +beta_metrics <- chromPeakSummary(faahko, BetaDistributionParam()) +head(beta_metrics) + +``` + +The result returned by `chromPeakSummary()` is thus a numeric matrix with the +values for these quality estimates, one row for each chromatographic peak. +Using summary statistics, one can explore the distribution of these metrics in +the data. + +```{r beta-metrics} +summary(beta_metrics) +``` + +Visual inspection gives a better idea of what these metrics represent in terms +of peak quality in the data at hand. This information can be used to e.g. +filter out peaks that don't meet a chosen quality metric threshold. In order to +plot the detected peaks, their EIC can be extracted with the function +`chromPeakChromatograms`. An example of a peak with a high *beta_cor* and for +a peak with a low *beta_cor* score is given below. + +```{r chromPeakChromatograms, message=FALSE} +beta_metrics[c(4, 6), ] +eics <- chromPeakChromatograms( + faahko, peaks = rownames(chromPeaks(faahko))[c(4, 6)]) +``` + +```{r peak-quality-metrics, fig.width = 10, fig.height = 5, fig.cap = "Plots of high and low quality peaks. Left: peak CP0004 with a beta_cor = 0.98, right: peak CP0006 with a beta_cor = 0.13."} +peak_1 <- eics[1] +peak_2 <- eics[2] +par(mfrow = c(1, 2)) +plot(peak_1) +plot(peak_2) +``` + + +### Refining peak detection + Peak detection will not always work perfectly for all types of peak shapes present in the data set leading to peak detection artifacts, such as (partially or completely) overlapping peaks or artificially split peaks (common issues @@ -459,7 +509,7 @@ faahko_pp <- refineChromPeaks(faahko, mpp) An example for a merged peak is given below. -```{r peak-postprocessing-merged, fig.widht = 10, fig.height = 5, fig.cap = "Result from the peak refinement step. Left: data before processing, right: after refinement. The splitted peak was merged into one."} +```{r peak-postprocessing-merged, fig.width = 10, fig.height = 5, fig.cap = "Result from the peak refinement step. Left: data before processing, right: after refinement. The splitted peak was merged into one."} mzr_1 <- 305.1 + c(-0.01, 0.01) chr_1 <- chromatogram(faahko[1], mz = mzr_1) chr_2 <- chromatogram(faahko_pp[1], mz = mzr_1) @@ -474,7 +524,7 @@ plot(chr_2) peaks into a single one (right panel in the figure above). Other close peaks, with a lower intensity between them, were however not merged (see below). -```{r peak-postprocessing-not-merged, fig.widht = 10, fig.height = 5, fig.cap = "Result from the peak refinement step. Left: data before processing, right: after refinement. The peaks were not merged."} +```{r peak-postprocessing-not-merged, fig.width = 10, fig.height = 5, fig.cap = "Result from the peak refinement step. Left: data before processing, right: after refinement. The peaks were not merged."} mzr_1 <- 496.2 + c(-0.01, 0.01) chr_1 <- chromatogram(faahko[1], mz = mzr_1) chr_2 <- chromatogram(faahko_pp[1], mz = mzr_1)