From 477706601ef5fed401e9d79c00241a16bdddfd2c Mon Sep 17 00:00:00 2001 From: jorainer Date: Fri, 22 Dec 2023 12:36:29 +0100 Subject: [PATCH] feat: add SingleMatchParam - Add `SingleMatchParam` to allow easy reduction of matches between each query and target elements from 1:n to 1:(0,1). - Add `queryVariables` and `targetVariables` functions to extract the corresponding variables/colnames. --- .github/workflows/check-bioc.yml | 3 +- NEWS.md | 2 + R/Matched.R | 125 +++++++++++++++++++++++-------- man/Matched.Rd | 74 +++++++++++------- tests/testthat/test_Matched.R | 96 ++++++++++++++++++++++++ 5 files changed, 240 insertions(+), 60 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 71306d8..4241ecb 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -22,7 +22,8 @@ on: push: - pull_request: + paths-ignore: + - 'README.md' name: R-CMD-check-bioc diff --git a/NEWS.md b/NEWS.md index ebf3659..6cfa62e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,8 @@ ## Changes in 1.7.3 +- Add `SingleMatchParam` for `filterMatches` to allow selection of (at most) a + single match to a target element for each query element. - Add new methods `queryVariables` and `targetVariables` to extract the names of variables (columns) of *query* and *target*. diff --git a/R/Matched.R b/R/Matched.R index 1c31bf6..20f0387 100644 --- a/R/Matched.R +++ b/R/Matched.R @@ -23,7 +23,7 @@ #' objects (including `SummarizedExperiment` or `QFeatures`). For [QFeatures()] #' objects matches to only one of the *assays* within the object is supported. #' -#' @section Creation and subsetting: +#' @section Creation and general handling: #' #' `Matched` object is returned as result from the [matchValues()] function. #' @@ -32,11 +32,6 @@ #' `data.frame` with two columns of integer indices defining which elements #' from *query* match which element from *target*. 
#' -#' - `[`: subset the object selecting `query` object elements to keep with -#' parameter `i`. The resulting object will contain all the matches -#' for the selected query elements. The `target` object will by default be -#' returned as-is. -#' #' - `addMatches`: add new matches to an existing object. Parameters #' `queryValue` and `targetValue` allow to define which element(s) in #' `query` and `target` should be considered matching. If `isIndex = TRUE`, @@ -57,25 +52,23 @@ #' in a single `Matched` object representing updated matches. Note that `FUN` #' has to return a `Matched` object. #' +#' - `lapply`: applies a user defined function `FUN` to each subset of +#' matches in a `Matched` object for each `query` element (i.e. to each `x[i]` +#' with `i` from `1` to `length(x)`). It returns a `list` of `length(object)` +#' elements where each element is the output of `FUN` applied to each subset +#' of matches. +#' +#' +#' @section Filtering and subsetting: +#' +#' - `[`: subset the object selecting `query` object elements to keep with +#' parameter `i`. The resulting object will contain all the matches +#' for the selected query elements. The `target` object will by default be +#' returned as-is. +#' #' - `filterMatches`: filter matches in a `Matched` object using different #' approaches depending on the class of `param`: #' -#' - `SingleMatchParam`: reduces matches to keep only (at most) a -#' single match per query. The deduplication strategy can be defined with -#' parameter `duplicates`: -#' - `duplicates = "remove"`: all matches for query elements matching more -#' than one target element will be removed. -#' - `duplicates = "closest"`: keep only the *closest* match for each -#' query element. The closest match is defined by the value(s) of -#' *score* (and eventually *score_rt*, if present). The one match with -#' the smallest value for this (these) column(s) is retained. This is -#' equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`. 
-#' - `duplicates = "top_ranked"`: select the best ranking match for each -#' query element. Parameter `column` allows to specify the column by -#' which matches are ranked (use LLLLLL to list possible columns). -#' The column. Parameter `decreasing` allows -#' to define whether the match with the highest (`decreasing = TRUE`) -#' or lowest (`decreasing = FALSE`) value will be selected. #' - `ScoreThresholdParam`: keeps only the matches whose score is strictly #' above or strictly below a certain threshold (respectively when parameter #' `above = TRUE` and `above = FALSE`). The name of the column containing @@ -87,6 +80,7 @@ #' used to filter matches based on retention time scores for `Matched` #' objects returned by [matchValues()] when `param` objects involving a #' retention time comparison are used. +#' #' - `SelectMatchesParam`: keeps or removes (respectively when parameter #' `keep = TRUE` and `keep = FALSE`) matches corresponding to certain #' indices or values of `query` and `target`. If `queryValue` and @@ -96,6 +90,25 @@ #' from the [matches()] matrix from the `Matched` object but thus not alter #' the `query` or `target` in the object. See examples below for more #' information. +#' +#' - `SingleMatchParam`: reduces matches to keep only (at most) a +#' single match per query. The deduplication strategy can be defined with +#' parameter `duplicates`: +#' - `duplicates = "remove"`: all matches for query elements matching more +#' than one target element will be removed. +#' - `duplicates = "closest"`: keep only the *closest* match for each +#' query element. The closest match is defined by the value(s) of +#' *score* (and eventually *score_rt*, if present). The one match with +#' the smallest value for this (these) column(s) is retained. This is +#' equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`. +#' - `duplicates = "top_ranked"`: select the *best ranking* match for each +#' query element. 
Parameter `column` allows to specify the column by +#' which matches are ranked (use `targetVariables(object)` or +#' `scoreVariables(object)` to list possible columns). Parameter +#' `decreasing` allows to define whether the match with the highest +#' (`decreasing = TRUE`) or lowest (`decreasing = FALSE`) value in +#' `column` for each *query* will be selected. +#' #' - `TopRankedMatchesParam`: for each query element the matches are ranked #' according to their score and only the `n` best of them are kept (if `n` #' is larger than the number of matches for a given query element all the @@ -114,16 +127,9 @@ #' small (or, depending on parameter `decreasing`, large) values for #' `"score"` **and** `"score_rt"` are returned. #' -#' - `lapply`: applies a user defined function `FUN` to each subset of -#' matches in a `Matched` object for each `query` element (i.e. to each `x[i]` -#' with `i` from `1` to `length(x)`). It returns a `list` of `length(object)` -#' elements where each element is the output of `FUN` applied to each subset -#' of matches. -#' #' - `pruneTarget`: *cleans* the object by removing non-matched #' **target** elements. #' -#' #' @section Extracting data: #' #' - `$` extracts a single variable from the `Matched` `x`. The variables that @@ -202,7 +208,9 @@ #' #' @param column for `ScoreThresholdParam`: `character(1)` specifying the name #' of the score variable to consider for the filtering (the default is -#' `column = "score"`). +#' `column = "score"`). For `SingleMatchParam`: `character(1)` defining the +#' name of the column to be used for de-duplication. See description of +#' `SingleMatchParam` in the *Filtering and subsetting* section for details. #' #' @param columns for `matchedData`: `character` vector with column names of #' variables that should be extracted. @@ -213,6 +221,10 @@ #' #' @param drop for `[`: ignored. #' +#' @param duplicates for `SingleMatchParam`: `character(1)` defining the +#' *de-duplication* strategy. 
See the description of `SingleMatchParam` in +#' the *Filtering and subsetting* subsection for choices and details. +#' #' @param FUN for `lapply` and `endoapply`: user defined `function` that takes a #' `Matched` object as a first parameter and possibly additional parameters #' (that need to be provided in the `lapply` or `endoapply` call. For lapply @@ -1286,6 +1298,59 @@ SingleMatchParam <- function(duplicates = c("remove", "closest", "top_ranked"), decreasing = decreasing[1L]) } +#' @rdname Matched +#' +#' @export +setMethod( + "filterMatches", c("Matched", "SingleMatchParam"), + function (object, param, ...) { + if (!param@column %in% c(scoreVariables(object), + targetVariables(object))) + stop("Variable \"", param@column, "\" not found. `column` ", + "should be one of 'scoreVariables(object)' or ", + "'targetVariables(object)'.") + object@metadata <- c(object@metadata, param = param) + if (!nrow(object@matches)) + return(object) + switch( + param@duplicates[1L], + "remove" = { + s <- split(seq_len(nrow(object@matches)), + object@matches$query_idx) + keep <- unlist(s[lengths(s) == 1L], use.names = FALSE) + object@matches <- object@matches[keep, , drop = FALSE] + }, + "closest" = { + object <- filterMatches( + object, TopRankedMatchesParam(n = 1L, decreasing = FALSE)) + }, + "top_ranked" = { + ## Rank matches by "column" + if (param@column %in% scoreVariables(object)) + vals <- cbind(seq_len(nrow(object@matches)), + object@matches$query_idx, + object@matches[, param@column]) + else + vals <- cbind( + seq_len(nrow(object@matches)), + object@matches$query_idx, + .extract_elements( + .objectToMatch(object@target, object@targetAssay), + object@matches$target_idx, + sub("target_", "", param@column))) + vals <- vals[order(vals[, 3L], + decreasing = param@decreasing), , + drop = FALSE] + keep <- vals[match(unique(object@matches$query_idx), + vals[, 2L]), 1L] + object@matches <- object@matches[keep, , drop = FALSE] + }, + stop("'duplicates' has to be one of \"remove\", 
\"closest\"", + " or \"top_ranked\".")) + validObject(object) + object + }) + #' @importFrom MsCoreUtils rbindFill .addMatches <- function(query, target, matches, queryValue = integer(), targetValue = integer(), queryColname = character(), diff --git a/man/Matched.Rd b/man/Matched.Rd index 88a54d4..149c783 100644 --- a/man/Matched.Rd +++ b/man/Matched.Rd @@ -35,6 +35,7 @@ \alias{filterMatches,Matched,TopRankedMatchesParam-method} \alias{filterMatches,Matched,ScoreThresholdParam-method} \alias{SingleMatchParam} +\alias{filterMatches,Matched,SingleMatchParam-method} \alias{addMatches,Matched-method} \alias{endoapply,ANY-method} \alias{endoapply,Matched-method} @@ -132,6 +133,8 @@ SingleMatchParam( decreasing = TRUE ) +\S4method{filterMatches}{Matched,SingleMatchParam}(object, param, ...) + \S4method{addMatches}{Matched}( object, queryValue = integer(), @@ -245,7 +248,13 @@ threshold.} \item{column}{for \code{ScoreThresholdParam}: \code{character(1)} specifying the name of the score variable to consider for the filtering (the default is -\code{column = "score"}).} +\code{column = "score"}). For \code{SingleMatchParam}: \code{character(1)} defining the +name of the column to be used for de-duplication. See description of +\code{SingleMatchParam} in the \emph{Filtering and subsetting} section for details.} + +\item{duplicates}{for \code{SingleMatchParam}: \code{character(1)} defining the +\emph{de-duplication} strategy. See the description of \code{SingleMatchParam} in +the \emph{Filtering and subsetting} subsection for choices and details.} \item{score}{for \code{addMatches}: \code{numeric} (same length than \code{queryValue}) or \code{data.frame} (same number of rows than \code{queryValue}) specifying the scores @@ -275,7 +284,7 @@ between elements of one-dimensional objects, or rows for two-dimensional objects (including \code{SummarizedExperiment} or \code{QFeatures}). 
For \code{\link[=QFeatures]{QFeatures()}} objects matches to only one of the \emph{assays} within the object is supported. } -\section{Creation and subsetting}{ +\section{Creation and general handling}{ \code{Matched} object is returned as result from the \code{\link[=matchValues]{matchValues()}} function. @@ -285,10 +294,6 @@ function providing the \code{query} and \code{target} objects as well as the \co \code{data.frame} with two columns of integer indices defining which elements from \emph{query} match which element from \emph{target}. \itemize{ -\item \code{[}: subset the object selecting \code{query} object elements to keep with -parameter \code{i}. The resulting object will contain all the matches -for the selected query elements. The \code{target} object will by default be -returned as-is. \item \code{addMatches}: add new matches to an existing object. Parameters \code{queryValue} and \code{targetValue} allow to define which element(s) in \code{query} and \code{target} should be considered matching. If \code{isIndex = TRUE}, @@ -307,27 +312,24 @@ matches in a \code{Matched} object corresponding to a \code{query} element (i.e. each \code{x[i]} with \code{i} being 1 to \code{length(x)}). The results are then combined in a single \code{Matched} object representing updated matches. Note that \code{FUN} has to return a \code{Matched} object. +\item \code{lapply}: applies a user defined function \code{FUN} to each subset of +matches in a \code{Matched} object for each \code{query} element (i.e. to each \code{x[i]} +with \code{i} from \code{1} to \code{length(x)}). It returns a \code{list} of \code{length(object)} +elements where each element is the output of \code{FUN} applied to each subset +of matches. +} +} + +\section{Filtering and subsetting}{ + +\itemize{ +\item \code{[}: subset the object selecting \code{query} object elements to keep with +parameter \code{i}. The resulting object will contain all the matches +for the selected query elements. 
The \code{target} object will by default be +returned as-is. \item \code{filterMatches}: filter matches in a \code{Matched} object using different approaches depending on the class of \code{param}: \itemize{ -\item \code{SingleMatchParam}: reduces matches to keep only (at most) a -single match per query. The deduplication strategy can be defined with -parameter \code{duplicates}: -\itemize{ -\item \code{duplicates = "remove"}: all matches for query elements matching more -than one target element will be removed. -\item \code{duplicates = "closest"}: keep only the \emph{closest} match for each -query element. The closest match is defined by the value(s) of -\emph{score} (and eventually \emph{score_rt}, if present). The one match with -the smallest value for this (these) column(s) is retained. This is -equivalent to \code{TopRankedMatchesParam(n = 1L, decreasing = FALSE)}. -\item \code{duplicates = "top_ranked"}: select the best ranking match for each -query element. Parameter \code{column} allows to specify the column by -which matches are ranked (use LLLLLL to list possible columns). -The column. Parameter \code{decreasing} allows -to define whether the match with the highest (\code{decreasing = TRUE}) -or lowest (\code{decreasing = FALSE}) value will be selected. -} \item \code{ScoreThresholdParam}: keeps only the matches whose score is strictly above or strictly below a certain threshold (respectively when parameter \code{above = TRUE} and \code{above = FALSE}). The name of the column containing @@ -345,6 +347,25 @@ indices or values of \code{query} and \code{target}. If \code{queryValue} and \code{targetValue} are provided, matches for these value pairs are kept or removed. Parameter index\verb{allows to filter matches providing their index in the [matches()] matrix. Note that}filterMatches\verb{removes only matches from the [matches()] matrix from the}Matched\verb{object but thus not alter the}query\code{or}target` in the object. 
See examples below for more information. +\item \code{SingleMatchParam}: reduces matches to keep only (at most) a +single match per query. The deduplication strategy can be defined with +parameter \code{duplicates}: +\itemize{ +\item \code{duplicates = "remove"}: all matches for query elements matching more +than one target element will be removed. +\item \code{duplicates = "closest"}: keep only the \emph{closest} match for each +query element. The closest match is defined by the value(s) of +\emph{score} (and eventually \emph{score_rt}, if present). The one match with +the smallest value for this (these) column(s) is retained. This is +equivalent to \code{TopRankedMatchesParam(n = 1L, decreasing = FALSE)}. +\item \code{duplicates = "top_ranked"}: select the \emph{best ranking} match for each +query element. Parameter \code{column} allows to specify the column by +which matches are ranked (use \code{targetVariables(object)} or +\code{scoreVariables(object)} to list possible columns). Parameter +\code{decreasing} allows to define whether the match with the highest +(\code{decreasing = TRUE}) or lowest (\code{decreasing = FALSE}) value in +\code{column} for each \emph{query} will be selected. +} \item \code{TopRankedMatchesParam}: for each query element the matches are ranked according to their score and only the \code{n} best of them are kept (if \code{n} is larger than the number of matches for a given query element all the @@ -363,11 +384,6 @@ is performed on the absolute value of \code{"score_rt"}). Thus, matches with small (or, depending on parameter \code{decreasing}, large) values for \code{"score"} \strong{and} \code{"score_rt"} are returned. } -\item \code{lapply}: applies a user defined function \code{FUN} to each subset of -matches in a \code{Matched} object for each \code{query} element (i.e. to each \code{x[i]} -with \code{i} from \code{1} to \code{length(x)}). 
It returns a \code{list} of \code{length(object)} -elements where each element is the output of \code{FUN} applied to each subset -of matches. \item \code{pruneTarget}: \emph{cleans} the object by removing non-matched \strong{target} elements. } diff --git a/tests/testthat/test_Matched.R b/tests/testthat/test_Matched.R index 8955b01..d2b2f83 100644 --- a/tests/testthat/test_Matched.R +++ b/tests/testthat/test_Matched.R @@ -1161,3 +1161,99 @@ test_that("queryIndex works", { expect_identical(queryIndex(a), c(1L, 1L, 2L)) expect_error(queryIndex(4), "'Matched'") }) + +test_that("SingleMatchParam works", { + res <- SingleMatchParam() + expect_s4_class(res, "SingleMatchParam") + expect_true(validObject(res)) + expect_error(SingleMatchParam(duplicates = "other"), "should be") + res <- SingleMatchParam("closest", column = "other", decreasing = FALSE) + expect_equal(res@duplicates, "closest") + expect_equal(res@column, "other") + expect_equal(res@decreasing, FALSE) +}) + +test_that("filterMatches,Matched,SingleMatchParam works", { + a <- Matched() + p <- SingleMatchParam(column = "ops") + expect_error(filterMatches(a, p), "not found") + + ## target is data.frame + a <- Matched( + query = q1, target = t1, + matches = data.frame(query_idx = c(1L, 2L, 2L, 2L, 5L), + target_idx = c(2L, 2L, 3L, 4L, 5L), + score = seq(0.5, 0.9, by = 0.1))) + p@duplicates <- "sum" + p@column <- "score" + expect_error(filterMatches(a, p), "has to be one") + p <- SingleMatchParam() + ## remove + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, c(2L, 5L)) + ## closest + p <- SingleMatchParam(duplicates = "closest") + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, c(2L, 2L, 5L)) + ## top_ranked + p <- SingleMatchParam(duplicates = "top_ranked", column = "target_col2", + decreasing = TRUE) + res <- filterMatches(a, p) + 
expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, c(2L, 4L, 5L)) + + ## target is vector + a <- Matched( + query = q1, target = c(100, 200, 300, 800), + matches = data.frame(query_idx = c(1L, 1L, 1L, 3L, 3L, 3L, 3L), + target_idx = c(3L, 4L, 1L, 2L, 3L, 1L, 4L), + score = c(1, 2, 4, 3, 1, 2, 7)) + ) + ## remove + p <- SingleMatchParam() + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, integer()) + ## closest + p <- SingleMatchParam(duplicates = "closest") + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, c(3L, 3L)) + ## top_ranked + p <- SingleMatchParam(duplicates = "top_ranked", column = "target", + decreasing = FALSE) + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$target_idx, c(1L, 1L)) + + ## target is SummarizedExperiment + rowData(q3)$new_col <- seq_len(nrow(q1)) + a <- Matched(query = q1, target = q3, + matches = data.frame( + query_idx = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 5L), + target_idx = c(2L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), + score = c(4, 1, 2, 3, 4, 3, 1, 9))) + ## remove + p <- SingleMatchParam() + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$query_idx, 5L) + expect_equal(res@matches$target_idx, 5L) + + ## closest + p <- SingleMatchParam(duplicates = "closest") + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$query_idx, c(1L, 2L, 3L, 5L)) + expect_equal(res@matches$target_idx, c(4L, 1L, 4L, 5L)) + + ## top_ranked + p <- SingleMatchParam(duplicates = "top_ranked", column = "target_new_col", + decreasing = TRUE) + res <- filterMatches(a, p) + expect_equal(anyDuplicated(res@matches$query_idx), 0L) + expect_equal(res@matches$query_idx, c(1L, 2L, 3L, 5L)) + 
expect_equal(res@matches$target_idx, c(5L, 2L, 4L, 5L)) +})