From 477706601ef5fed401e9d79c00241a16bdddfd2c Mon Sep 17 00:00:00 2001
From: jorainer <johannes.rainer@gmail.com>
Date: Fri, 22 Dec 2023 12:36:29 +0100
Subject: [PATCH] feat: add SingleMatchParam

- Add `SingleMatchParam` to allow easy reduction of matches between each query
  and target elements from 1:n to 1:(0,1).
- Add `queryVariables` and `targetVariables` functions to extract the
  corresponding variables/colnames.
---
 .github/workflows/check-bioc.yml |   3 +-
 NEWS.md                          |   2 +
 R/Matched.R                      | 125 +++++++++++++++++++++++--------
 man/Matched.Rd                   |  74 +++++++++++-------
 tests/testthat/test_Matched.R    |  96 ++++++++++++++++++++++++
 5 files changed, 240 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 71306d8..4241ecb 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -22,7 +22,8 @@
 
 on:
   push:
-  pull_request:
+    paths-ignore:
+      - 'README.md'
 
 name: R-CMD-check-bioc
 
diff --git a/NEWS.md b/NEWS.md
index ebf3659..6cfa62e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,8 @@
 
 ## Changes in 1.7.3
 
+- Add `SingleMatchParam` for `filterMatches` to allow selection of (at most) a
+  single match to a target element for each query element.
 - Add new methods `queryVariables` and `targetVariables` to extract the names
   of variables (columns) of *query* and *target*.
 
diff --git a/R/Matched.R b/R/Matched.R
index 1c31bf6..20f0387 100644
--- a/R/Matched.R
+++ b/R/Matched.R
@@ -23,7 +23,7 @@
 #' objects (including `SummarizedExperiment` or `QFeatures`). For [QFeatures()]
 #' objects matches to only one of the *assays* within the object is supported.
 #'
-#' @section Creation and subsetting:
+#' @section Creation and general handling:
 #'
 #' `Matched` object is returned as result from the [matchValues()] function.
 #'
@@ -32,11 +32,6 @@
 #' `data.frame` with two columns of integer indices defining which elements
 #' from *query* match which element from *target*.
 #'
-#' - `[`: subset the object selecting `query` object elements to keep with
-#'   parameter `i`. The resulting object will contain all the matches
-#'   for the selected query elements. The `target` object will by default be
-#'   returned as-is.
-#'
 #' - `addMatches`: add new matches to an existing object. Parameters
 #'   `queryValue` and `targetValue` allow to define which element(s) in
 #'   `query` and `target` should be considered matching. If `isIndex = TRUE`,
@@ -57,25 +52,23 @@
 #'   in a single `Matched` object representing updated matches. Note that `FUN`
 #'   has to return a `Matched` object.
 #'
+#' - `lapply`: applies a user defined function `FUN` to each subset of
+#'   matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
+#'   with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
+#'   elements where each element is the output of `FUN` applied to each subset
+#'   of matches.
+#'
+#'
+#' @section Filtering and subsetting:
+#'
+#' - `[`: subset the object selecting `query` object elements to keep with
+#'   parameter `i`. The resulting object will contain all the matches
+#'   for the selected query elements. The `target` object will by default be
+#'   returned as-is.
+#'
 #' - `filterMatches`: filter matches in a `Matched` object using different
 #'    approaches depending on the class of `param`:
 #'
-#'   - `SingleMatchParam`: reduces matches to keep only (at most) a
-#'     single match per query. The deduplication strategy can be defined with
-#'     parameter `duplicates`:
-#'     - `duplicates = "remove"`: all matches for query elements matching more
-#'       than one target element will be removed.
-#'     - `duplicates = "closest"`: keep only the *closest* match for each
-#'       query element. The closest match is defined by the value(s) of
-#'       *score* (and eventually *score_rt*, if present). The one match with
-#'       the smallest value for this (these) column(s) is retained. This is
-#'       equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`.
-#'     - `duplicates = "top_ranked"`: select the best ranking match for each
-#'       query element. Parameter `column` allows to specify the column by
-#'       which matches are ranked (use LLLLLL to list possible columns).
-#'       The column. Parameter `decreasing` allows
-#'       to define whether the match with the highest (`decreasing = TRUE`)
-#'       or lowest (`decreasing = FALSE`) value will be selected.
 #'   - `ScoreThresholdParam`: keeps only the matches whose score is strictly
 #'     above or strictly below a certain threshold (respectively when parameter
 #'     `above = TRUE` and `above = FALSE`). The name of the column containing
@@ -87,6 +80,7 @@
 #'     used to filter matches based on retention time scores for `Matched`
 #'     objects returned by [matchValues()] when `param` objects involving a
 #'     retention time comparison are used.
+#'
 #'   - `SelectMatchesParam`: keeps or removes (respectively when parameter
 #'     `keep = TRUE` and `keep = FALSE`) matches corresponding to certain
 #'     indices or values of `query` and `target`. If `queryValue` and
@@ -96,6 +90,25 @@
 #'     from the [matches()] matrix from the `Matched` object but thus not alter
 #'     the `query` or `target` in the object. See examples below for more
 #'     information.
+#'
+#'   - `SingleMatchParam`: reduces matches to keep only (at most) a
+#'     single match per query. The deduplication strategy can be defined with
+#'     parameter `duplicates`:
+#'     - `duplicates = "remove"`: all matches for query elements matching more
+#'       than one target element will be removed.
+#'     - `duplicates = "closest"`: keep only the *closest* match for each
+#'       query element. The closest match is defined by the value(s) of
+#'       *score* (and eventually *score_rt*, if present). The one match with
+#'       the smallest value for this (these) column(s) is retained. This is
+#'       equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`.
+#'     - `duplicates = "top_ranked"`: select the *best ranking* match for each
+#'       query element. Parameter `column` allows to specify the column by
+#'       which matches are ranked (use `targetVariables(object)` or
+#'       `scoreVariables(object)` to list possible columns). Parameter
+#'       `decreasing` allows to define whether the match with the highest
+#'       (`decreasing = TRUE`) or lowest (`decreasing = FALSE`) value in
+#'       `column` for each *query* will be selected.
+#'
 #'   - `TopRankedMatchesParam`: for each query element the matches are ranked
 #'     according to their score and only the `n` best of them are kept (if `n`
 #'     is larger than the number of matches for a given query element all the
@@ -114,16 +127,9 @@
 #'     small (or, depending on parameter `decreasing`, large) values for
 #'     `"score"` **and** `"score_rt"` are returned.
 #'
-#' - `lapply`: applies a user defined function `FUN` to each subset of
-#'   matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
-#'   with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
-#'   elements where each element is the output of `FUN` applied to each subset
-#'   of matches.
-#'
 #' - `pruneTarget`: *cleans* the object by removing non-matched
 #'   **target** elements.
 #'
-#'
 #' @section Extracting data:
 #'
 #' - `$` extracts a single variable from the `Matched` `x`. The variables that
@@ -202,7 +208,9 @@
 #'
 #' @param column for `ScoreThresholdParam`: `character(1)` specifying the name
 #'   of the score variable to consider for the filtering (the default is
-#'   `column = "score"`).
+#'   `column = "score"`). For `SingleMatchParam`: `character(1)` defining the
+#'   name of the column to be used for de-duplication. See description of
+#'   `SingleMatchParam` in the *Filtering and subsetting* section for details.
 #'
 #' @param columns for `matchedData`: `character` vector with column names of
 #'   variables that should be extracted.
@@ -213,6 +221,10 @@
 #'
 #' @param drop for `[`: ignored.
 #'
+#' @param duplicates for `SingleMatchParam`: `character(1)` defining the
+#'   *de-duplication* strategy. See the description of `SingleMatchParam` in
+#'   the *Filtering and subsetting* subsection for choices and details.
+#'
 #' @param FUN for `lapply` and `endoapply`: user defined `function` that takes a
 #'   `Matched` object as a first parameter and possibly additional parameters
 #'   (that need to be provided in the `lapply` or `endoapply` call. For lapply
@@ -1286,6 +1298,59 @@ SingleMatchParam <- function(duplicates = c("remove", "closest", "top_ranked"),
         decreasing = decreasing[1L])
 }
 
+#' @rdname Matched
+#'
+#' @export
+setMethod("filterMatches", c("Matched", "SingleMatchParam"),
+    function (object, param, ...) {
+        ## `column` is validated for every `duplicates` strategy.
+        if (!param@column %in% c(scoreVariables(object),
+                                 targetVariables(object)))
+            stop("Variable \"", param@column, "\" not found. `column` ",
+                 "should be one of 'scoreVariables(object)' or ",
+                 "'targetVariables(object)'.")
+        object@metadata <- c(object@metadata, param = param)
+        if (!nrow(object@matches)) return(object)
+        switch(param@duplicates[1L],
+            "remove" = {
+                ## Keep only rows of queries that have exactly one match.
+                s <- split(seq_len(nrow(object@matches)),
+                           object@matches$query_idx)
+                keep <- unlist(s[lengths(s) == 1L], use.names = FALSE)
+                object@matches <- object@matches[keep, , drop = FALSE]
+            },
+            "closest" = {
+                object <- filterMatches(
+                    object, TopRankedMatchesParam(n = 1L, decreasing = FALSE))
+            },
+            "top_ranked" = {
+                ## `vals` columns: row index, query index, ranking value.
+                if (param@column %in% scoreVariables(object))
+                    vals <- cbind(seq_len(nrow(object@matches)),
+                                  object@matches$query_idx,
+                                  object@matches[, param@column])
+                else
+                    vals <- cbind(
+                        seq_len(nrow(object@matches)),
+                        object@matches$query_idx,
+                        .extract_elements(
+                            .objectToMatch(object@target, object@targetAssay),
+                            object@matches$target_idx,
+                            sub("target_", "", param@column)))
+                ## Sort by ranking value, then keep the first row per query.
+                vals <- vals[order(vals[, 3L],
+                                   decreasing = param@decreasing), ,
+                             drop = FALSE]
+                keep <- vals[match(unique(object@matches$query_idx),
+                                   vals[, 2L]), 1L]
+                object@matches <- object@matches[keep, , drop = FALSE]
+            },
+            stop("'duplicates' has to be one of \"remove\", \"closest\"",
+                 " or \"top_ranked\"."))
+        validObject(object)
+        object
+    })
+
 #' @importFrom MsCoreUtils rbindFill
 .addMatches <- function(query, target, matches, queryValue = integer(),
                         targetValue = integer(), queryColname = character(),
diff --git a/man/Matched.Rd b/man/Matched.Rd
index 88a54d4..149c783 100644
--- a/man/Matched.Rd
+++ b/man/Matched.Rd
@@ -35,6 +35,7 @@
 \alias{filterMatches,Matched,TopRankedMatchesParam-method}
 \alias{filterMatches,Matched,ScoreThresholdParam-method}
 \alias{SingleMatchParam}
+\alias{filterMatches,Matched,SingleMatchParam-method}
 \alias{addMatches,Matched-method}
 \alias{endoapply,ANY-method}
 \alias{endoapply,Matched-method}
@@ -132,6 +133,8 @@ SingleMatchParam(
   decreasing = TRUE
 )
 
+\S4method{filterMatches}{Matched,SingleMatchParam}(object, param, ...)
+
 \S4method{addMatches}{Matched}(
   object,
   queryValue = integer(),
@@ -245,7 +248,13 @@ threshold.}
 
 \item{column}{for \code{ScoreThresholdParam}: \code{character(1)} specifying the name
 of the score variable to consider for the filtering (the default is
-\code{column = "score"}).}
+\code{column = "score"}). For \code{SingleMatchParam}: \code{character(1)} defining the
+name of the column to be used for de-duplication. See description of
+\code{SingleMatchParam} in the \emph{Filtering and subsetting} section for details.}
+
+\item{duplicates}{for \code{SingleMatchParam}: \code{character(1)} defining the
+\emph{de-duplication} strategy. See the description of \code{SingleMatchParam} in
+the \emph{Filtering and subsetting} subsection for choices and details.}
 
 \item{score}{for \code{addMatches}: \code{numeric} (same length than \code{queryValue}) or
 \code{data.frame} (same number of rows than \code{queryValue}) specifying the scores
@@ -275,7 +284,7 @@ between elements of one-dimensional objects, or rows for two-dimensional
 objects (including \code{SummarizedExperiment} or \code{QFeatures}). For \code{\link[=QFeatures]{QFeatures()}}
 objects matches to only one of the \emph{assays} within the object is supported.
 }
-\section{Creation and subsetting}{
+\section{Creation and general handling}{
 
 
 \code{Matched} object is returned as result from the \code{\link[=matchValues]{matchValues()}} function.
@@ -285,10 +294,6 @@ function providing the \code{query} and \code{target} objects as well as the \co
 \code{data.frame} with two columns of integer indices defining which elements
 from \emph{query} match which element from \emph{target}.
 \itemize{
-\item \code{[}: subset the object selecting \code{query} object elements to keep with
-parameter \code{i}. The resulting object will contain all the matches
-for the selected query elements. The \code{target} object will by default be
-returned as-is.
 \item \code{addMatches}: add new matches to an existing object. Parameters
 \code{queryValue} and \code{targetValue} allow to define which element(s) in
 \code{query} and \code{target} should be considered matching. If \code{isIndex = TRUE},
@@ -307,27 +312,24 @@ matches in a \code{Matched} object corresponding to a \code{query} element (i.e.
 each \code{x[i]} with \code{i} being 1 to \code{length(x)}). The results are then combined
 in a single \code{Matched} object representing updated matches. Note that \code{FUN}
 has to return a \code{Matched} object.
+\item \code{lapply}: applies a user defined function \code{FUN} to each subset of
+matches in a \code{Matched} object for each \code{query} element (i.e. to each \code{x[i]}
+with \code{i} from \code{1} to \code{length(x)}). It returns a \code{list} of \code{length(object)}
+elements where each element is the output of \code{FUN} applied to each subset
+of matches.
+}
+}
+
+\section{Filtering and subsetting}{
+
+\itemize{
+\item \code{[}: subset the object selecting \code{query} object elements to keep with
+parameter \code{i}. The resulting object will contain all the matches
+for the selected query elements. The \code{target} object will by default be
+returned as-is.
 \item \code{filterMatches}: filter matches in a \code{Matched} object using different
 approaches depending on the class of \code{param}:
 \itemize{
-\item \code{SingleMatchParam}: reduces matches to keep only (at most) a
-single match per query. The deduplication strategy can be defined with
-parameter \code{duplicates}:
-\itemize{
-\item \code{duplicates = "remove"}: all matches for query elements matching more
-than one target element will be removed.
-\item \code{duplicates = "closest"}: keep only the \emph{closest} match for each
-query element. The closest match is defined by the value(s) of
-\emph{score} (and eventually \emph{score_rt}, if present). The one match with
-the smallest value for this (these) column(s) is retained. This is
-equivalent to \code{TopRankedMatchesParam(n = 1L, decreasing = FALSE)}.
-\item \code{duplicates = "top_ranked"}: select the best ranking match for each
-query element. Parameter \code{column} allows to specify the column by
-which matches are ranked (use LLLLLL to list possible columns).
-The column. Parameter \code{decreasing} allows
-to define whether the match with the highest (\code{decreasing = TRUE})
-or lowest (\code{decreasing = FALSE}) value will be selected.
-}
 \item \code{ScoreThresholdParam}: keeps only the matches whose score is strictly
 above or strictly below a certain threshold (respectively when parameter
 \code{above = TRUE} and \code{above = FALSE}). The name of the column containing
@@ -345,6 +347,25 @@ indices or values of \code{query} and \code{target}. If \code{queryValue} and
 \code{targetValue} are provided, matches for these value pairs are kept or
 removed. Parameter index\verb{allows to filter matches providing their index in the [matches()] matrix. Note that}filterMatches\verb{removes only matches from the [matches()] matrix from the}Matched\verb{object but thus not alter the}query\code{or}target` in the object. See examples below for more
 information.
+\item \code{SingleMatchParam}: reduces matches to keep only (at most) a
+single match per query. The deduplication strategy can be defined with
+parameter \code{duplicates}:
+\itemize{
+\item \code{duplicates = "remove"}: all matches for query elements matching more
+than one target element will be removed.
+\item \code{duplicates = "closest"}: keep only the \emph{closest} match for each
+query element. The closest match is defined by the value(s) of
+\emph{score} (and eventually \emph{score_rt}, if present). The one match with
+the smallest value for this (these) column(s) is retained. This is
+equivalent to \code{TopRankedMatchesParam(n = 1L, decreasing = FALSE)}.
+\item \code{duplicates = "top_ranked"}: select the \emph{best ranking} match for each
+query element. Parameter \code{column} allows to specify the column by
+which matches are ranked (use \code{targetVariables(object)} or
+\code{scoreVariables(object)} to list possible columns). Parameter
+\code{decreasing} allows to define whether the match with the highest
+(\code{decreasing = TRUE}) or lowest (\code{decreasing = FALSE}) value in
+\code{column} for each \emph{query} will be selected.
+}
 \item \code{TopRankedMatchesParam}: for each query element the matches are ranked
 according to their score and only the \code{n} best of them are kept (if \code{n}
 is larger than the number of matches for a given query element all the
@@ -363,11 +384,6 @@ is performed on the absolute value of \code{"score_rt"}). Thus, matches with
 small (or, depending on parameter \code{decreasing}, large) values for
 \code{"score"} \strong{and} \code{"score_rt"} are returned.
 }
-\item \code{lapply}: applies a user defined function \code{FUN} to each subset of
-matches in a \code{Matched} object for each \code{query} element (i.e. to each \code{x[i]}
-with \code{i} from \code{1} to \code{length(x)}). It returns a \code{list} of \code{length(object)}
-elements where each element is the output of \code{FUN} applied to each subset
-of matches.
 \item \code{pruneTarget}: \emph{cleans} the object by removing non-matched
 \strong{target} elements.
 }
diff --git a/tests/testthat/test_Matched.R b/tests/testthat/test_Matched.R
index 8955b01..d2b2f83 100644
--- a/tests/testthat/test_Matched.R
+++ b/tests/testthat/test_Matched.R
@@ -1161,3 +1161,99 @@ test_that("queryIndex works", {
     expect_identical(queryIndex(a), c(1L, 1L, 2L))
     expect_error(queryIndex(4), "'Matched'")
 })
+
+test_that("SingleMatchParam works", {
+    res <- SingleMatchParam() # default arguments
+    expect_s4_class(res, "SingleMatchParam")
+    expect_true(validObject(res))
+    expect_error(SingleMatchParam(duplicates = "other"), "should be")
+    res <- SingleMatchParam("closest", column = "other", decreasing = FALSE)
+    expect_equal(res@duplicates, "closest")
+    expect_equal(res@column, "other")
+    expect_equal(res@decreasing, FALSE)
+})
+
+test_that("filterMatches,Matched,SingleMatchParam works", {
+    a <- Matched()
+    p <- SingleMatchParam(column = "ops")
+    expect_error(filterMatches(a, p), "not found")
+
+    ## target is data.frame
+    a <- Matched(
+        query = q1, target = t1,
+        matches = data.frame(query_idx = c(1L, 2L, 2L, 2L, 5L),
+                             target_idx = c(2L, 2L, 3L, 4L, 5L),
+                             score = seq(0.5, 0.9, by = 0.1)))
+    p@duplicates <- "sum" # invalid value set directly on the slot
+    p@column <- "score"
+    expect_error(filterMatches(a, p), "has to be one")
+    p <- SingleMatchParam()
+    ## remove: only queries 1 and 5 have exactly one match
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, c(2L, 5L))
+    ## closest: lowest score per query; query 2 keeps target 2 (0.6)
+    p <- SingleMatchParam(duplicates = "closest")
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, c(2L, 2L, 5L))
+    ## top_ranked: rank by target variable "target_col2", highest first
+    p <- SingleMatchParam(duplicates = "top_ranked", column = "target_col2",
+                          decreasing = TRUE)
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, c(2L, 4L, 5L))
+
+    ## target is vector
+    a <- Matched(
+        query = q1, target = c(100, 200, 300, 800),
+        matches = data.frame(query_idx = c(1L, 1L, 1L, 3L, 3L, 3L, 3L),
+                             target_idx = c(3L, 4L, 1L, 2L, 3L, 1L, 4L),
+                             score = c(1, 2, 4, 3, 1, 2, 7))
+    )
+    ## remove: every query has several matches, so nothing is kept
+    p <- SingleMatchParam()
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, integer())
+    ## closest: score 1 is the minimum for both queries (target 3)
+    p <- SingleMatchParam(duplicates = "closest")
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, c(3L, 3L))
+    ## top_ranked: lowest target value (100, index 1) wins per query
+    p <- SingleMatchParam(duplicates = "top_ranked", column = "target",
+                          decreasing = FALSE)
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$target_idx, c(1L, 1L))
+
+    ## target is SummarizedExperiment
+    rowData(q3)$new_col <- seq_len(nrow(q1))
+    a <- Matched(query = q1, target = q3,
+                 matches = data.frame(
+                     query_idx = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 5L),
+                     target_idx = c(2L, 4L, 5L, 1L, 2L, 3L, 4L, 5L),
+                     score = c(4, 1, 2, 3, 4, 3, 1, 9)))
+    ## remove: only query 5 has a single match
+    p <- SingleMatchParam()
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$query_idx, 5L)
+    expect_equal(res@matches$target_idx, 5L)
+
+    ## closest: lowest score per query
+    p <- SingleMatchParam(duplicates = "closest")
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$query_idx, c(1L, 2L, 3L, 5L))
+    expect_equal(res@matches$target_idx, c(4L, 1L, 4L, 5L))
+
+    ## top_ranked: new_col is seq_len(), so the largest target_idx wins
+    p <- SingleMatchParam(duplicates = "top_ranked", column = "target_new_col",
+                          decreasing = TRUE)
+    res <- filterMatches(a, p)
+    expect_equal(anyDuplicated(res@matches$query_idx), 0L)
+    expect_equal(res@matches$query_idx, c(1L, 2L, 3L, 5L))
+    expect_equal(res@matches$target_idx, c(5L, 2L, 4L, 5L))
+})