rformassspectrometry · sgibb · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024 · Dec 26, 2024
diff --git a/R/fragments-calculate.R b/R/fragments-calculate.R
@@ -24,7 +24,8 @@
 #' @return The methods with `oject = "missing"` returns a
 #'     `data.frame`.
 #'
-#' @param sequence character() providing a peptide sequence.
+#' @param sequence character() providing a peptide sequence. ProForma delta
+#' masses are supported (e.g. `"EM[+15.9949]EVEES[-79.9663]PEK"`).
 #'
 #' @param type `character` vector of target ions; possible values:
 #'     `c("a", "b", "c", "x", "y", "z")`. Default is `type = c("b",
@@ -63,6 +64,9 @@
 #'
 #' @exportMethod calculateFragments
 #'
+#' @references
+#' [HUPO-PSI ProForma specification](http://www.psidev.info/proforma)
+#'
 #' @examples
 #'
 #' ## calculate fragments for ACE with default modification
@@ -93,6 +97,9 @@
 #'
 #' ## disable neutral loss completely
 #' calculateFragments("PQR", neutralLoss=NULL)
+#'
+#' ## ProForma encoded delta masses
+#' calculateFragments("EM[+15.9949]EVEES[-79.9663]PEK")
 setMethod("calculateFragments", c("character", "missing"),
           function(sequence, type = c("b", "y"), z = 1,
                    modifications = c(C = 57.02146),
@@ -172,14 +179,17 @@ setMethod("calculateFragments", c("character", "missing"),
         message("Modifications used: ", mods)
     }
 
+    clean_sequence <- .proforma_clean_sequences(sequence)[[1]]
+    delta_mass <- .proforma_delta_masses(sequence)[[1]]
+
     ## split peptide sequence into aa
-    fragment.seq <- strsplit(sequence, "")[[1]]
+    fragment.seq <- strsplit(clean_sequence, "")[[1]]
     fn <- length(fragment.seq)
 
     ## calculate cumulative mass starting at the amino-terminus (for a, b, c)
-    amz <- cumsum(aamass[fragment.seq[-fn]])
+    amz <- cumsum(aamass[fragment.seq[-fn]] + delta_mass[-fn])
     ## calculate cumulative mass starting at the carboxyl-terminus (for x, y, z)
-    cmz <- cumsum(aamass[rev(fragment.seq[-1L])])
+    cmz <- cumsum(aamass[rev(fragment.seq[-1L])] + rev(delta_mass[-1L]))
 
     ## calculate fragment mass (amino-terminus)
     tn <- length(amz)
@@ -202,11 +212,11 @@ setMethod("calculateFragments", c("character", "missing"),
     cmz <- cmz + mass["p"]
 
     ## fragment seq (amino-terminus)
-    aseq <- rep(rep(substring(sequence, rep(1L, fn - 1L),
+    aseq <- rep(rep(substring(clean_sequence, rep(1L, fn - 1L),
                               1L:(fn - 1L)), each = zn), nat)
 
     ## fragment seq (carboxyl-terminus)
-    cseq <- rep(rep(rev(substring(sequence, 2L:fn,
+    cseq <- rep(rep(rev(substring(clean_sequence, 2L:fn,
                                   rep(fn, fn - 1L))), each=zn), nct)
 
     ## fragment str (amino-terminus)

diff --git a/R/proforma-parser.R b/R/proforma-parser.R
@@ -0,0 +1,60 @@
+#' ProForma parser
+#'
+#' This helper functions provide a basic implementation of the PSI ProForma
+#' notation.
+#'
+#' @author Sebastian Gibb <mail@sebastiangibb.de>
+#'
+#' @references http://www.psidev.info/proforma
+#' @name proforma-parser
+
+#' Remove all modifications
+#'
+#' @name proforma-parser
+#' @param x `character`, ProForma sequence.
+#' @return `character`, a `character` cleaned of all modifications.
+#' @noRd
+#' @examples
+#' .proforma_clean_sequences(
+#'    c("EM[+15.9949]EVEES[+79.9663]PEK",
+#'      "EM[+15.995]EVEES[-18.01]PEK")
+#' )
+.proforma_clean_sequences <- function(x) {
+    gsub(pattern = "\\[[^]]*\\]|<[^>]*>", "", x)
+}
+
+#' Extract delta masses
+#'
+#' @name proforma-parser
+#' @param x `character`, ProForma sequence.
+#' @return `list`, a `list` of `doubles` representing the delta masses for each
+#' sequence.
+#' @noRd
+#' @examples
+#' .proforma_delta_masses(
+#'    c("EM[+15.9949]EVEES[+79.9663]PEK",
+#'      "EM[+15.995]EVEES[-18.01]PEK")
+#' )
+.proforma_delta_masses <- function(x) {
+    rx <- gregexpr(
+        pattern =
+            "(?<AA>[A-Z])(?:\\[[GMURX]?:?)?(?<DeltaMass>[+-][0-9.]+)?(?:\\])?",
+        text = x,
+        perl = TRUE
+    )
+
+    mapply(function(sequence, start, matched_length) {
+        mod <- as.double(
+            substring(sequence, start, start + matched_length - 1L)
+        )
+        mod[is.na(mod)] <- 0
+        mod
+    },
+        sequence = x,
+        start =
+            lapply(rx, function(r)attr(r, "capture.start")[, "DeltaMass"]),
+        matched_length =
+            lapply(rx, function(r)attr(r, "capture.length")[, "DeltaMass"]),
+        SIMPLIFY = FALSE, USE.NAMES = FALSE
+    )
+}
diff --git a/man/calculateFragments.Rd b/man/calculateFragments.Rd
diff --git a/tests/testthat/test_proforma-parser.R b/tests/testthat/test_proforma-parser.R
@@ -0,0 +1,42 @@
+test_that("proforma clean sequences works", {
+    expect_identical(
+        .proforma_clean_sequences(
+            c("EM[+15.9949]EVEES[+79.9663]PEK", "EM[+15.995]EVEES[-18.01]PEK")
+        ),
+        c("EMEVEESPEK", "EMEVEESPEK")
+    )
+})
+
+test_that("proforma delta masses without prefixes are supported", {
+    expect_equal(
+        .proforma_delta_masses(
+            c("EMEVEESPEK", "EM[+15.995]EVEESPEK")
+        ),
+        list(
+            rep(0, 10), 
+            c(0, 15.995, 0, 0, 0, 0, 0, 0, 0, 0)
+        )
+    )
+    expect_equal(
+        .proforma_delta_masses(
+            c("EM[+15.9949]EVEES[+79.9663]PEK", "EM[+15.995]EVEES[-18.01]PEK")
+        ),
+        list(
+            c(0, 15.9949, 0, 0, 0, 0, 79.9663, 0, 0, 0),
+            c(0, 15.995, 0, 0, 0, 0, -18.01, 0, 0, 0)
+        )
+    )
+})
+
+test_that("proforma delta masses with prefixes are supported", {
+    expect_equal(
+        .proforma_delta_masses(
+            c("EM[U:+15.9949]EVEES[M:+79.9663]PEK",
+              "EM[X:+15.995]EVEES[R:-18.01]PEK")
+        ),
+        list(
+            c(0, 15.9949, 0, 0, 0, 0, 79.9663, 0, 0, 0),
+            c(0, 15.995, 0, 0, 0, 0, -18.01, 0, 0, 0)
+        )
+    )
+})