MannLabs · vbrennsteiner · Feb 17, 2026 · Feb 17, 2026 · Feb 17, 2026 · Feb 17, 2026
diff --git a/docs/api.md b/docs/api.md
@@ -35,6 +35,7 @@
 
     tl.get_id2gene_map
     tl.map_genes_to_protein_groups
+    tl.find_protease_cut_sites
     tl.nan_safe_bh_correction
     tl.nan_safe_ttest_ind
     tl.diff_exp_ttest

diff --git a/src/alphapepttools/tl/__init__.py b/src/alphapepttools/tl/__init__.py
@@ -4,4 +4,4 @@
 from .diff_exp.ttest import diff_exp_ttest, nan_safe_ttest_ind
 from .embeddings import bpca, pca
 from .stats import nan_safe_bh_correction
-from .tools import get_id2gene_map, map_genes_to_protein_groups
+from .tools import find_protease_cut_sites, get_id2gene_map, map_genes_to_protein_groups
diff --git a/src/alphapepttools/tl/tools.py b/src/alphapepttools/tl/tools.py
@@ -5,7 +5,9 @@
 from io import StringIO
 from pathlib import Path
 
+import anndata as ad
 import numpy as np
+import pandas as pd
 import regex as re
 from Bio import SeqIO
 
@@ -158,3 +160,42 @@ def map_genes_to_protein_groups(
         out_gene_names.append(";".join(gene_names))
 
     return out_gene_names
+
+
+def find_protease_cut_sites(
+    adata: ad.AnnData,
+    sequence_column: str = "sequence",
+    cleavage_pattern: str = r"(?<!P)[KR](?!P)",
+) -> pd.Series:
+    """Find internal protease cut sites in peptide sequences to detect miscleavages
+
+    The cleavage pattern can be defined as a regex pattern, and looks for tryptic
+    cleavage sites by default (K or R not followed by P). The function counts only
+    internal cleavage sites by excluding the C-terminal residue from the search.
+
+    Parameters
+    ----------
+    adata
+        AnnData object containing peptide-level data. Must have `sequence_column` in `adata.var`.
+    sequence_column
+        Column name in `adata.var` containing peptide sequences. Defaults to "sequence".
+    cleavage_pattern
+        Regular expression pattern defining the protease cleavage sites. Defaults to r"(?<!P)[KR](?!P)" for trypsin.
+
+    Returns
+    -------
+    pd.Series
+        Series containing the count of internal cleavage sites for each peptide sequence.
+        A value of 0 indicates no miscleavages (fully tryptic), values ≥1 indicate the number of missed cleavages.
+
+    """
+    if sequence_column not in adata.var.columns:
+        raise ValueError(f"{sequence_column} column not found in adata.var.columns, is this a precursor table?")
+
+    # replace empty sequences with a placeholder to avoid regex errors
+    valid_sequences = adata.var[sequence_column].apply(lambda x: x + "_" if len(x) == 0 else x)
+
+    # remove C-terminal residue to focus on internal cleavage sites
+    valid_sequences_no_c_term = valid_sequences.apply(lambda x: x[:-1])
+
+    return valid_sequences_no_c_term.apply(lambda x: len(re.findall(cleavage_pattern, x)))
diff --git a/tests/tl/test_tools.py b/tests/tl/test_tools.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import pytest
 
-from alphapepttools.tl.tools import get_id2gene_map, map_genes_to_protein_groups
+from alphapepttools.tl.tools import find_protease_cut_sites, get_id2gene_map, map_genes_to_protein_groups
 from alphapepttools.tl.utils import drop_features_with_too_few_valid_values
 
 DUMMY_FASTA = """>tr|ID0|ID0_HUMAN Protein1 OS=Homo sapiens OX=9606 GN=GN0 PE=1 SV=1
@@ -134,3 +134,34 @@ def test_drop_features_with_too_few_valid_values(example_adata):
 
     expected_columns = ["A", "C"]
     assert list(filtered_adata.var_names) == expected_columns
+
+
+# Test find_protease_cut_sites function
+@pytest.mark.parametrize(
+    ("sequences", "expected_counts"),
+    [
+        # Fully tryptic peptides (0 miscleavages)
+        (["AAAAAA", "AAAAAAK", "AAAAAAAR"], [0, 0, 0]),
+        # One miscleavage
+        (["AAAAAKAAAAAR", "AAAAARAAAAAAK"], [1, 1]),
+        # Two miscleavages
+        (["AAAAAKAAAAAKAAAAAAR", "AAAAARAAAAARAAAAAAK"], [2, 2]),
+        # Proline rule: KP and RP should NOT be counted as cleavage sites
+        (["AAAAKPAAAAAR", "AAAARPAAAAAAK"], [0, 0]),
+        # Mixed cases
+        (["AAAAAA", "AAAAAAK", "AAAAAKAAAAAR", "AAAAAKAAAAAKAAAAAAR"], [0, 0, 1, 2]),
+        # Edge case: empty sequence
+        ([""], [0]),
+        # Edge case: single residue
+        (["K"], [0]),
+    ],
+)
+def test_find_protease_cut_sites(sequences, expected_counts):
+    adata = ad.AnnData(
+        np.zeros((1, len(sequences))),
+        var=pd.DataFrame({"sequence": sequences}),
+    )
+
+    counts = find_protease_cut_sites(adata, sequence_column="sequence")
+
+    assert list(counts) == expected_counts