Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

tl.get_id2gene_map
tl.map_genes_to_protein_groups
tl.find_protease_cut_sites
tl.nan_safe_bh_correction
tl.nan_safe_ttest_ind
tl.diff_exp_ttest
Expand Down
2 changes: 1 addition & 1 deletion src/alphapepttools/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .diff_exp.ttest import diff_exp_ttest, nan_safe_ttest_ind
from .embeddings import bpca, pca
from .stats import nan_safe_bh_correction
from .tools import get_id2gene_map, map_genes_to_protein_groups
from .tools import find_protease_cut_sites, get_id2gene_map, map_genes_to_protein_groups
41 changes: 41 additions & 0 deletions src/alphapepttools/tl/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from io import StringIO
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd
import regex as re
from Bio import SeqIO

Expand Down Expand Up @@ -158,3 +160,42 @@ def map_genes_to_protein_groups(
out_gene_names.append(";".join(gene_names))

return out_gene_names


def find_protease_cut_sites(
adata: ad.AnnData,
sequence_column: str = "sequence",
cleavage_pattern: str = r"(?<!P)[KR](?!P)",
) -> pd.Series:
"""Find internal protease cut sites in peptide sequences to detect miscleavages

The cleavage pattern can be defined as a regex pattern, and looks for tryptic
cleavage sites by default (K or R not followed by P). The function counts only
internal cleavage sites by excluding the C-terminal residue from the search.

Parameters
----------
adata
AnnData object containing peptide-level data. Must have `sequence_column` in `adata.var`.
sequence_column
Column name in `adata.var` containing peptide sequences. Defaults to "sequence".
cleavage_pattern
Regular expression pattern defining the protease cleavage sites. Defaults to r"(?<!P)[KR](?!P)" for trypsin.

Returns
-------
pd.Series
Series containing the count of internal cleavage sites for each peptide sequence.
A value of 0 indicates no miscleavages (fully tryptic), values ≥1 indicate the number of missed cleavages.

"""
if sequence_column not in adata.var.columns:
raise ValueError(f"{sequence_column} column not found in adata.var.columns, is this a precursor table?")

# replace empty sequences with a placeholder to avoid regex errors
valid_sequences = adata.var[sequence_column].apply(lambda x: x + "_" if len(x) == 0 else x)

# remove C-terminal residue to focus on internal cleavage sites
valid_sequences_no_c_term = valid_sequences.apply(lambda x: x[:-1])

return valid_sequences_no_c_term.apply(lambda x: len(re.findall(cleavage_pattern, x)))
33 changes: 32 additions & 1 deletion tests/tl/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
import pytest

from alphapepttools.tl.tools import get_id2gene_map, map_genes_to_protein_groups
from alphapepttools.tl.tools import find_protease_cut_sites, get_id2gene_map, map_genes_to_protein_groups
from alphapepttools.tl.utils import drop_features_with_too_few_valid_values

DUMMY_FASTA = """>tr|ID0|ID0_HUMAN Protein1 OS=Homo sapiens OX=9606 GN=GN0 PE=1 SV=1
Expand Down Expand Up @@ -134,3 +134,34 @@ def test_drop_features_with_too_few_valid_values(example_adata):

expected_columns = ["A", "C"]
assert list(filtered_adata.var_names) == expected_columns


# Test find_protease_cut_sites function
@pytest.mark.parametrize(
("sequences", "expected_counts"),
[
# Fully tryptic peptides (0 miscleavages)
(["AAAAAA", "AAAAAAK", "AAAAAAAR"], [0, 0, 0]),
# One miscleavage
(["AAAAAKAAAAAR", "AAAAARAAAAAAK"], [1, 1]),
# Two miscleavages
(["AAAAAKAAAAAKAAAAAAR", "AAAAARAAAAARAAAAAAK"], [2, 2]),
# Proline rule: KP and RP should NOT be counted as cleavage sites
(["AAAAKPAAAAAR", "AAAARPAAAAAAK"], [0, 0]),
# Mixed cases
(["AAAAAA", "AAAAAAK", "AAAAAKAAAAAR", "AAAAAKAAAAAKAAAAAAR"], [0, 0, 1, 2]),
# Edge case: empty sequence
([""], [0]),
# Edge case: single residue
(["K"], [0]),
],
)
def test_find_protease_cut_sites(sequences, expected_counts):
adata = ad.AnnData(
np.zeros((1, len(sequences))),
var=pd.DataFrame({"sequence": sequences}),
)

counts = find_protease_cut_sites(adata, sequence_column="sequence")

assert list(counts) == expected_counts
Loading