diff --git a/nbs_tests/hla/hla_class1.ipynb b/nbs_tests/hla/hla_class1.ipynb
new file mode 100644
index 00000000..d0fa0eb3
--- /dev/null
+++ b/nbs_tests/hla/hla_class1.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q pydivsufsort"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n",
+ "\n",
+ "model = HLA1_Binding_Classifier()\n",
+ "model.load_pretrained_hla_model()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prot1 = 'MABCDEKFGHIJKLMNOPQRST'\n",
+ "prot2 = 'FGHIJKLMNOPQR'\n",
+ "protein_dict = {\n",
+ " 'xx': {\n",
+ " 'protein_id': 'xx',\n",
+ " 'gene_name': '',\n",
+ " 'sequence': prot1\n",
+ " },\n",
+ " 'yy': {\n",
+ " 'protein_id': 'yy',\n",
+ " 'gene_name': 'gene',\n",
+ " 'sequence': prot2\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 1/1 [00:00<00:00, 14.32it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_pos | \n",
+ " end_pos | \n",
+ " nAA | \n",
+ " HLA_prob_pred | \n",
+ " sequence | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 9 | \n",
+ " 8 | \n",
+ " 0.124847 | \n",
+ " MABCDEKF | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14 | \n",
+ " 22 | \n",
+ " 8 | \n",
+ " 0.040122 | \n",
+ " LMNOPQRS | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13 | \n",
+ " 21 | \n",
+ " 8 | \n",
+ " 0.674667 | \n",
+ " KLMNOPQR | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 12 | \n",
+ " 20 | \n",
+ " 8 | \n",
+ " 0.119722 | \n",
+ " JKLMNOPQ | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11 | \n",
+ " 19 | \n",
+ " 8 | \n",
+ " 0.104152 | \n",
+ " IJKLMNOP | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " 5 | \n",
+ " 19 | \n",
+ " 14 | \n",
+ " 0.163758 | \n",
+ " DEKFGHIJKLMNOP | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " 4 | \n",
+ " 18 | \n",
+ " 14 | \n",
+ " 0.000618 | \n",
+ " CDEKFGHIJKLMNO | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " 3 | \n",
+ " 17 | \n",
+ " 14 | \n",
+ " 0.000773 | \n",
+ " BCDEKFGHIJKLMN | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " 9 | \n",
+ " 23 | \n",
+ " 14 | \n",
+ " 0.525840 | \n",
+ " GHIJKLMNOPQRST | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " 6 | \n",
+ " 20 | \n",
+ " 14 | \n",
+ " 0.156962 | \n",
+ " EKFGHIJKLMNOPQ | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
84 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_pos end_pos nAA HLA_prob_pred sequence\n",
+ "0 1 9 8 0.124847 MABCDEKF\n",
+ "1 14 22 8 0.040122 LMNOPQRS\n",
+ "2 13 21 8 0.674667 KLMNOPQR\n",
+ "3 12 20 8 0.119722 JKLMNOPQ\n",
+ "4 11 19 8 0.104152 IJKLMNOP\n",
+ ".. ... ... ... ... ...\n",
+ "79 5 19 14 0.163758 DEKFGHIJKLMNOP\n",
+ "80 4 18 14 0.000618 CDEKFGHIJKLMNO\n",
+ "81 3 17 14 0.000773 BCDEKFGHIJKLMN\n",
+ "82 9 23 14 0.525840 GHIJKLMNOPQRST\n",
+ "83 6 20 14 0.156962 EKFGHIJKLMNOPQ\n",
+ "\n",
+ "[84 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.predict_from_proteins(protein_data=protein_dict, prob_threshold=0.0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sequence | \n",
+ " nAA | \n",
+ " HLA_prob_pred | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " MABCDEKF | \n",
+ " 8 | \n",
+ " 0.124847 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " KLMNOPQR | \n",
+ " 8 | \n",
+ " 0.674667 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DEKFGHIJKLMNOP | \n",
+ " 14 | \n",
+ " 0.163758 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sequence nAA HLA_prob_pred\n",
+ "0 MABCDEKF 8 0.124847\n",
+ "1 KLMNOPQR 8 0.674667\n",
+ "2 DEKFGHIJKLMNOP 14 0.163758"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "peptide_df = pd.DataFrame({\n",
+ " \"sequence\": [\"MABCDEKF\",\"KLMNOPQR\",\"DEKFGHIJKLMNOP\"]\n",
+ "})\n",
+ "model.predict_peptide_df_(peptide_df=peptide_df)\n",
+ "peptide_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/peptdeep/constants/default_settings.yaml b/peptdeep/constants/default_settings.yaml
index 29ddb833..d01acbc2 100644
--- a/peptdeep/constants/default_settings.yaml
+++ b/peptdeep/constants/default_settings.yaml
@@ -10,9 +10,11 @@ model:
PEPTDEEP_HOME: "~/peptdeep" # ~ refers to user folder (e.g. C:/Users/username)
local_model_zip_name: "pretrained_models.zip"
+local_hla_model_zip_name: "hla_model.zip"
# overwritable config
model_url: "https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/pretrained_models.zip"
+hla_model_url: "https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/hla_model.zip"
task_workflow: [library]
task_choices:
diff --git a/peptdeep/hla/__init__.py b/peptdeep/hla/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py
new file mode 100644
index 00000000..3a9f5789
--- /dev/null
+++ b/peptdeep/hla/hla_class1.py
@@ -0,0 +1,392 @@
+import os
+import torch
+import pandas as pd
+import tqdm
+
+from typing import Union
+
+import peptdeep.model.building_block as building_block
+from peptdeep.model.model_interface import ModelInterface
+from peptdeep.model.featurize import get_ascii_indices
+from peptdeep.pretrained_models import pretrain_dir, download_models, global_settings
+
+from .hla_utils import (
+ get_random_sequences,
+ load_prot_df,
+ cat_proteins,
+ get_seq_series,
+ nonspecific_digest_cat_proteins,
+)
+
+
+class HLA_Class_I_LSTM(torch.nn.Module):
+ """
+ HLA-I-binding peptide prediction model using LSTM.
+ """
+
+ def __init__(
+ self,
+ *,
+ hidden_dim=256,
+ input_dim=128,
+ n_lstm_layers=4,
+ dropout=0.1,
+ ):
+ """
+ Parameters
+ ----------
+ hidden_dim : int, optional
+ hidden dimension, by default 256
+ input_dim : int, optional
+ input dimension, by default 128 (ASCII)
+ n_lstm_layers : int, optional
+ number of LSTM layers, by default 4
+ dropout : float, optional
+ dropout rate, by default 0.1
+ """
+ super().__init__()
+ self.dropout = torch.nn.Dropout(dropout)
+
+ self.nn = torch.nn.Sequential(
+ torch.nn.Embedding(input_dim, hidden_dim // 4),
+ building_block.SeqCNN(hidden_dim // 4),
+ self.dropout,
+ building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=n_lstm_layers),
+ building_block.SeqAttentionSum(hidden_dim),
+ self.dropout,
+ torch.nn.Linear(hidden_dim, 64),
+ torch.nn.GELU(),
+ torch.nn.Linear(64, 1),
+ torch.nn.Sigmoid(),
+ )
+
+ def forward(self, x):
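+        # x: (batch, seq_len) LongTensor of ASCII codes;
+        # returns (batch,) binding probabilities in [0, 1]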
+ return self.nn(x).squeeze(-1)
+
+
+class HLA_Class_I_Bert(torch.nn.Module):
+ """
+    Model based on a transformer architecture using
+    Hugging Face's BertEncoder class.
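+
+    Parameters
+    ----------
+    nlayers : int, optional
+        number of transformer layers, by default 4
+    input_dim : int, optional
+        input (embedding vocabulary) dimension, by default 128 (ASCII)
+    hidden_dim : int, optional
+        hidden dimension, by default 256
+    output_attentions : bool, optional
+        whether to keep the attention matrices, by default False
+    dropout : float, optional
+        dropout rate, by default 0.1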
+ """
+
+ def __init__(
+ self,
+ nlayers=4,
+ input_dim=128,
+ hidden_dim=256,
+ output_attentions=False,
+ dropout=0.1,
+ **kwargs,
+ ):
+ super().__init__()
+
+ self.dropout = torch.nn.Dropout(dropout)
+
+ self.input_nn = torch.nn.Sequential(
+ torch.nn.Embedding(input_dim, hidden_dim),
+ building_block.PositionalEncoding(hidden_dim),
+ )
+
+ self._output_attentions = output_attentions
+
+ self.hidden_nn = building_block.Hidden_HFace_Transformer(
+ hidden_dim,
+ nlayers=nlayers,
+ dropout=dropout,
+ output_attentions=output_attentions,
+ )
+
+ self.output_nn = torch.nn.Sequential(
+ building_block.SeqAttentionSum(hidden_dim),
+ torch.nn.PReLU(),
+ self.dropout,
+ torch.nn.Linear(hidden_dim, 1),
+ torch.nn.Sigmoid(),
+ )
+
+ @property
+ def output_attentions(self):
+ return self._output_attentions
+
+ @output_attentions.setter
+ def output_attentions(self, val: bool):
+ self._output_attentions = val
+ self.hidden_nn.output_attentions = val
+
+ def forward(self, x):
+ x = self.dropout(self.input_nn(x))
+
+ x = self.hidden_nn(x)
+ if self.output_attentions:
+ self.attentions = x[1]
+ else:
+ self.attentions = None
+ x = self.dropout(x[0])
+
+ return self.output_nn(x).squeeze(1)
+
+
+class HLA1_Binding_Classifier(ModelInterface):
+ """
+ Class to predict HLA-binding probabilities of peptides.
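+
+    Example
+    -------
+    A minimal sketch of the intended workflow (mirroring the test notebook
+    above); `protein_data` may be a fasta path, a list of fasta paths, a
+    protein dict, or a protein dataframe:
+
+    >>> model = HLA1_Binding_Classifier()
+    >>> model.load_pretrained_hla_model()
+    >>> peptide_df = model.predict_from_proteins(protein_data, prob_threshold=0.7)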
+ """
+
+ _model_zip_name = global_settings["local_hla_model_zip_name"]
+ _model_url = global_settings["hla_model_url"]
+ _model_zip = os.path.join(pretrain_dir, _model_zip_name)
+
+ def __init__(
+ self,
+ dropout: float = 0.1,
+ model_class: type = HLA_Class_I_LSTM, # model defined above
+ device: str = "gpu",
+ min_peptide_length: int = 8,
+ max_peptide_length: int = 14,
+ **kwargs,
+ ):
+ """
+ Parameters
+ ----------
+ dropout : float, optional
+ dropout rate of the model, by default 0.1
+    model_class : type, optional
+        The model class, either :class:`HLA_Class_I_LSTM` or
+        :class:`HLA_Class_I_Bert`, by default :class:`HLA_Class_I_LSTM`
+    device : str, optional
+        Computing device passed to :class:`ModelInterface`, by default "gpu"
+ min_peptide_length : int, optional
+ minimal peptide length after digestion, by default 8
+ max_peptide_length : int, optional
+ maximal peptide length after digestion, by default 14
+ """
+ super().__init__(device=device)
+ self.build(model_class, dropout=dropout, **kwargs)
+ self.loss_func = torch.nn.BCELoss()
+ self.target_column_to_predict = "HLA_prob_pred"
+ self.min_peptide_length = min_peptide_length
+ self.max_peptide_length = max_peptide_length
+ self._n_neg_per_pos_training = 1
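+        # during training, one random (negative) sequence is sampled per
+        # positive sequence; see _get_features_from_batch_df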
+
+ self.predict_batch_size = 4096
+
+ def _prepare_predict_data_df(
+ self,
+ precursor_df: pd.DataFrame,
+ ):
+ """
+        Prepare the prediction data from `precursor_df`.
+
+ Parameters
+ ----------
+ precursor_df : pd.DataFrame
+ The dataframe to predict.
+ """
+ self.__training = False
+ precursor_df[self.target_column_to_predict] = 0.0
+ self.predict_df = precursor_df
+
+ def _prepare_train_data_df(self, precursor_df: pd.DataFrame, **kwargs):
+ """
+ Prepare data for training from precursor_df.
+
+ Parameters
+ ----------
+ precursor_df : pd.DataFrame
+ The dataframe for training.
+ """
+ self.__training = True
+ precursor_df["nAA"] = precursor_df.sequence.str.len()
+ precursor_df.drop(
+ index=precursor_df[
+ (precursor_df.nAA < self.min_peptide_length)
+ | (precursor_df.nAA > self.max_peptide_length)
+ ].index,
+ inplace=True,
+ )
+ precursor_df.reset_index(inplace=True, drop=True)
+
+ def _get_features_from_batch_df(
+ self,
+ batch_df: pd.DataFrame,
+ **kwargs,
+ ) -> torch.LongTensor:
+ """
+        Convert AA sequences to tokens: a `torch.LongTensor` of AA ASCII codes.
+
+ Parameters
+ ----------
+ batch_df : pd.DataFrame
+ The batch dataframe containing the `sequence` column.
+ All sequences in batch_df are treated as positive.
+ When training, negative sequences are sampled from self.protein_df.
+
+ Returns
+ -------
+ torch.LongTensor
+ The ASCII tokens of AA sequences.
+ """
+ aa_indices = self._as_tensor(
+ get_ascii_indices(batch_df["sequence"].values.astype("U")), dtype=torch.long
+ )
+
+ if self.__training:
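+            # batch_df is expected to contain sequences of a single length
+            # (peptdeep batches by nAA), so negatives use the first row's nAA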
+ rnd_seqs = get_random_sequences(
+ self.protein_df,
+ n=int(len(batch_df) * self._n_neg_per_pos_training),
+ pep_len=batch_df.nAA.values[0],
+ )
+ aa_indices = torch.cat(
+ [
+ aa_indices,
+ self._as_tensor(get_ascii_indices(rnd_seqs), dtype=torch.long),
+ ],
+                dim=0,
+ )
+
+ return aa_indices
+
+ def _get_targets_from_batch_df(
+ self, batch_df: pd.DataFrame, **kwargs
+ ) -> torch.Tensor:
+ """
+ Get target (y) value for training from batch_df.
+
+ Parameters
+ ----------
+ batch_df : pd.DataFrame
+ All sequences in batch_df are positive.
+ Random sequences are negative.
+
+ Returns
+ -------
+ torch.Tensor
+ Tensor with 0-1 binary values.
+ """
+ x = torch.zeros(
+ len(batch_df)
+ + (
+ int(len(batch_df) * self._n_neg_per_pos_training)
+ if self.__training
+ else 0
+ ),
+ device=self.device,
+ )
+ x[: len(batch_df)] = 1
+ return x
+
+ def load_proteins(
+ self,
+ protein_data: Union[pd.DataFrame, str, list, dict],
+ ):
+ """
+ Load proteins, and generate :attr:`protein_df` and
+ :attr:`_cat_protein_sequence` in this object.
+
+ Parameters
+ ----------
+ protein_data : pd.DataFrame | str | list | dict
+ pd.DataFrame: protein_df with a `sequence` column
+ str : absolute or relative fasta file path
+            list: list of fasta file paths
+ dict: protein dict structure
+ """
+
+        if isinstance(protein_data, pd.DataFrame):
+            self.protein_df = protein_data
+        else:
+            self.protein_df = load_prot_df(protein_data)
+        self._cat_protein_sequence = cat_proteins(
+            self.protein_df["sequence"].to_numpy()
+        )
+
+ def _digest_proteins(self):
+ """
+        Non-specifically digest the concatenated proteins, generating :attr:`digested_idxes_df`.
+ """
+ self.digested_idxes_df = nonspecific_digest_cat_proteins(
+ self._cat_protein_sequence, self.min_peptide_length, self.max_peptide_length
+ )
+
+ def _predict_all_probs(self, digest_batch_size: int):
+ """
+ Predict probabilities for self.digested_idxes_df.
+ """
+ for i in tqdm.tqdm(range(0, len(self.digested_idxes_df), digest_batch_size)):
+ _df = self.digested_idxes_df.iloc[i : i + digest_batch_size]
+ seq_df = get_seq_series(_df, self._cat_protein_sequence).to_frame(
+ "sequence"
+ )
+ seq_df["nAA"] = _df.nAA
+ self.predict(seq_df, batch_size=self.predict_batch_size)
+ self.digested_idxes_df[self.target_column_to_predict].values[
+ i : i + digest_batch_size
+ ] = seq_df[self.target_column_to_predict]
+
+ def predict_peptide_df_(
+ self,
+ peptide_df: pd.DataFrame,
+ ):
+ """
+        Predict HLA probabilities for the given peptide dataframe.
+        Probabilities are predicted in place in `peptide_df` by adding
+        the `HLA_prob_pred` column.
+
+ Parameters
+ ----------
+ peptide_df : pd.DataFrame
+ peptide dataframe with `sequence` column.
+ """
+        self.predict(peptide_df, batch_size=self.predict_batch_size)
+
+ def predict_from_proteins(
+ self,
+ protein_data: Union[pd.DataFrame, str, list, dict],
+ prob_threshold: float = 0.7,
+ digest_batch_size: int = 1024000,
+ ) -> pd.DataFrame:
+ """
+        Load proteins, digest them non-specifically, predict HLA-binding
+        probabilities, and return the peptides above `prob_threshold`.
+
+ Parameters
+ ----------
+ protein_data : pd.DataFrame | str | list | dict
+ pd.DataFrame: protein_df with a `sequence` column
+ str : absolute or relative fasta file path
+            list: list of fasta file paths
+ dict: protein dict structure
+ prob_threshold : float, optional
+            Peptides with predicted probability >= this value are kept, by default 0.7
+ digest_batch_size : int, optional
+ Batch size for digestion, by default 1024000
+
+ Returns
+ -------
+ pd.DataFrame
+ The peptide dataframe in alphabase format.
+ """
+ self.load_proteins(protein_data=protein_data)
+
+ self._digest_proteins()
+ self.digested_idxes_df[self.target_column_to_predict] = 0.0
+
+ self._predict_all_probs(digest_batch_size)
+
+ peptide_df = self.digested_idxes_df[
+ self.digested_idxes_df[self.target_column_to_predict] >= prob_threshold
+ ].reset_index(drop=True)
+
+ peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence)
+ return peptide_df
+
+ def _download_pretrained_hla_model(self):
+ download_models(url=self._model_url, target_path=self._model_zip)
+
+ def load_pretrained_hla_model(self):
+ """
+ Load pretrained `HLA1_IEDB.pt` model.
+ """
+ if not os.path.exists(self._model_zip):
+ self._download_pretrained_hla_model()
+ self.load(model_file=self._model_zip, model_path_in_zip="HLA1_IEDB.pt")
diff --git a/peptdeep/hla/hla_utils.py b/peptdeep/hla/hla_utils.py
new file mode 100644
index 00000000..d74d36b0
--- /dev/null
+++ b/peptdeep/hla/hla_utils.py
@@ -0,0 +1,187 @@
+import pandas as pd
+import numpy as np
+import numba
+
+import os
+
+from typing import Union, List
+
+from alphabase.protein.lcp_digest import get_substring_indices
+
+from alphabase.protein.fasta import load_all_proteins
+
+
+def load_prot_df(
+ protein_data: Union[str, list, tuple, set, dict],
+) -> pd.DataFrame:
+ """
+ Load protein dataframe from input protein_data.
+
+ Parameters
+ ----------
+ protein_data : Union[str,list,tuple,set,dict]
+ str: fasta file
+ list (tuple, or set): a list of fasta files
+ dict: protein dict
+
+ Returns
+ -------
+ pd.DataFrame
+ protein dataframe
+
+ Raises
+ ------
+ TypeError
+ protein_data type is not one of str, list, tuple, set, or dict.
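+
+    Example
+    -------
+    A sketch using a protein dict (same structure as in the notebook above):
+
+    >>> prot_df = load_prot_df({"xx": {"protein_id": "xx", "sequence": "MABCDEK"}})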
+ """
+ if isinstance(protein_data, str):
+ protein_dict = load_all_proteins([protein_data])
+ elif isinstance(protein_data, (list, tuple, set)):
+ protein_dict = load_all_proteins(protein_data)
+ elif isinstance(protein_data, dict):
+ protein_dict = protein_data
+ else:
+ raise TypeError(
+ "`protein_data` must be str, list, tuple, set or dict, "
+ f"`{type(protein_data)}` is given."
+ )
+    prot_df = pd.DataFrame.from_dict(protein_dict, orient="index")
+ prot_df["nAA"] = prot_df.sequence.str.len()
+ return prot_df
+
+
+def cat_proteins(sequences: List[str], sep: str = "$") -> str:
+ """
+    Concatenate protein sequences into a single string, separated by `sep`.
+
+    Parameters
+    ----------
+    sequences : List[str]
+        List-like of protein sequences.
+    sep : str, optional
+        Separator placed between and around the sequences, by default '$'
+
+ Returns
+ -------
+ str
+ The concat protein sequence.
+
+ Example
+ -------
+ >>> sequences = ["ABC","DEF"]
+ >>> cat_proteins(sequences, sep="$")
+ '$ABC$DEF$'
+ """
+ return sep + sep.join(sequences) + sep
+
+
+def nonspecific_digest_cat_proteins(
+ cat_sequence: str, min_len: int, max_len: int
+) -> pd.DataFrame:
+ """
+ Digest the concat protein sequence to non-specific peptides.
+
+ Parameters
+ ----------
+ cat_sequence : str
+ The concat protein sequence generated by :func:`cat_proteins`
+ min_len : int
+ Min peptide length
+ max_len : int
+ Max peptide length
+
+ Returns
+ -------
+ pd.DataFrame
+ A dataframe sorted by `nAA` with three columns:
+        `start_pos`: the start index of the peptide in `cat_sequence`
+        `end_pos`: the end index (exclusive) of the peptide in `cat_sequence`
+ `nAA`: the number of amino acids (peptide length).
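+
+    Example
+    -------
+    A sketch of the intended usage; positions index into the concatenated
+    sequence, so slicing recovers each peptide:
+
+    >>> cat_seq = cat_proteins(["MABCDEKFGH", "FGHIKLMN"])
+    >>> digest_df = nonspecific_digest_cat_proteins(cat_seq, 8, 14)
+    >>> # for each row: cat_seq[row.start_pos:row.end_pos] has length row.nAA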
+ """
+ pos_starts, pos_ends = get_substring_indices(cat_sequence, min_len, max_len)
+ digest_df = pd.DataFrame(dict(start_pos=pos_starts, end_pos=pos_ends))
+ digest_df["nAA"] = digest_df.end_pos - digest_df.start_pos
+ digest_df.sort_values("nAA", inplace=True)
+ digest_df.reset_index(inplace=True, drop=True)
+ return digest_df
+
+
+def _get_rnd_subseq(prot_seq_len: tuple, pep_len: int) -> str:
+ """
+ Get random subsequence from a protein sequence.
+ This function is only used by :func:`get_random_sequences`.
+
+ Parameters
+ ----------
+ prot_seq_len : tuple
+ (protein sequence, sequence length)
+ pep_len : int
+ peptide length to get
+
+ Returns
+ -------
+ str
+ The peptide sequence.
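+
+    Example
+    -------
+    Proteins not longer than the requested length are tiled (deterministic);
+    longer proteins yield a random substring:
+
+    >>> _get_rnd_subseq(("ABC", 3), 7)
+    'ABCABCA'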
+ """
+ sequence, prot_len = prot_seq_len
+ if prot_len <= pep_len:
+ return (
+ "".join([sequence] * (pep_len // prot_len)) + sequence[: pep_len % prot_len]
+ )
+ start = np.random.randint(0, prot_len - pep_len)
+ return sequence[start : start + pep_len]
+
+
+def get_random_sequences(prot_df: pd.DataFrame, n: int, pep_len: int):
+ """
+    Randomly sample `n` peptides of length `pep_len` from the proteins in
+    `prot_df`, weighting proteins by their length (`nAA` column).
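+
+    Example
+    -------
+    A sketch: draw random peptides, e.g. as negative training examples:
+
+    >>> neg_seqs = get_random_sequences(prot_df, n=100, pep_len=9)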
+ """
+ return (
+ prot_df.sample(n, replace=True, weights="nAA")[["sequence", "nAA"]]
+ .apply(_get_rnd_subseq, pep_len=pep_len, axis=1)
+ .values.astype("U")
+ )
+
+
+@numba.njit
+def _check_sty(seq: str) -> bool:
+ """
+    Check whether a sequence contains any of the residues S, T, or Y.
+ """
+ for aa in seq:
+ if aa in "STY":
+ return True
+ return False
+
+
+def get_seq_series(idxes_df: pd.DataFrame, cat_prot: str) -> pd.Series:
+ """
+ Get sub-sequence pd.Series from a concat protein sequence based on `idxes_df`.
+
+ Parameters
+ ----------
+ idxes_df : pd.DataFrame
+        A dataframe with `start_pos` and `end_pos` columns indexing into `cat_prot`.
+ cat_prot : str
+ The concat protein sequence.
+
+ Returns
+ -------
+ pd.Series
+ pd.Series with sub-sequences (peptide sequences).
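+
+    Example
+    -------
+    A sketch, assuming `digest_df` came from :func:`nonspecific_digest_cat_proteins`:
+
+    >>> peptides = get_seq_series(digest_df, cat_prot)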
+ """
+ return idxes_df[["start_pos", "end_pos"]].apply(
+ lambda x: cat_prot[slice(*x)], axis=1
+ )
+
+
+def check_is_file(file_path: str):
+ """
+    Check if `file_path` points to an existing file, printing the result.
+ """
+ if os.path.isfile(file_path):
+ print(f"Loading `{file_path}`")
+ return True
+ else:
+ print(f"`{file_path}` does not exist, ignore it.")
+ return False
diff --git a/peptdeep/pretrained_models.py b/peptdeep/pretrained_models.py
index 1e82a240..3d6f8ef2 100644
--- a/peptdeep/pretrained_models.py
+++ b/peptdeep/pretrained_models.py
@@ -64,13 +64,17 @@ def is_model_zip(downloaded_zip):
return any(x == "generic/ms2.pth" for x in zip.namelist())
-def download_models(url: str = model_url, overwrite=True):
+def download_models(url: str = model_url, target_path: str = model_zip, overwrite=True):
"""
Parameters
----------
url : str, optional
Remote or local path.
- Defaults to `peptdeep.pretrained_models.model_url`
+ Defaults to :data:`peptdeep.pretrained_models.model_url`
+
+ target_path : str, optional
+ Target file path after download.
+ Defaults to :data:`peptdeep.pretrained_models.model_zip`
overwrite : bool, optional
-        overwirte old model files.
+        overwrite old model files.
@@ -82,13 +86,13 @@ def download_models(url: str = model_url, overwrite=True):
If remote url is not accessible.
"""
if not os.path.isfile(url):
- logging.info(f"Downloading {model_zip_name} ...")
+ logging.info(f"Downloading {url} ...")
try:
context = ssl._create_unverified_context()
requests = urllib.request.urlopen(url, context=context, timeout=10)
- with open(model_zip, "wb") as f:
+ with open(target_path, "wb") as f:
f.write(requests.read())
- except (socket.timeout, urllib.error.URLError, urllib.error.HTTPError) as e:
+ except (socket.timeout, urllib.error.URLError, urllib.error.HTTPError):
raise FileNotFoundError(
"Downloading model failed! Please download the "
f'zip or tar file by yourself from "{url}",'
@@ -97,8 +101,8 @@ def download_models(url: str = model_url, overwrite=True):
" to install the models"
)
else:
- shutil.copy(url, model_zip)
- logging.info(f"The pretrained models had been downloaded in {model_zip}")
+ shutil.copy(url, target_path)
+ logging.info(f"The pretrained models had been downloaded in {target_path}")
if not os.path.exists(model_zip):
@@ -486,7 +490,7 @@ def _load_file(model, model_file):
return
else:
model.load(model_file)
- except (UnpicklingError, TypeError, ValueError, KeyError) as e:
+ except (UnpicklingError, TypeError, ValueError, KeyError):
logging.info(
f"Cannot load {model_file} as {model.__class__} model, peptdeep will use the pretrained model instead."
)
diff --git a/requirements/requirements_hla.txt b/requirements/requirements_hla.txt
new file mode 100644
index 00000000..f6012f1f
--- /dev/null
+++ b/requirements/requirements_hla.txt
@@ -0,0 +1 @@
+pydivsufsort # used by alphabase.protein.lcp_digest