diff --git a/nbs_tests/hla/hla_class1.ipynb b/nbs_tests/hla/hla_class1.ipynb
new file mode 100644
index 00000000..d0fa0eb3
--- /dev/null
+++ b/nbs_tests/hla/hla_class1.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -q pydivsufsort"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from peptdeep.hla.hla_class1 import HLA1_Binding_Classifier\n",
+    "\n",
+    "model = HLA1_Binding_Classifier()\n",
+    "model.load_pretrained_hla_model()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prot1 = 'MABCDEKFGHIJKLMNOPQRST'\n",
+    "prot2 = 'FGHIJKLMNOPQR'\n",
+    "protein_dict = {\n",
+    "    'xx': {\n",
+    "        'protein_id': 'xx',\n",
+    "        'gene_name': '',\n",
+    "        'sequence': prot1\n",
+    "    },\n",
+    "    'yy': {\n",
+    "        'protein_id': 'yy',\n",
+    "        'gene_name': 'gene',\n",
+    "        'sequence': prot2\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1/1 [00:00<00:00, 14.32it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_posend_posnAAHLA_prob_predsequence
01980.124847MABCDEKF
1142280.040122LMNOPQRS
2132180.674667KLMNOPQR
3122080.119722JKLMNOPQ
4111980.104152IJKLMNOP
..................
79519140.163758DEKFGHIJKLMNOP
80418140.000618CDEKFGHIJKLMNO
81317140.000773BCDEKFGHIJKLMN
82923140.525840GHIJKLMNOPQRST
83620140.156962EKFGHIJKLMNOPQ
\n", + "

84 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " start_pos end_pos nAA HLA_prob_pred sequence\n", + "0 1 9 8 0.124847 MABCDEKF\n", + "1 14 22 8 0.040122 LMNOPQRS\n", + "2 13 21 8 0.674667 KLMNOPQR\n", + "3 12 20 8 0.119722 JKLMNOPQ\n", + "4 11 19 8 0.104152 IJKLMNOP\n", + ".. ... ... ... ... ...\n", + "79 5 19 14 0.163758 DEKFGHIJKLMNOP\n", + "80 4 18 14 0.000618 CDEKFGHIJKLMNO\n", + "81 3 17 14 0.000773 BCDEKFGHIJKLMN\n", + "82 9 23 14 0.525840 GHIJKLMNOPQRST\n", + "83 6 20 14 0.156962 EKFGHIJKLMNOPQ\n", + "\n", + "[84 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_from_proteins(protein_data=protein_dict, prob_threshold=0.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencenAAHLA_prob_pred
0MABCDEKF80.124847
1KLMNOPQR80.674667
2DEKFGHIJKLMNOP140.163758
\n", + "
" + ], + "text/plain": [ + " sequence nAA HLA_prob_pred\n", + "0 MABCDEKF 8 0.124847\n", + "1 KLMNOPQR 8 0.674667\n", + "2 DEKFGHIJKLMNOP 14 0.163758" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "peptide_df = pd.DataFrame({\n", + " \"sequence\": [\"MABCDEKF\",\"KLMNOPQR\",\"DEKFGHIJKLMNOP\"]\n", + "})\n", + "model.predict_peptide_df_(peptide_df=peptide_df)\n", + "peptide_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/constants/default_settings.yaml b/peptdeep/constants/default_settings.yaml index 29ddb833..d01acbc2 100644 --- a/peptdeep/constants/default_settings.yaml +++ b/peptdeep/constants/default_settings.yaml @@ -10,9 +10,11 @@ model: PEPTDEEP_HOME: "~/peptdeep" # ~ refers to user folder (e.g. C:/Users/username) local_model_zip_name: "pretrained_models.zip" +local_hla_model_zip_name: "hla_model.zip" # overwritable config model_url: "https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/pretrained_models.zip" +hla_model_url: "https://github.com/MannLabs/alphapeptdeep/releases/download/pre-trained-models/hla_model.zip" task_workflow: [library] task_choices: diff --git a/peptdeep/hla/__init__.py b/peptdeep/hla/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/peptdeep/hla/hla_class1.py b/peptdeep/hla/hla_class1.py new file mode 100644 index 00000000..3a9f5789 --- /dev/null +++ b/peptdeep/hla/hla_class1.py @@ -0,0 +1,392 @@ +import os +import torch +import pandas as pd +import tqdm + +from typing import Union + +import peptdeep.model.building_block as building_block +from peptdeep.model.model_interface import ModelInterface +from peptdeep.model.featurize import get_ascii_indices +from peptdeep.pretrained_models import pretrain_dir, download_models, global_settings + +from .hla_utils import ( + get_random_sequences, + load_prot_df, + cat_proteins, + get_seq_series, + nonspecific_digest_cat_proteins, +) + + +class HLA_Class_I_LSTM(torch.nn.Module): + """ + HLA-I-binding peptide prediction model using LSTM. + """ + + def __init__( + self, + *, + hidden_dim=256, + input_dim=128, + n_lstm_layers=4, + dropout=0.1, + ): + """ + Parameters + ---------- + hidden_dim : int, optional + hidden dimension, by default 256 + input_dim : int, optional + input dimension, by default 128 (ASCII) + n_lstm_layers : int, optional + number of LSTM layers, by default 4 + dropout : float, optional + dropout rate, by default 0.1 + """ + super().__init__() + self.dropout = torch.nn.Dropout(dropout) + + self.nn = torch.nn.Sequential( + torch.nn.Embedding(input_dim, hidden_dim // 4), + building_block.SeqCNN(hidden_dim // 4), + self.dropout, + building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=n_lstm_layers), + building_block.SeqAttentionSum(hidden_dim), + self.dropout, + torch.nn.Linear(hidden_dim, 64), + torch.nn.GELU(), + torch.nn.Linear(64, 1), + torch.nn.Sigmoid(), + ) + + def forward(self, x): + return self.nn(x).squeeze(-1) + + +class HLA_Class_I_Bert(torch.nn.Module): + """ + Model based on a transformer Architecture from + Huggingface's BertEncoder class. 
+ """ + + def __init__( + self, + nlayers=4, + input_dim=128, + hidden_dim=256, + output_attentions=False, + dropout=0.1, + **kwargs, + ): + super().__init__() + + self.dropout = torch.nn.Dropout(dropout) + + self.input_nn = torch.nn.Sequential( + torch.nn.Embedding(input_dim, hidden_dim), + building_block.PositionalEncoding(hidden_dim), + ) + + self._output_attentions = output_attentions + + self.hidden_nn = building_block.Hidden_HFace_Transformer( + hidden_dim, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, + ) + + self.output_nn = torch.nn.Sequential( + building_block.SeqAttentionSum(hidden_dim), + torch.nn.PReLU(), + self.dropout, + torch.nn.Linear(hidden_dim, 1), + torch.nn.Sigmoid(), + ) + + @property + def output_attentions(self): + return self._output_attentions + + @output_attentions.setter + def output_attentions(self, val: bool): + self._output_attentions = val + self.hidden_nn.output_attentions = val + + def forward(self, x): + x = self.dropout(self.input_nn(x)) + + x = self.hidden_nn(x) + if self.output_attentions: + self.attentions = x[1] + else: + self.attentions = None + x = self.dropout(x[0]) + + return self.output_nn(x).squeeze(1) + + +class HLA1_Binding_Classifier(ModelInterface): + """ + Class to predict HLA-binding probabilities of peptides. + """ + + _model_zip_name = global_settings["local_hla_model_zip_name"] + _model_url = global_settings["hla_model_url"] + _model_zip = os.path.join(pretrain_dir, _model_zip_name) + + def __init__( + self, + dropout: float = 0.1, + model_class: type = HLA_Class_I_LSTM, # model defined above + device: str = "gpu", + min_peptide_length: int = 8, + max_peptide_length: int = 14, + **kwargs, + ): + """ + Parameters + ---------- + dropout : float, optional + dropout rate of the model, by default 0.1 + model_class : torch.nn.Module, optional + The model class type, can be :class:`HLA_Class_I_LSTM` or + :class:`HLA_Class_I_Bert`, by default :class:`HLA_Class_I_LSTM` + min_peptide_length : int, optional + minimal peptide length after digestion, by default 8 + max_peptide_length : int, optional + maximal peptide length after digestion, by default 14 + """ + super().__init__(device=device) + self.build(model_class, dropout=dropout, **kwargs) + self.loss_func = torch.nn.BCELoss() + self.target_column_to_predict = "HLA_prob_pred" + self.min_peptide_length = min_peptide_length + self.max_peptide_length = max_peptide_length + self._n_neg_per_pos_training = 1 + + self.predict_batch_size = 4096 + + def _prepare_predict_data_df( + self, + precursor_df: pd.DataFrame, + ): + """ + Prepare the predicting data from `precursor_df`. + + Parameters + ---------- + precursor_df : pd.DataFrame + The dataframe to predict. + """ + self.__training = False + precursor_df[self.target_column_to_predict] = 0.0 + self.predict_df = precursor_df + + def _prepare_train_data_df(self, precursor_df: pd.DataFrame, **kwargs): + """ + Prepare data for training from precursor_df. + + Parameters + ---------- + precursor_df : pd.DataFrame + The dataframe for training. 
+ """ + self.__training = True + precursor_df["nAA"] = precursor_df.sequence.str.len() + precursor_df.drop( + index=precursor_df[ + (precursor_df.nAA < self.min_peptide_length) + | (precursor_df.nAA > self.max_peptide_length) + ].index, + inplace=True, + ) + precursor_df.reset_index(inplace=True, drop=True) + + def _get_features_from_batch_df( + self, + batch_df: pd.DataFrame, + **kwargs, + ) -> torch.LongTensor: + """ + Convert AA sequences to tokens, which are `torch.LongTensor` of AA ASCII code array. + + Parameters + ---------- + batch_df : pd.DataFrame + The batch dataframe containing the `sequence` column. + All sequences in batch_df are treated as positive. + When training, negative sequences are sampled from self.protein_df. + + Returns + ------- + torch.LongTensor + The ASCII tokens of AA sequences. + """ + aa_indices = self._as_tensor( + get_ascii_indices(batch_df["sequence"].values.astype("U")), dtype=torch.long + ) + + if self.__training: + rnd_seqs = get_random_sequences( + self.protein_df, + n=int(len(batch_df) * self._n_neg_per_pos_training), + pep_len=batch_df.nAA.values[0], + ) + aa_indices = torch.cat( + [ + aa_indices, + self._as_tensor(get_ascii_indices(rnd_seqs), dtype=torch.long), + ], + axis=0, + ) + + return aa_indices + + def _get_targets_from_batch_df( + self, batch_df: pd.DataFrame, **kwargs + ) -> torch.Tensor: + """ + Get target (y) value for training from batch_df. + + Parameters + ---------- + batch_df : pd.DataFrame + All sequences in batch_df are positive. + Random sequences are negative. + + Returns + ------- + torch.Tensor + Tensor with 0-1 binary values. + """ + x = torch.zeros( + len(batch_df) + + ( + int(len(batch_df) * self._n_neg_per_pos_training) + if self.__training + else 0 + ), + device=self.device, + ) + x[: len(batch_df)] = 1 + return x + + def load_proteins( + self, + protein_data: Union[pd.DataFrame, str, list, dict], + ): + """ + Load proteins, and generate :attr:`protein_df` and + :attr:`_cat_protein_sequence` in this object. + + Parameters + ---------- + protein_data : pd.DataFrame | str | list | dict + pd.DataFrame: protein_df with a `sequence` column + str : absolute or relative fasta file path + list: list of fasta file path + dict: protein dict structure + """ + + if isinstance(protein_data, pd.DataFrame): + self.protein_df = protein_data + self._cat_protein_sequence = cat_proteins( + self.protein_df["sequence"].to_numpy() + ) + else: + self.protein_df = load_prot_df(protein_data) + self._cat_protein_sequence = cat_proteins( + self.protein_df["sequence"].to_numpy() + ) + + def _digest_proteins(self): + """ + Unspecific digestion of proteins generates :attr:`digested_idxes_df`. + """ + self.digested_idxes_df = nonspecific_digest_cat_proteins( + self._cat_protein_sequence, self.min_peptide_length, self.max_peptide_length + ) + + def _predict_all_probs(self, digest_batch_size: int): + """ + Predict probabilities for self.digested_idxes_df. 
+ """ + for i in tqdm.tqdm(range(0, len(self.digested_idxes_df), digest_batch_size)): + _df = self.digested_idxes_df.iloc[i : i + digest_batch_size] + seq_df = get_seq_series(_df, self._cat_protein_sequence).to_frame( + "sequence" + ) + seq_df["nAA"] = _df.nAA + self.predict(seq_df, batch_size=self.predict_batch_size) + self.digested_idxes_df[self.target_column_to_predict].values[ + i : i + digest_batch_size + ] = seq_df[self.target_column_to_predict] + + def predict_peptide_df_( + self, + peptide_df: pd.DataFrame, + ): + """ + Predict HLA probabilities for the given peptide dataframe + Probabilities are predicted inplace in `peptide_df` with + the predicted `HLA_prob_pred` column. + + Parameters + ---------- + peptide_df : pd.DataFrame + peptide dataframe with `sequence` column. + """ + peptide_df = self.predict(peptide_df, batch_size=self.predict_batch_size) + + def predict_from_proteins( + self, + protein_data: Union[pd.DataFrame, str, list, dict], + prob_threshold: float = 0.7, + digest_batch_size: int = 1024000, + ) -> pd.DataFrame: + """ + Digest peptides from :attr:`protein_df`. + + Parameters + ---------- + protein_data : pd.DataFrame | str | list | dict + pd.DataFrame: protein_df with a `sequence` column + str : absolute or relative fasta file path + list: list of fasta file path + dict: protein dict structure + prob_threshold : float, optional + Peptides above this probability are kept, by default 0.7 + digest_batch_size : int, optional + Batch size for digestion, by default 1024000 + + Returns + ------- + pd.DataFrame + The peptide dataframe in alphabase format. + """ + self.load_proteins(protein_data=protein_data) + + self._digest_proteins() + self.digested_idxes_df[self.target_column_to_predict] = 0.0 + + self._predict_all_probs(digest_batch_size) + + peptide_df = self.digested_idxes_df[ + self.digested_idxes_df[self.target_column_to_predict] >= prob_threshold + ].reset_index(drop=True) + + peptide_df["sequence"] = get_seq_series(peptide_df, self._cat_protein_sequence) + return peptide_df + + def _download_pretrained_hla_model(self): + download_models(url=self._model_url, target_path=self._model_zip) + + def load_pretrained_hla_model(self): + """ + Load pretrained `HLA1_IEDB.pt` model. + """ + if not os.path.exists(self._model_zip): + self._download_pretrained_hla_model() + self.load(model_file=self._model_zip, model_path_in_zip="HLA1_IEDB.pt") diff --git a/peptdeep/hla/hla_utils.py b/peptdeep/hla/hla_utils.py new file mode 100644 index 00000000..d74d36b0 --- /dev/null +++ b/peptdeep/hla/hla_utils.py @@ -0,0 +1,187 @@ +import pandas as pd +import numpy as np +import numba + +import os + +from typing import Union, List + +from alphabase.protein.lcp_digest import get_substring_indices + +from alphabase.protein.fasta import load_all_proteins + + +def load_prot_df( + protein_data: Union[str, list, tuple, set, dict], +) -> pd.DataFrame: + """ + Load protein dataframe from input protein_data. + + Parameters + ---------- + protein_data : Union[str,list,tuple,set,dict] + str: fasta file + list (tuple, or set): a list of fasta files + dict: protein dict + + Returns + ------- + pd.DataFrame + protein dataframe + + Raises + ------ + TypeError + protein_data type is not one of str, list, tuple, set, or dict. 
+ """ + if isinstance(protein_data, str): + protein_dict = load_all_proteins([protein_data]) + elif isinstance(protein_data, (list, tuple, set)): + protein_dict = load_all_proteins(protein_data) + elif isinstance(protein_data, dict): + protein_dict = protein_data + else: + raise TypeError( + "`protein_data` must be str, list, tuple, set or dict, " + f"`{type(protein_data)}` is given." + ) + prot_df = pd.DataFrame().from_dict(protein_dict, orient="index") + prot_df["nAA"] = prot_df.sequence.str.len() + return prot_df + + +def cat_proteins(sequences: List[str], sep: str = "$") -> str: + """ + Concatenate protein sequences in `prot_df` into a single sequence. + + Parameters + ---------- + sequences : list + List-like sequence list. + sep : str, optional + Separater of the concat string, by default '$' + + Returns + ------- + str + The concat protein sequence. + + Example + ------- + >>> sequences = ["ABC","DEF"] + >>> cat_proteins(sequences, sep="$") + '$ABC$DEF$' + """ + return sep + sep.join(sequences) + sep + + +def nonspecific_digest_cat_proteins( + cat_sequence: str, min_len: int, max_len: int +) -> pd.DataFrame: + """ + Digest the concat protein sequence to non-specific peptides. + + Parameters + ---------- + cat_sequence : str + The concat protein sequence generated by :func:`cat_proteins` + min_len : int + Min peptide length + max_len : int + Max peptide length + + Returns + ------- + pd.DataFrame + A dataframe sorted by `nAA` with three columns: + `start_pos`: the start index of the peptide in cat_protein + `end_pos`: the stop/end index of the peptide in cat_protein + `nAA`: the number of amino acids (peptide length). + """ + pos_starts, pos_ends = get_substring_indices(cat_sequence, min_len, max_len) + digest_df = pd.DataFrame(dict(start_pos=pos_starts, end_pos=pos_ends)) + digest_df["nAA"] = digest_df.end_pos - digest_df.start_pos + digest_df.sort_values("nAA", inplace=True) + digest_df.reset_index(inplace=True, drop=True) + return digest_df + + +def _get_rnd_subseq(prot_seq_len: tuple, pep_len: int) -> str: + """ + Get random subsequence from a protein sequence. + This function is only used by :func:`get_random_sequences`. + + Parameters + ---------- + prot_seq_len : tuple + (protein sequence, sequence length) + pep_len : int + peptide length to get + + Returns + ------- + str + The peptide sequence. + """ + sequence, prot_len = prot_seq_len + if prot_len <= pep_len: + return ( + "".join([sequence] * (pep_len // prot_len)) + sequence[: pep_len % prot_len] + ) + start = np.random.randint(0, prot_len - pep_len) + return sequence[start : start + pep_len] + + +def get_random_sequences(prot_df: pd.DataFrame, n: int, pep_len: int): + """ + Random peptide sampling from proteins + """ + return ( + prot_df.sample(n, replace=True, weights="nAA")[["sequence", "nAA"]] + .apply(_get_rnd_subseq, pep_len=pep_len, axis=1) + .values.astype("U") + ) + + +@numba.njit +def _check_sty(seq: str) -> bool: + """ + If a sequence contains STY. + """ + for aa in seq: + if aa in "STY": + return True + return False + + +def get_seq_series(idxes_df: pd.DataFrame, cat_prot: str) -> pd.Series: + """ + Get sub-sequence pd.Series from a concat protein sequence based on `idxes_df`. + + Parameters + ---------- + idxes_df : pd.DataFrame + a dataframe with `start_pos` and `stop_pos` columns of `cat_prot`. + cat_prot : str + The concat protein sequence. + + Returns + ------- + pd.Series + pd.Series with sub-sequences (peptide sequences). 
+ """ + return idxes_df[["start_pos", "end_pos"]].apply( + lambda x: cat_prot[slice(*x)], axis=1 + ) + + +def check_is_file(file_path: str): + """ + Check if a file_path exists. + """ + if os.path.isfile(file_path): + print(f"Loading `{file_path}`") + return True + else: + print(f"`{file_path}` does not exist, ignore it.") + return False diff --git a/peptdeep/pretrained_models.py b/peptdeep/pretrained_models.py index 1e82a240..3d6f8ef2 100644 --- a/peptdeep/pretrained_models.py +++ b/peptdeep/pretrained_models.py @@ -64,13 +64,17 @@ def is_model_zip(downloaded_zip): return any(x == "generic/ms2.pth" for x in zip.namelist()) -def download_models(url: str = model_url, overwrite=True): +def download_models(url: str = model_url, target_path: str = model_zip, overwrite=True): """ Parameters ---------- url : str, optional Remote or local path. - Defaults to `peptdeep.pretrained_models.model_url` + Defaults to :data:`peptdeep.pretrained_models.model_url` + + target_path : str, optional + Target file path after download. + Defaults to :data:`peptdeep.pretrained_models.model_zip` overwrite : bool, optional overwirte old model files. @@ -82,13 +86,13 @@ def download_models(url: str = model_url, overwrite=True): If remote url is not accessible. """ if not os.path.isfile(url): - logging.info(f"Downloading {model_zip_name} ...") + logging.info(f"Downloading {url} ...") try: context = ssl._create_unverified_context() requests = urllib.request.urlopen(url, context=context, timeout=10) - with open(model_zip, "wb") as f: + with open(target_path, "wb") as f: f.write(requests.read()) - except (socket.timeout, urllib.error.URLError, urllib.error.HTTPError) as e: + except (socket.timeout, urllib.error.URLError, urllib.error.HTTPError): raise FileNotFoundError( "Downloading model failed! Please download the " f'zip or tar file by yourself from "{url}",' @@ -97,8 +101,8 @@ def download_models(url: str = model_url, overwrite=True): " to install the models" ) else: - shutil.copy(url, model_zip) - logging.info(f"The pretrained models had been downloaded in {model_zip}") + shutil.copy(url, target_path) + logging.info(f"The pretrained models had been downloaded in {target_path}") if not os.path.exists(model_zip): @@ -486,7 +490,7 @@ def _load_file(model, model_file): return else: model.load(model_file) - except (UnpicklingError, TypeError, ValueError, KeyError) as e: + except (UnpicklingError, TypeError, ValueError, KeyError): logging.info( f"Cannot load {model_file} as {model.__class__} model, peptdeep will use the pretrained model instead." ) diff --git a/requirements/requirements_hla.txt b/requirements/requirements_hla.txt new file mode 100644 index 00000000..f6012f1f --- /dev/null +++ b/requirements/requirements_hla.txt @@ -0,0 +1 @@ +pydivsufsort # used by alphabase.protein.lcp_digest