Skip to content

Commit

Permalink
Finish creation and testing of SequencePreprocessor (examples are mis…
Browse files Browse the repository at this point in the history
…sing)
  • Loading branch information
breimanntools committed Jun 28, 2024
1 parent d729bdf commit 131bd9a
Show file tree
Hide file tree
Showing 12 changed files with 833 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


# II Main Functions
def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
def encode_integer(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", gap="-", pad_at="C"):
"""
Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences
with gaps represented as zero vectors.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"):


# II Main Functions
def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
def encode_one_hot(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", gap="-", pad_at="C"):
"""
One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences
with gaps represented as zero vectors.
Expand Down
44 changes: 25 additions & 19 deletions aaanalysis/data_handling/_seq_preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,13 @@ def check_match_seq_slide_start_window_size(seq=None, slide_start=None, window_s
# TODO e.g., seq_filter, comp_seq_sim, SHAP ...
class SequencePreprocessor:
"""
This class provides methods for preprocessing protein sequences, including encoding and window extraction.
Utility data preprocessing class to encode and represent protein sequences.
"""

# Sequence encoding
@staticmethod
def encode_one_hot(list_seq: Union[List[str], str] = None,
alphabet: str = "ARNDCEQGHILKMFPSTWYV",
alphabet: str = "ACDEFGHIKLMNPQRSTVWY",
gap: str = "-",
pad_at: Literal["C", "N"] = "C",
) -> Tuple[np.ndarray, List[str]]:
Expand All @@ -140,11 +140,12 @@ def encode_one_hot(list_seq: Union[List[str], str] = None,
Parameters
----------
list_seq : list of str or str
List of protein sequences to encode.
alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or
be represented by the ``gap``.
alphabet : str, default='ACDEFGHIKLMNPQRSTVWY'
The alphabet of amino acids used for encoding.
gap : str, default='-'
The character used to represent gaps in sequences.
The character used to represent gaps within sequences. It should not be included in the ``alphabet``.
pad_at : str, default='C'
Specifies where to add the padding:
Expand Down Expand Up @@ -177,7 +178,7 @@ def encode_one_hot(list_seq: Union[List[str], str] = None,

@staticmethod
def encode_integer(list_seq: Union[List[str], str] = None,
alphabet: str = "ARNDCEQGHILKMFPSTWYV",
alphabet: str = "ACDEFGHIKLMNPQRSTVWY",
gap: str = "-",
pad_at: Literal["C", "N"] = "C",
) -> Tuple[np.ndarray, List[str]]:
Expand All @@ -190,11 +191,12 @@ def encode_integer(list_seq: Union[List[str], str] = None,
Parameters
----------
list_seq : list of str or str
List of protein sequences to encode.
alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or
be represented by the ``gap``.
alphabet : str, default='ACDEFGHIKLMNPQRSTVWY'
The alphabet of amino acids used for encoding.
gap : str, default='-'
The character used to represent gaps in sequences.
The character used to represent gaps within sequences. It should not be included in the ``alphabet``.
pad_at : str, default='C'
Specifies where to add the padding:
Expand Down Expand Up @@ -226,8 +228,8 @@ def encode_integer(list_seq: Union[List[str], str] = None,
return X, features

@staticmethod
def get_aa_window(seq: str,
pos_start: int,
def get_aa_window(seq: str = None,
pos_start: int = 0,
pos_stop: Optional[int] = None,
window_size: Optional[int] = None,
index1: bool = False,
Expand All @@ -237,23 +239,22 @@ def get_aa_window(seq: str,
"""
Extracts a window of amino acids from a sequence.
This window starts from a given start position (``pos_start``, starting from 1)
and stops either at a defined stop position (``pos_stop``) or after a number of
residues defined by ``window_size``.
This window starts from a given start position (``pos_start``) and stops either at a defined
stop position (``pos_stop``) or after a number of residues defined by ``window_size``.
Parameters
----------
seq : str
The protein sequence from which to extract the window.
pos_start : int
pos_start : int, default=0
The starting position (>=0) of the window.
pos_stop : int, optional
The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used.
The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used to determine it.
window_size : int, optional
The size of the window (>=1) to extract. Only used if ``pos_end`` is ``None``.
The size of the window (>=1) to extract. Only used if ``pos_stop`` is ``None``.
index1 : bool, default=False
Whether position index starts at 1 (if ``True``) or 0 (if ``False``),
where first amino acid is at position 1 or 0, respectively.
where the first amino acid is at position 1 or 0, respectively.
gap : str, default='-'
The character used to represent gaps.
accept_gap : bool, default=True
Expand All @@ -264,6 +265,10 @@ def get_aa_window(seq: str,
window : str
The extracted window of amino acids.
Notes
-----
* A ``ValueError`` is raised if both ``pos_stop`` and ``window_size`` are ``None`` or if both are provided.
Examples
--------
.. include:: examples/sp_get_aa_window.rst
Expand Down Expand Up @@ -298,7 +303,7 @@ def get_sliding_aa_window(seq: str = None,
window_size: int = 5,
index1: bool = False,
gap: str = '-',
accept_gap: bool = False
accept_gap: bool = True
) -> List[str]:
"""
Extract sliding windows of amino acids from a sequence.
Expand All @@ -325,6 +330,7 @@ def get_sliding_aa_window(seq: str = None,
-------
list_windows : list of str
A list of extracted windows of amino acids.
Examples
--------
.. include:: examples/sp_get_sliding_aa_window.rst
Expand Down
3 changes: 1 addition & 2 deletions aaanalysis/data_handling_pro/_comp_seq_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ def comp_seq_sim(seq1: Optional[str] = None,
df_seq: Optional[pd.DataFrame] = None,
) -> Union[float, pd.DataFrame]:
"""
Compute sequence similarity between two sequences or
pairwise sequence similarity between all sequences in a DataFrame.
Compute pairwise similarity between two or more sequences.
The normalized sequence similarity score between two sequences is computed as a fraction of the alignment score
to the length of the longest sequence. The alignment score is obtained using the :class:`Bio.Align.PairwiseAligner`
Expand Down
50 changes: 50 additions & 0 deletions examples/data_handling/sp_encode_integer.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"You can integer-encode protein sequences using the ``SequencePreprocessor().encode_integer()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
],
"metadata": {
"collapsed": false
},
"id": "91e15230dad23b05"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import aaanalysis as aa\n",
"\n",
"seq = \"AACDEFGHII\"\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
"collapsed": false
},
"id": "6529c65f51e1c14f"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
50 changes: 50 additions & 0 deletions examples/data_handling/sp_encode_one_hot.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"You can one-hot-encode protein sequences using the ``SequencePreprocessor().encode_one_hot()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
],
"metadata": {
"collapsed": false
},
"id": "a0933f329a756b28"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import aaanalysis as aa\n",
"\n",
"seq = \"AACDEFGHII\"\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
"collapsed": false
},
"id": "7a26ecf62120d4d3"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
50 changes: 50 additions & 0 deletions examples/data_handling/sp_get_aa_window.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"You can obtain a defined amino acid window (a subsequence of defined length) from a protein sequence using the ``SequencePreprocessor().get_aa_window()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
],
"metadata": {
"collapsed": false
},
"id": "9c145a5f9339adbb"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import aaanalysis as aa\n",
"\n",
"seq = \"AACDEFGHII\"\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
"collapsed": false
},
"id": "72d98628b21cc579"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
50 changes: 50 additions & 0 deletions examples/data_handling/sp_get_sliding_aa_window.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"You can obtain multiple defined amino acid windows (shifted by 1 residue position towards the C-terminus) from a protein sequence using the ``SequencePreprocessor().get_sliding_aa_window()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
],
"metadata": {
"collapsed": false
},
"id": "a3d92c0e51155422"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import aaanalysis as aa\n",
"\n",
"seq = \"AACDEFGHII\"\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
"collapsed": false
},
"id": "6cabbb7fb20d38c8"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 131bd9a

Please sign in to comment.