Finish creation and testing of SequencePreprocessor (examples are missing)
breimanntools · Jun 28, 2024 · 131bd9a · 131bd9a
1 parent d729bdf
commit 131bd9a
Show file tree

Hide file tree

Showing 12 changed files with 833 additions and 23 deletions.
diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py
@@ -11,7 +11,7 @@
 
 
 # II Main Functions
-def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
+def encode_integer(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", gap="-", pad_at="C"):
     """
     Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences
     with gaps represented as zero vectors.

diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py
@@ -22,7 +22,7 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"):
 
 
 # II Main Functions
-def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"):
+def encode_one_hot(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", gap="-", pad_at="C"):
     """
     One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences
     with gaps represented as zero vectors.

diff --git a/aaanalysis/data_handling/_seq_preproc.py b/aaanalysis/data_handling/_seq_preproc.py
@@ -119,13 +119,13 @@ def check_match_seq_slide_start_window_size(seq=None, slide_start=None, window_s
 # TODO e.g., seq_filter, comp_seq_sim, SHAP ...
 class SequencePreprocessor:
     """
-    This class provides methods for preprocessing protein sequences, including encoding and window extraction.
+    Utility data preprocessing class to encode and represent protein sequences.
     """
 
     # Sequence encoding
     @staticmethod
     def encode_one_hot(list_seq: Union[List[str], str] = None,
-                       alphabet: str = "ARNDCEQGHILKMFPSTWYV",
+                       alphabet: str = "ACDEFGHIKLMNPQRSTVWY",
                        gap: str = "-",
                        pad_at: Literal["C", "N"] = "C",
                        ) -> Tuple[np.ndarray, List[str]]:
@@ -140,11 +140,12 @@ def encode_one_hot(list_seq: Union[List[str], str] = None,
         Parameters
         ----------
         list_seq : list of str or str
-            List of protein sequences to encode.
-        alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
+            List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or
+            be represented by the ``gap``.
+        alphabet : str, default='ACDEFGHIKLMNPQRSTVWY'
             The alphabet of amino acids used for encoding.
         gap : str, default='-'
-            The character used to represent gaps in sequences.
+            The character used to represent gaps within sequences. It should not be included in the ``alphabet``.
         pad_at : str, default='C'
             Specifies where to add the padding:
 
@@ -177,7 +178,7 @@ def encode_one_hot(list_seq: Union[List[str], str] = None,
 
     @staticmethod
     def encode_integer(list_seq: Union[List[str], str] = None,
-                       alphabet: str = "ARNDCEQGHILKMFPSTWYV",
+                       alphabet: str = "ACDEFGHIKLMNPQRSTVWY",
                        gap: str = "-",
                        pad_at: Literal["C", "N"] = "C",
                        ) -> Tuple[np.ndarray, List[str]]:
@@ -190,11 +191,12 @@ def encode_integer(list_seq: Union[List[str], str] = None,
         Parameters
         ----------
         list_seq : list of str or str
-            List of protein sequences to encode.
-        alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
+            List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or
+            be represented by the ``gap``.
+        alphabet : str, default='ACDEFGHIKLMNPQRSTVWY'
             The alphabet of amino acids used for encoding.
         gap : str, default='-'
-            The character used to represent gaps in sequences.
+            The character used to represent gaps within sequences. It should not be included in the ``alphabet``.
         pad_at : str, default='C'
             Specifies where to add the padding:
 
@@ -226,8 +228,8 @@ def encode_integer(list_seq: Union[List[str], str] = None,
         return X, features
 
     @staticmethod
-    def get_aa_window(seq: str,
-                      pos_start: int,
+    def get_aa_window(seq: str = None,
+                      pos_start: int = 0,
                       pos_stop: Optional[int] = None,
                       window_size: Optional[int] = None,
                       index1: bool = False,
@@ -237,23 +239,22 @@ def get_aa_window(seq: str,
         """
         Extracts a window of amino acids from a sequence.
 
-        This window starts from a given start position (``pos_start``, starting from 1)
-        and stops either at a defined stop position (``pos_stop``) or after a number of
-        residues defined by ``window_size``.
+        This window starts from a given start position (``pos_start``) and stops either at a defined
+        stop position (``pos_stop``) or after a number of residues defined by ``window_size``.
 
         Parameters
         ----------
         seq : str
             The protein sequence from which to extract the window.
-        pos_start : int
+        pos_start : int, default=0
             The starting position (>=0) of the window.
         pos_stop : int, optional
-            The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used.
+            The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used to determine it.
         window_size : int, optional
-            The size of the window (>=1) to extract. Only used if ``pos_end`` is ``None``.
+            The size of the window (>=1) to extract. Only used if ``pos_stop`` is ``None``.
         index1 : bool, default=False
             Whether position index starts at 1 (if ``True``) or 0 (if ``False``),
-            where first amino acid is at position 1 or 0, respectively.
+            where the first amino acid is at position 1 or 0, respectively.
         gap : str, default='-'
             The character used to represent gaps.
         accept_gap : bool, default=True
@@ -264,6 +265,10 @@ def get_aa_window(seq: str,
         window : str
             The extracted window of amino acids.
 
+        Notes
+        -----
+        * A ``ValueError`` is raised if both ``pos_stop`` and ``window_size`` are ``None`` or if both are provided.
+
         Examples
         --------
         .. include:: examples/sp_get_aa_window.rst
@@ -298,7 +303,7 @@ def get_sliding_aa_window(seq: str = None,
                               window_size: int = 5,
                               index1: bool = False,
                               gap: str = '-',
-                              accept_gap: bool = False
+                              accept_gap: bool = True
                               ) -> List[str]:
         """
         Extract sliding windows of amino acids from a sequence.
@@ -325,6 +330,7 @@ def get_sliding_aa_window(seq: str = None,
         -------
         list_windows : list of str
             A list of extracted windows of amino acids.
+
         Examples
         --------
         .. include:: examples/sp_get_sliding_aa_window.rst

diff --git a/aaanalysis/data_handling_pro/_comp_seq_sim.py b/aaanalysis/data_handling_pro/_comp_seq_sim.py
@@ -15,8 +15,7 @@ def comp_seq_sim(seq1: Optional[str] = None,
                  df_seq: Optional[pd.DataFrame] = None,
                  ) -> Union[float, pd.DataFrame]:
     """
-    Compute sequence similarity between two sequences or
-    pairwise sequence similarity between all sequences in a DataFrame.
+    Compute pairwise similarity between two or more sequences.
 
     The normalized sequence similarity score between two sequences is computed as a fraction of the alignment score
     to the length of the longest sequence. The alignment score is obtained using the :class:`Bio.Align.PairwiseAligner`

diff --git a/examples/data_handling/sp_encode_integer.ipynb b/examples/data_handling/sp_encode_integer.ipynb
@@ -0,0 +1,50 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "You can integer encode protein sequences using the ``SequencePreprocessor().encode_integer()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "91e15230dad23b05"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import aaanalysis as aa\n",
+    "\n",
+    "seq = \"AACDEFGHII\"\n",
+    "sp = aa.SequencePreprocessor()"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "6529c65f51e1c14f"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/data_handling/sp_encode_one_hot.ipynb b/examples/data_handling/sp_encode_one_hot.ipynb
@@ -0,0 +1,50 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "You can one-hot encode protein sequences using the ``SequencePreprocessor().encode_one_hot()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "a0933f329a756b28"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import aaanalysis as aa\n",
+    "\n",
+    "seq = \"AACDEFGHII\"\n",
+    "sp = aa.SequencePreprocessor()"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "7a26ecf62120d4d3"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/data_handling/sp_get_aa_window.ipynb b/examples/data_handling/sp_get_aa_window.ipynb
@@ -0,0 +1,50 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "You can obtain a defined amino acid window (a subsequence of defined length) from a protein sequence using the ``SequencePreprocessor().get_aa_window()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "9c145a5f9339adbb"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import aaanalysis as aa\n",
+    "\n",
+    "seq = \"AACDEFGHII\"\n",
+    "sp = aa.SequencePreprocessor()"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "72d98628b21cc579"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/data_handling/sp_get_sliding_aa_window.ipynb b/examples/data_handling/sp_get_sliding_aa_window.ipynb
@@ -0,0 +1,50 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "You can obtain multiple defined amino acid windows (shifted by 1 residue position towards the C-terminus) from a protein sequence using the ``SequencePreprocessor().get_sliding_aa_window()`` method. We first create an example sequence and the ``SequencePreprocessor()`` object as follows:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "a3d92c0e51155422"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import aaanalysis as aa\n",
+    "\n",
+    "seq = \"AACDEFGHII\"\n",
+    "sp = aa.SequencePreprocessor()"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "6cabbb7fb20d38c8"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}