From 131bd9a8a722ca06e8d722993668bd3e7cc6cc40 Mon Sep 17 00:00:00 2001 From: stephanbreimann Date: Fri, 28 Jun 2024 21:54:11 +0200 Subject: [PATCH] Finish creation and testing of SequencePreprocessor (examples are missing) --- .../_backend/seq_preproc/encode_integer.py | 2 +- .../_backend/seq_preproc/encode_one_hot.py | 2 +- aaanalysis/data_handling/_seq_preproc.py | 44 +++-- aaanalysis/data_handling_pro/_comp_seq_sim.py | 3 +- .../data_handling/sp_encode_integer.ipynb | 50 +++++ .../data_handling/sp_encode_one_hot.ipynb | 50 +++++ examples/data_handling/sp_get_aa_window.ipynb | 50 +++++ .../sp_get_sliding_aa_window.ipynb | 50 +++++ .../test_sp_encode_integer.py | 131 +++++++++++++ .../test_sp_encode_one_hot.py | 130 +++++++++++++ .../test_sp_get_aa_window.py | 166 ++++++++++++++++ .../test_sp_get_sliding_aa_window.py | 178 ++++++++++++++++++ 12 files changed, 833 insertions(+), 23 deletions(-) create mode 100644 examples/data_handling/sp_encode_integer.ipynb create mode 100644 examples/data_handling/sp_encode_one_hot.ipynb create mode 100644 examples/data_handling/sp_get_aa_window.ipynb create mode 100644 examples/data_handling/sp_get_sliding_aa_window.ipynb create mode 100644 tests/unit/data_handling_tests/test_sp_encode_integer.py create mode 100644 tests/unit/data_handling_tests/test_sp_encode_one_hot.py create mode 100644 tests/unit/data_handling_tests/test_sp_get_aa_window.py create mode 100644 tests/unit/data_handling_tests/test_sp_get_sliding_aa_window.py diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py index fa4c09f1..7fc33937 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/encode_integer.py @@ -11,7 +11,7 @@ # II Main Functions -def encode_integer(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"): +def encode_integer(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", 
gap="-", pad_at="C"): """ Integer-encode a list of protein sequences into a feature matrix, padding shorter sequences with gaps represented as zero vectors. diff --git a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py index 19c8f508..1e98c3d2 100644 --- a/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py +++ b/aaanalysis/data_handling/_backend/seq_preproc/encode_one_hot.py @@ -22,7 +22,7 @@ def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"): # II Main Functions -def encode_one_hot(list_seq=None, alphabet="ARNDCEQGHILKMFPSTWYV", gap="-", pad_at="C"): +def encode_one_hot(list_seq=None, alphabet="ACDEFGHIKLMNPQRSTVWY", gap="-", pad_at="C"): """ One-hot-encode a list of protein sequences into a feature matrix with padding shorter sequences with gaps represented as zero vectors. diff --git a/aaanalysis/data_handling/_seq_preproc.py b/aaanalysis/data_handling/_seq_preproc.py index ee2e0edf..7379de9b 100644 --- a/aaanalysis/data_handling/_seq_preproc.py +++ b/aaanalysis/data_handling/_seq_preproc.py @@ -119,13 +119,13 @@ def check_match_seq_slide_start_window_size(seq=None, slide_start=None, window_s # TODO e.g., seq_filter, comp_seq_sim, SHAP ... class SequencePreprocessor: """ - This class provides methods for preprocessing protein sequences, including encoding and window extraction. + Utility data preprocessing class to encode and represent protein sequences. """ # Sequence encoding @staticmethod def encode_one_hot(list_seq: Union[List[str], str] = None, - alphabet: str = "ARNDCEQGHILKMFPSTWYV", + alphabet: str = "ACDEFGHIKLMNPQRSTVWY", gap: str = "-", pad_at: Literal["C", "N"] = "C", ) -> Tuple[np.ndarray, List[str]]: @@ -140,11 +140,12 @@ def encode_one_hot(list_seq: Union[List[str], str] = None, Parameters ---------- list_seq : list of str or str - List of protein sequences to encode. 
- alphabet : str, default='ARNDCEQGHILKMFPSTWYV' + List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or + be represented by the ``gap``. + alphabet : str, default='ACDEFGHIKLMNPQRSTVWY' The alphabet of amino acids used for encoding. gap : str, default='-' - The character used to represent gaps in sequences. + The character used to represent gaps within sequences. It should not be included in the ``alphabet``. pad_at : str, default='C' Specifies where to add the padding: @@ -177,7 +178,7 @@ def encode_one_hot(list_seq: Union[List[str], str] = None, @staticmethod def encode_integer(list_seq: Union[List[str], str] = None, - alphabet: str = "ARNDCEQGHILKMFPSTWYV", + alphabet: str = "ACDEFGHIKLMNPQRSTVWY", gap: str = "-", pad_at: Literal["C", "N"] = "C", ) -> Tuple[np.ndarray, List[str]]: @@ -190,11 +191,12 @@ def encode_integer(list_seq: Union[List[str], str] = None, Parameters ---------- list_seq : list of str or str - List of protein sequences to encode. - alphabet : str, default='ARNDCEQGHILKMFPSTWYV' + List of protein sequences to encode. All characters in each sequence must be part of the ``alphabet`` or + be represented by the ``gap``. + alphabet : str, default='ACDEFGHIKLMNPQRSTVWY' The alphabet of amino acids used for encoding. gap : str, default='-' - The character used to represent gaps in sequences. + The character used to represent gaps within sequences. It should not be included in the ``alphabet``. pad_at : str, default='C' Specifies where to add the padding: @@ -226,8 +228,8 @@ def encode_integer(list_seq: Union[List[str], str] = None, return X, features @staticmethod - def get_aa_window(seq: str, - pos_start: int, + def get_aa_window(seq: str = None, + pos_start: int = 0, pos_stop: Optional[int] = None, window_size: Optional[int] = None, index1: bool = False, @@ -237,23 +239,22 @@ def get_aa_window(seq: str, """ Extracts a window of amino acids from a sequence. 
- This window starts from a given start position (``pos_start``, starting from 1) - and stops either at a defined stop position (``pos_stop``) or after a number of - residues defined by ``window_size``. + This window starts from a given start position (``pos_start``) and stops either at a defined + stop position (``pos_stop``) or after a number of residues defined by ``window_size``. Parameters ---------- seq : str The protein sequence from which to extract the window. - pos_start : int + pos_start : int, default=0 The starting position (>=0) of the window. pos_stop : int, optional - The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used. + The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used to determine it. window_size : int, optional - The size of the window (>=1) to extract. Only used if ``pos_end`` is ``None``. + The size of the window (>=1) to extract. Only used if ``pos_stop`` is ``None``. index1 : bool, default=False Whether position index starts at 1 (if ``True``) or 0 (if ``False``), - where first amino acid is at position 1 or 0, respectively. + where the first amino acid is at position 1 or 0, respectively. gap : str, default='-' The character used to represent gaps. accept_gap : bool, default=True @@ -264,6 +265,10 @@ def get_aa_window(seq: str, window : str The extracted window of amino acids. + Notes + ----- + * A ``ValueError`` is raised if both ``pos_stop`` and ``window_size`` are ``None`` or if both are provided. + Examples -------- .. include:: examples/sp_get_aa_window.rst @@ -298,7 +303,7 @@ def get_sliding_aa_window(seq: str = None, window_size: int = 5, index1: bool = False, gap: str = '-', - accept_gap: bool = False + accept_gap: bool = True ) -> List[str]: """ Extract sliding windows of amino acids from a sequence. @@ -325,6 +330,7 @@ def get_sliding_aa_window(seq: str = None, ------- list_windows : list of str A list of extracted windows of amino acids. 
+ Examples -------- .. include:: examples/sp_get_sliding_aa_window.rst diff --git a/aaanalysis/data_handling_pro/_comp_seq_sim.py b/aaanalysis/data_handling_pro/_comp_seq_sim.py index 050740ba..3ba8d162 100644 --- a/aaanalysis/data_handling_pro/_comp_seq_sim.py +++ b/aaanalysis/data_handling_pro/_comp_seq_sim.py @@ -15,8 +15,7 @@ def comp_seq_sim(seq1: Optional[str] = None, df_seq: Optional[pd.DataFrame] = None, ) -> Union[float, pd.DataFrame]: """ - Compute sequence similarity between two sequences or - pairwise sequence similarity between all sequences in a DataFrame. + Compute pairwise similarity between two or more sequences. The normalized sequence similarity score between two sequences is computed as a fraction of the alignment score to the length of the longest sequence. The alignment score is obtained using the :class:`Bio.Align.PairwiseAligner` diff --git a/examples/data_handling/sp_encode_integer.ipynb b/examples/data_handling/sp_encode_integer.ipynb new file mode 100644 index 00000000..e660c447 --- /dev/null +++ b/examples/data_handling/sp_encode_integer.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "You can integer encode protein sequences using the ``SequencePreprocessor().encode_integer()`` method. 
We first create an example sequence and the ``SequencePreprocessor()`` object as follows:" + ], + "metadata": { + "collapsed": false + }, + "id": "91e15230dad23b05" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import aaanalysis as aa\n", + "\n", + "seq = \"AACDEFGHII\"\n", + "sp = aa.SequencePreprocessor()" + ], + "metadata": { + "collapsed": false + }, + "id": "6529c65f51e1c14f" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/data_handling/sp_encode_one_hot.ipynb b/examples/data_handling/sp_encode_one_hot.ipynb new file mode 100644 index 00000000..4681b457 --- /dev/null +++ b/examples/data_handling/sp_encode_one_hot.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "You can one-hot encode protein sequences using the ``SequencePreprocessor().encode_one_hot()`` method. 
We first create an example sequence and the ``SequencePreprocessor()`` object as follows:" + ], + "metadata": { + "collapsed": false + }, + "id": "a0933f329a756b28" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import aaanalysis as aa\n", + "\n", + "seq = \"AACDEFGHII\"\n", + "sp = aa.SequencePreprocessor()" + ], + "metadata": { + "collapsed": false + }, + "id": "7a26ecf62120d4d3" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/data_handling/sp_get_aa_window.ipynb b/examples/data_handling/sp_get_aa_window.ipynb new file mode 100644 index 00000000..75a99171 --- /dev/null +++ b/examples/data_handling/sp_get_aa_window.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "You can obtain a defined amino acid window (a subsequence of defined length) from a protein sequence using the ``SequencePreprocessor().get_aa_window()`` method. 
We first create an example sequence and the ``SequencePreprocessor()`` object as follows:" + ], + "metadata": { + "collapsed": false + }, + "id": "9c145a5f9339adbb" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import aaanalysis as aa\n", + "\n", + "seq = \"AACDEFGHII\"\n", + "sp = aa.SequencePreprocessor()" + ], + "metadata": { + "collapsed": false + }, + "id": "72d98628b21cc579" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/data_handling/sp_get_sliding_aa_window.ipynb b/examples/data_handling/sp_get_sliding_aa_window.ipynb new file mode 100644 index 00000000..c9a30fbb --- /dev/null +++ b/examples/data_handling/sp_get_sliding_aa_window.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "You can obtain multiple defined amino acid windows (shifted by 1 residue position towards the C-terminus) from a protein sequence using the ``SequencePreprocessor().get_sliding_aa_window()`` method. 
We first create an example sequence and the ``SequencePreprocessor()`` object as follows:" + ], + "metadata": { + "collapsed": false + }, + "id": "a3d92c0e51155422" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import aaanalysis as aa\n", + "\n", + "seq = \"AACDEFGHII\"\n", + "sp = aa.SequencePreprocessor()" + ], + "metadata": { + "collapsed": false + }, + "id": "6cabbb7fb20d38c8" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/unit/data_handling_tests/test_sp_encode_integer.py b/tests/unit/data_handling_tests/test_sp_encode_integer.py new file mode 100644 index 00000000..b7dded62 --- /dev/null +++ b/tests/unit/data_handling_tests/test_sp_encode_integer.py @@ -0,0 +1,131 @@ +"""This is a script to test the encode_integer function.""" +import numpy as np +from hypothesis import given, settings, strategies as st +import pytest +from typing import Union, List, Tuple +import aaanalysis as aa + +# Set default deadline from 200 to 400 +settings.register_profile("ci", deadline=400) +settings.load_profile("ci") + + +# Normal Cases +class TestEncodeInteger: + """Test encode_integer function.""" + + @settings(max_examples=10, deadline=1000) + @given(list_seq=st.lists(st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=1, max_size=50), min_size=1, max_size=20)) + def test_list_seq_valid(self, list_seq): + """Test a valid 'list_seq' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_integer(list_seq=list_seq) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + 
def test_list_seq_invalid(self): + """Test an invalid 'list_seq' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_integer(list_seq=None) + with pytest.raises(ValueError): + sp.encode_integer(list_seq=[]) + with pytest.raises(ValueError): + sp.encode_integer(list_seq=dict()) + with pytest.raises(ValueError): + sp.encode_integer(list_seq=["INVALIDSEQUENCE"]) + + @settings(max_examples=10, deadline=1000) + @given(alphabet=st.text(min_size=1, alphabet="ACDEFGHIKLMNPQRSTVWY")) + def test_alphabet_valid(self, alphabet): + """Test a valid 'alphabet' parameter.""" + sp = aa.SequencePreprocessor() + valid_seq = "".join(np.random.choice(list(alphabet), size=10)) + result = sp.encode_integer(list_seq=[valid_seq], alphabet=alphabet) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_alphabet_invalid(self): + """Test an invalid 'alphabet' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_integer(alphabet=None) + with pytest.raises(ValueError): + sp.encode_integer(alphabet="") + with pytest.raises(ValueError): + sp.encode_integer(alphabet=123) + + @settings(max_examples=10, deadline=1000) + @given(gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY")) + def test_gap_valid(self, gap): + """Test a valid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_integer(list_seq=["ARND"], gap=gap) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_gap_invalid(self): + """Test an invalid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_integer(gap=None) + with pytest.raises(ValueError): + sp.encode_integer(gap="") + with pytest.raises(ValueError): + sp.encode_integer(gap="INVALID") + + @settings(max_examples=10, deadline=1000) + 
@given(pad_at=st.sampled_from(["N", "C"])) + def test_pad_at_valid(self, pad_at): + """Test a valid 'pad_at' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_integer(list_seq=["ARND"], pad_at=pad_at) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_pad_at_invalid(self): + """Test an invalid 'pad_at' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_integer(pad_at=None) + with pytest.raises(ValueError): + sp.encode_integer(pad_at="") + with pytest.raises(ValueError): + sp.encode_integer(pad_at="INVALID") + + +# Complex Cases +class TestEncodeIntegerComplex: + """Test encode_integer function for Complex Cases.""" + + @settings(max_examples=10, deadline=1000) + @given(list_seq=st.lists(st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=1), min_size=1), + alphabet=st.text(min_size=1, alphabet="ACDEFGHIKLMNPQRSTVWY").filter(lambda a: "-" not in a), + gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY"), + pad_at=st.sampled_from(["N", "C"])) + def test_valid_combination(self, list_seq, alphabet, gap, pad_at): + """Test valid combinations of parameters.""" + sp = aa.SequencePreprocessor() + # Filter list_seq to include only characters in the alphabet or the gap + list_seq = ["".join([char if char in alphabet else gap for char in seq]) for seq in list_seq] + result = sp.encode_integer(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + @settings(max_examples=10, deadline=1000) + @given( + list_seq=st.none(), + alphabet=st.text(min_size=0), + gap=st.text(min_size=0, max_size=2), + pad_at=st.text(min_size=0) + ) + def test_invalid_combination(self, list_seq, alphabet, gap, pad_at): + """Test invalid combinations of parameters.""" + sp = aa.SequencePreprocessor() + with 
pytest.raises(ValueError): + sp.encode_integer(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) diff --git a/tests/unit/data_handling_tests/test_sp_encode_one_hot.py b/tests/unit/data_handling_tests/test_sp_encode_one_hot.py new file mode 100644 index 00000000..316fda37 --- /dev/null +++ b/tests/unit/data_handling_tests/test_sp_encode_one_hot.py @@ -0,0 +1,130 @@ +"""This is a script to test the encode_one_hot function.""" +import numpy as np +from hypothesis import given, settings, strategies as st +import pytest +import aaanalysis as aa + +# Set default deadline from 200 to 400 +settings.register_profile("ci", deadline=400) +settings.load_profile("ci") + + +# Normal Cases +class TestEncodeOneHot: + """Test encode_one_hot function.""" + + @settings(max_examples=10, deadline=1000) + @given(list_seq=st.lists(st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=1, max_size=50), min_size=1, max_size=20)) + def test_list_seq_valid(self, list_seq): + """Test a valid 'list_seq' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_one_hot(list_seq=list_seq) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_list_seq_invalid(self): + """Test an invalid 'list_seq' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_one_hot(list_seq=None) + with pytest.raises(ValueError): + sp.encode_one_hot(list_seq=[]) + with pytest.raises(ValueError): + sp.encode_one_hot(list_seq=dict()) + with pytest.raises(ValueError): + sp.encode_one_hot(list_seq=["INVALIDSEQUENCE"]) + + @settings(max_examples=10, deadline=1000) + @given(alphabet=st.text(min_size=1, alphabet="ACDEFGHIKLMNPQRSTVWY")) + def test_alphabet_valid(self, alphabet): + """Test a valid 'alphabet' parameter.""" + sp = aa.SequencePreprocessor() + valid_seq = "".join(np.random.choice(list(alphabet), size=10)) + result = sp.encode_one_hot(list_seq=[valid_seq], alphabet=alphabet) + assert 
isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_alphabet_invalid(self): + """Test an invalid 'alphabet' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_one_hot(alphabet=None) + with pytest.raises(ValueError): + sp.encode_one_hot(alphabet="") + with pytest.raises(ValueError): + sp.encode_one_hot(alphabet=123) + + @settings(max_examples=10, deadline=1000) + @given(gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY")) + def test_gap_valid(self, gap): + """Test a valid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_one_hot(list_seq=["ARND"], gap=gap) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_gap_invalid(self): + """Test an invalid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_one_hot(gap=None) + with pytest.raises(ValueError): + sp.encode_one_hot(gap="") + with pytest.raises(ValueError): + sp.encode_one_hot(gap="INVALID") + + @settings(max_examples=10, deadline=1000) + @given(pad_at=st.sampled_from(["N", "C"])) + def test_pad_at_valid(self, pad_at): + """Test a valid 'pad_at' parameter.""" + sp = aa.SequencePreprocessor() + result = sp.encode_one_hot(list_seq=["ARND"], pad_at=pad_at) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + def test_pad_at_invalid(self): + """Test an invalid 'pad_at' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_one_hot(pad_at=None) + with pytest.raises(ValueError): + sp.encode_one_hot(pad_at="") + with pytest.raises(ValueError): + sp.encode_one_hot(pad_at="INVALID") + + +# Complex Cases +class TestEncodeOneHotComplex: + """Test encode_one_hot function for Complex Cases.""" + + @settings(max_examples=10, deadline=1000) + 
@given(list_seq=st.lists(st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=1), min_size=1), + alphabet=st.text(min_size=1, alphabet="ACDEFGHIKLMNPQRSTVWY").filter(lambda a: "-" not in a), + gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY"), + pad_at=st.sampled_from(["N", "C"])) + def test_valid_combination(self, list_seq, alphabet, gap, pad_at): + """Test valid combinations of parameters.""" + sp = aa.SequencePreprocessor() + # Filter list_seq to include only characters in the alphabet or the gap + list_seq = ["".join([char if char in alphabet else gap for char in seq]) for seq in list_seq] + result = sp.encode_one_hot(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) + assert isinstance(result, tuple) + assert isinstance(result[0], np.ndarray) + assert isinstance(result[1], list) + + @settings(max_examples=10, deadline=1000) + @given( + list_seq=st.none(), + alphabet=st.text(min_size=0), + gap=st.text(min_size=0, max_size=2), + pad_at=st.text(min_size=0) + ) + def test_invalid_combination(self, list_seq, alphabet, gap, pad_at): + """Test invalid combinations of parameters.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.encode_one_hot(list_seq=list_seq, alphabet=alphabet, gap=gap, pad_at=pad_at) diff --git a/tests/unit/data_handling_tests/test_sp_get_aa_window.py b/tests/unit/data_handling_tests/test_sp_get_aa_window.py new file mode 100644 index 00000000..9496fa5a --- /dev/null +++ b/tests/unit/data_handling_tests/test_sp_get_aa_window.py @@ -0,0 +1,166 @@ +"""This is a script to test the get_aa_window function.""" +from typing import Optional +from hypothesis import given, settings, strategies as st +import pytest +import aaanalysis as aa + +# Set default deadline from 200 to 400 +settings.register_profile("ci", deadline=400) +settings.load_profile("ci") + +# Normal Cases +class TestGetAAWindow: + """Test get_aa_window function.""" + + @settings(max_examples=10, deadline=1000) + 
@given(seq=st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=10, max_size=100)) + def test_seq_valid(self, seq): + """Test a valid 'seq' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq=seq, pos_start=0, pos_stop=5) + assert isinstance(window, str) + + def test_seq_invalid(self): + """Test an invalid 'seq' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq=None, pos_start=0, pos_stop=5) + with pytest.raises(ValueError): + sp.get_aa_window(seq="", pos_start=0, pos_stop=5, accept_gap=False) + + @settings(max_examples=10, deadline=1000) + @given(pos_start=st.integers(min_value=0, max_value=50)) + def test_pos_start_valid(self, pos_start): + """Test a valid 'pos_start' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=pos_start, pos_stop=pos_start + 5) + assert isinstance(window, str) + + def test_pos_start_invalid(self): + """Test an invalid 'pos_start' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=-1, pos_stop=5) + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=100, pos_stop=5) + + @settings(max_examples=10, deadline=1000) + @given(pos_stop=st.integers(min_value=0, max_value=50)) + def test_pos_stop_valid(self, pos_stop): + """Test a valid 'pos_stop' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=pos_stop) + assert isinstance(window, str) + + def test_pos_stop_invalid(self): + """Test an invalid 'pos_stop' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=-1) + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=100, accept_gap=False) + + @settings(max_examples=10, 
deadline=1000) + @given(window_size=st.integers(min_value=1, max_value=50)) + def test_window_size_valid(self, window_size): + """Test a valid 'window_size' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, window_size=window_size) + assert isinstance(window, str) + + def test_window_size_invalid(self): + """Test an invalid 'window_size' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, window_size=0) + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, window_size=-5) + + @settings(max_examples=10, deadline=1000) + @given(index1=st.booleans()) + def test_index1_valid(self, index1): + """Test a valid 'index1' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=1 if index1 else 0, pos_stop=5, index1=index1) + assert isinstance(window, str) + + def test_index1_invalid(self): + """Test an invalid 'index1' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=1, pos_stop=5, index1=None) + + @settings(max_examples=10, deadline=1000) + @given(gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY")) + def test_gap_valid(self, gap): + """Test a valid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=5, gap=gap) + assert isinstance(window, str) + + def test_gap_invalid(self): + """Test an invalid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=5, gap="") + + @settings(max_examples=10, deadline=1000) + @given(accept_gap=st.booleans()) + def test_accept_gap_valid(self, accept_gap): + """Test a valid 'accept_gap' 
parameter.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=5, accept_gap=accept_gap) + assert isinstance(window, str) + + def test_accept_gap_invalid(self): + """Test an invalid 'accept_gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", pos_start=0, pos_stop=5, accept_gap=None) + + +# Complex Cases +class TestGetAAWindowComplex: + """Test get_aa_window function for Complex Cases.""" + + @settings(max_examples=10, deadline=1000) + @given( + seq=st.text(alphabet="ACDEFGHIKLMNPQRSTVWY", min_size=10, max_size=100), + pos_start=st.integers(min_value=0, max_value=50), + window_size=st.integers(min_value=1, max_value=50), + index1=st.booleans(), + gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY"), + ) + def test_valid_combination(self, seq, pos_start, window_size, index1, gap): + """Test valid combinations of parameters.""" + sp = aa.SequencePreprocessor() + window = sp.get_aa_window(seq=seq, + pos_start=pos_start, + window_size=window_size, + index1=index1, + gap=gap) + assert isinstance(window, str) + + @settings(max_examples=10, deadline=1000) + @given( + seq=st.none(), + pos_start=st.integers(min_value=-10, max_value=-1), + pos_stop=st.integers(min_value=-10, max_value=-1), + window_size=st.integers(min_value=-10, max_value=0), + index1=st.none(), + gap=st.text(min_size=0, max_size=2), + accept_gap=st.none() + ) + def test_invalid_combination(self, seq, pos_start, pos_stop, window_size, index1, gap, accept_gap): + """Test invalid combinations of parameters.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_aa_window( + seq=seq, + pos_start=pos_start, + pos_stop=pos_stop, + window_size=window_size, + index1=index1, + gap=gap, + accept_gap=accept_gap + ) diff --git a/tests/unit/data_handling_tests/test_sp_get_sliding_aa_window.py 
b/tests/unit/data_handling_tests/test_sp_get_sliding_aa_window.py new file mode 100644 index 00000000..3253fb99 --- /dev/null +++ b/tests/unit/data_handling_tests/test_sp_get_sliding_aa_window.py @@ -0,0 +1,178 @@ +"""This is a script to test the get_sliding_aa_window function.""" +from typing import Optional +from hypothesis import given, settings, strategies as st +import pytest +import aaanalysis as aa + +# Set default deadline from 200 to 400 +settings.register_profile("ci", deadline=400) +settings.load_profile("ci") + + +# Normal Cases +class TestGetSlidingAAWindow: + """Test get_sliding_aa_window function.""" + + @settings(max_examples=10, deadline=1000) + @given(seq=st.text(alphabet="ACDEFGHIKLMNPQRSTVWY-", min_size=10, max_size=100)) + def test_seq_valid(self, seq): + """Test a valid 'seq' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq=seq) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_seq_invalid(self): + """Test an invalid 'seq' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq=None) + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="", accept_gap=False) + + @settings(max_examples=10, deadline=1000) + @given(slide_start=st.integers(min_value=0, max_value=50)) + def test_slide_start_valid(self, slide_start): + """Test a valid 'slide_start' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", slide_start=slide_start) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_slide_start_invalid(self): + """Test an invalid 'slide_start' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", slide_start=-1) + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", 
slide_start=100, accept_gap=False) + + @settings(max_examples=10, deadline=1000) + @given(slide_stop=st.integers(min_value=1, max_value=50)) + def test_slide_stop_valid(self, slide_stop): + """Test a valid 'slide_stop' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", slide_stop=slide_stop) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_slide_stop_invalid(self): + """Test an invalid 'slide_stop' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", slide_stop=0) + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", slide_stop=100, accept_gap=False) + + @settings(max_examples=10, deadline=1000) + @given(window_size=st.integers(min_value=1, max_value=50)) + def test_window_size_valid(self, window_size): + """Test a valid 'window_size' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", window_size=window_size) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_window_size_invalid(self): + """Test an invalid 'window_size' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", window_size=0) + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", window_size=-5) + + @settings(max_examples=10, deadline=1000) + @given(index1=st.booleans()) + def test_index1_valid(self, index1): + """Test a valid 'index1' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", index1=index1) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_index1_invalid(self): + """Test an invalid 'index1' parameter.""" + sp = 
aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", index1=None) + + @settings(max_examples=10, deadline=1000) + @given(gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY")) + def test_gap_valid(self, gap): + """Test a valid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", gap=gap) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_gap_invalid(self): + """Test an invalid 'gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", gap="") + + @settings(max_examples=10, deadline=1000) + @given(accept_gap=st.booleans()) + def test_accept_gap_valid(self, accept_gap): + """Test a valid 'accept_gap' parameter.""" + sp = aa.SequencePreprocessor() + windows = sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", accept_gap=accept_gap) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + def test_accept_gap_invalid(self): + """Test an invalid 'accept_gap' parameter.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window(seq="ACDEFGHIKLMNPQRSTVWY", accept_gap=None) + + +# Complex Cases +class TestGetSlidingAAWindowComplex: + """Test get_sliding_aa_window function for Complex Cases.""" + + @settings(max_examples=10, deadline=1000) + @given( + seq=st.text(alphabet="ACDEFGHIKLMNPQRSTVWY", min_size=10, max_size=100), + slide_start=st.integers(min_value=0, max_value=50), + slide_stop=st.integers(min_value=1, max_value=50), + window_size=st.integers(min_value=1, max_value=50), + index1=st.booleans(), + gap=st.text(min_size=1, max_size=1).filter(lambda g: g not in "ACDEFGHIKLMNPQRSTVWY"), + ) + def test_valid_combination(self, seq, slide_start, slide_stop, window_size, index1, gap): + """Test valid 
combinations of parameters.""" + sp = aa.SequencePreprocessor() + if slide_start < slide_stop: + windows = sp.get_sliding_aa_window(seq=seq, + slide_start=slide_start, + slide_stop=slide_stop, + window_size=window_size, + index1=index1, + gap=gap) + assert isinstance(windows, list) + assert all(isinstance(window, str) for window in windows) + + @settings(max_examples=10, deadline=1000) + @given( + seq=st.none(), + slide_start=st.integers(min_value=-10, max_value=-1), + slide_stop=st.integers(min_value=-10, max_value=-1), + window_size=st.integers(min_value=-10, max_value=0), + index1=st.none(), + gap=st.text(min_size=0, max_size=2), + accept_gap=st.none() + ) + def test_invalid_combination(self, seq, slide_start, slide_stop, window_size, index1, gap, accept_gap): + """Test invalid combinations of parameters.""" + sp = aa.SequencePreprocessor() + with pytest.raises(ValueError): + sp.get_sliding_aa_window( + seq=seq, + slide_start=slide_start, + slide_stop=slide_stop, + window_size=window_size, + index1=index1, + gap=gap, + accept_gap=accept_gap + )