Skip to content

Commit

Permalink
Finish SequencePreprocessor.encode_integer() and update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Jun 28, 2024
1 parent f2da477 commit 8d38d41
Show file tree
Hide file tree
Showing 3 changed files with 244 additions and 7 deletions.
4 changes: 2 additions & 2 deletions aaanalysis/data_handling/_seq_preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def encode_integer(list_seq: Union[List[str], str] = None,
def get_aa_window(seq: str = None,
pos_start: int = 0,
pos_stop: Optional[int] = None,
window_size: Optional[int] = 5,
window_size: Optional[int] = None,
index1: bool = False,
gap: str = '-',
accept_gap: bool = True,
Expand All @@ -250,7 +250,7 @@ def get_aa_window(seq: str = None,
The starting position (>=0) of the window.
pos_stop : int, optional
The ending position (>=``pos_start``) of the window. If ``None``, ``window_size`` is used to determine it.
window_size : int, default=5
window_size : int, optional
The size of the window (>=1) to extract. Only used if ``pos_stop`` is ``None``.
index1 : bool, default=False
Whether position index starts at 1 (if ``True``) or 0 (if ``False``),
Expand Down
243 changes: 240 additions & 3 deletions examples/data_handling/sp_encode_integer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,255 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"outputs": [],
"source": [
"import aaanalysis as aa\n",
"import pandas as pd\n",
"\n",
"seq = \"AACDEFGHII\"\n",
"list_seq = [\"AACDEFGHIY\", \"IIHGFECDAY\"]\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
"collapsed": false
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:38.975453869Z",
"start_time": "2024-06-28T20:36:36.239449900Z"
}
},
"id": "6529c65f51e1c14f"
},
{
"cell_type": "markdown",
"source": [
"Provide the sequences as ``list_seq`` parameter to obtain a feature matrix (``X``) and the respective ``features``, which are the integer representations of the amino acids at the given residue positions: "
],
"metadata": {
"collapsed": false
},
"id": "1163b1302190f5a5"
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame shape: (2, 10)\n"
]
},
{
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<style type=\"text/css\">\n#T_6d5a5 thead th {\n background-color: white;\n color: black;\n}\n#T_6d5a5 tbody tr:nth-child(odd) {\n background-color: #f2f2f2;\n}\n#T_6d5a5 tbody tr:nth-child(even) {\n background-color: white;\n}\n#T_6d5a5 th {\n padding: 5px;\n white-space: nowrap;\n}\n#T_6d5a5 td {\n padding: 5px;\n white-space: nowrap;\n}\n</style>\n<table id=\"T_6d5a5\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n <thead>\n <tr>\n <th class=\"blank level0\" >&nbsp;</th>\n <th id=\"T_6d5a5_level0_col0\" class=\"col_heading level0 col0\" >P1</th>\n <th id=\"T_6d5a5_level0_col1\" class=\"col_heading level0 col1\" >P2</th>\n <th id=\"T_6d5a5_level0_col2\" class=\"col_heading level0 col2\" >P3</th>\n <th id=\"T_6d5a5_level0_col3\" class=\"col_heading level0 col3\" >P4</th>\n <th id=\"T_6d5a5_level0_col4\" class=\"col_heading level0 col4\" >P5</th>\n <th id=\"T_6d5a5_level0_col5\" class=\"col_heading level0 col5\" >P6</th>\n <th id=\"T_6d5a5_level0_col6\" class=\"col_heading level0 col6\" >P7</th>\n <th id=\"T_6d5a5_level0_col7\" class=\"col_heading level0 col7\" >P8</th>\n <th id=\"T_6d5a5_level0_col8\" class=\"col_heading level0 col8\" >P9</th>\n <th id=\"T_6d5a5_level0_col9\" class=\"col_heading level0 col9\" >P10</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th id=\"T_6d5a5_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n <td id=\"T_6d5a5_row0_col0\" class=\"data row0 col0\" >1</td>\n <td id=\"T_6d5a5_row0_col1\" class=\"data row0 col1\" >1</td>\n <td id=\"T_6d5a5_row0_col2\" class=\"data row0 col2\" >2</td>\n <td id=\"T_6d5a5_row0_col3\" class=\"data row0 col3\" >3</td>\n <td id=\"T_6d5a5_row0_col4\" class=\"data row0 col4\" >4</td>\n <td id=\"T_6d5a5_row0_col5\" class=\"data row0 col5\" >5</td>\n <td id=\"T_6d5a5_row0_col6\" class=\"data row0 col6\" >6</td>\n <td id=\"T_6d5a5_row0_col7\" class=\"data row0 col7\" >7</td>\n <td id=\"T_6d5a5_row0_col8\" class=\"data row0 col8\" >8</td>\n <td 
id=\"T_6d5a5_row0_col9\" class=\"data row0 col9\" >20</td>\n </tr>\n <tr>\n <th id=\"T_6d5a5_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n <td id=\"T_6d5a5_row1_col0\" class=\"data row1 col0\" >8</td>\n <td id=\"T_6d5a5_row1_col1\" class=\"data row1 col1\" >8</td>\n <td id=\"T_6d5a5_row1_col2\" class=\"data row1 col2\" >7</td>\n <td id=\"T_6d5a5_row1_col3\" class=\"data row1 col3\" >6</td>\n <td id=\"T_6d5a5_row1_col4\" class=\"data row1 col4\" >5</td>\n <td id=\"T_6d5a5_row1_col5\" class=\"data row1 col5\" >4</td>\n <td id=\"T_6d5a5_row1_col6\" class=\"data row1 col6\" >2</td>\n <td id=\"T_6d5a5_row1_col7\" class=\"data row1 col7\" >3</td>\n <td id=\"T_6d5a5_row1_col8\" class=\"data row1 col8\" >1</td>\n <td id=\"T_6d5a5_row1_col9\" class=\"data row1 col9\" >20</td>\n </tr>\n </tbody>\n</table>\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"X, features = sp.encode_integer(list_seq=list_seq)\n",
"\n",
"# Convert to DataFrame for visualization\n",
"df_encode = pd.DataFrame(X, columns=features)\n",
"aa.display_df(df=df_encode, show_shape=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:39.094402731Z",
"start_time": "2024-06-28T20:36:38.987669931Z"
}
},
"id": "ae99247d4a174b85"
},
{
"cell_type": "markdown",
"source": [
"You can adjust the used ``alphabet`` to change the considered characters:"
],
"metadata": {
"collapsed": false
},
"id": "bd44a3454980db67"
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame shape: (2, 3)\n"
]
},
{
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<style type=\"text/css\">\n#T_9581a thead th {\n background-color: white;\n color: black;\n}\n#T_9581a tbody tr:nth-child(odd) {\n background-color: #f2f2f2;\n}\n#T_9581a tbody tr:nth-child(even) {\n background-color: white;\n}\n#T_9581a th {\n padding: 5px;\n white-space: nowrap;\n}\n#T_9581a td {\n padding: 5px;\n white-space: nowrap;\n}\n</style>\n<table id=\"T_9581a\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n <thead>\n <tr>\n <th class=\"blank level0\" >&nbsp;</th>\n <th id=\"T_9581a_level0_col0\" class=\"col_heading level0 col0\" >P1</th>\n <th id=\"T_9581a_level0_col1\" class=\"col_heading level0 col1\" >P2</th>\n <th id=\"T_9581a_level0_col2\" class=\"col_heading level0 col2\" >P3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th id=\"T_9581a_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n <td id=\"T_9581a_row0_col0\" class=\"data row0 col0\" >1</td>\n <td id=\"T_9581a_row0_col1\" class=\"data row0 col1\" >2</td>\n <td id=\"T_9581a_row0_col2\" class=\"data row0 col2\" >3</td>\n </tr>\n <tr>\n <th id=\"T_9581a_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n <td id=\"T_9581a_row1_col0\" class=\"data row1 col0\" >3</td>\n <td id=\"T_9581a_row1_col1\" class=\"data row1 col1\" >2</td>\n <td id=\"T_9581a_row1_col2\" class=\"data row1 col2\" >1</td>\n </tr>\n </tbody>\n</table>\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show integer encoding with smaller alphabet\n",
"list_seq = [\"ABC\", \"CBA\"]\n",
"ALPHABET = \"ABC\"\n",
"X, features = sp.encode_integer(list_seq=list_seq, alphabet=ALPHABET)\n",
"\n",
"# Convert to DataFrame for visualization\n",
"df_encode = pd.DataFrame(X, columns=features)\n",
"aa.display_df(df=df_encode, show_shape=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:39.149782676Z",
"start_time": "2024-06-28T20:36:39.095604192Z"
}
},
"id": "280dd1afa9267756"
},
{
"cell_type": "markdown",
"source": [
"Change the ``gap`` symbol (default=``-``) as follows:"
],
"metadata": {
"collapsed": false
},
"id": "f53bf29b98c3fd9e"
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame shape: (2, 3)\n"
]
},
{
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<style type=\"text/css\">\n#T_8e4b3 thead th {\n background-color: white;\n color: black;\n}\n#T_8e4b3 tbody tr:nth-child(odd) {\n background-color: #f2f2f2;\n}\n#T_8e4b3 tbody tr:nth-child(even) {\n background-color: white;\n}\n#T_8e4b3 th {\n padding: 5px;\n white-space: nowrap;\n}\n#T_8e4b3 td {\n padding: 5px;\n white-space: nowrap;\n}\n</style>\n<table id=\"T_8e4b3\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n <thead>\n <tr>\n <th class=\"blank level0\" >&nbsp;</th>\n <th id=\"T_8e4b3_level0_col0\" class=\"col_heading level0 col0\" >P1</th>\n <th id=\"T_8e4b3_level0_col1\" class=\"col_heading level0 col1\" >P2</th>\n <th id=\"T_8e4b3_level0_col2\" class=\"col_heading level0 col2\" >P3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th id=\"T_8e4b3_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n <td id=\"T_8e4b3_row0_col0\" class=\"data row0 col0\" >1</td>\n <td id=\"T_8e4b3_row0_col1\" class=\"data row0 col1\" >2</td>\n <td id=\"T_8e4b3_row0_col2\" class=\"data row0 col2\" >3</td>\n </tr>\n <tr>\n <th id=\"T_8e4b3_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n <td id=\"T_8e4b3_row1_col0\" class=\"data row1 col0\" >3</td>\n <td id=\"T_8e4b3_row1_col1\" class=\"data row1 col1\" >2</td>\n <td id=\"T_8e4b3_row1_col2\" class=\"data row1 col2\" >0</td>\n </tr>\n </tbody>\n</table>\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show integer encoding with other gap ('*')\n",
"list_seq = [\"ABC\", \"CB*\"]\n",
"ALPHABET = \"ABC\"\n",
"X, features = sp.encode_integer(list_seq=list_seq, alphabet=ALPHABET, gap=\"*\")\n",
"\n",
"# Convert to DataFrame for visualization\n",
"df_encode = pd.DataFrame(X, columns=features)\n",
"aa.display_df(df=df_encode, show_shape=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:39.172622592Z",
"start_time": "2024-06-28T20:36:39.111814958Z"
}
},
"id": "91341e3430083a63"
},
{
"cell_type": "markdown",
"source": [
"If one sequence is shorter than the other, gaps will be added either at the N-terminus or at the C-terminus (default), which is called padding. Adjust the padding using the ``pad_at`` (``N`` or ``C``) parameter:"
],
"metadata": {
"collapsed": false
},
"id": "4691599de5bd9c99"
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame shape: (2, 3)\n"
]
},
{
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<style type=\"text/css\">\n#T_e676c thead th {\n background-color: white;\n color: black;\n}\n#T_e676c tbody tr:nth-child(odd) {\n background-color: #f2f2f2;\n}\n#T_e676c tbody tr:nth-child(even) {\n background-color: white;\n}\n#T_e676c th {\n padding: 5px;\n white-space: nowrap;\n}\n#T_e676c td {\n padding: 5px;\n white-space: nowrap;\n}\n</style>\n<table id=\"T_e676c\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n <thead>\n <tr>\n <th class=\"blank level0\" >&nbsp;</th>\n <th id=\"T_e676c_level0_col0\" class=\"col_heading level0 col0\" >P1</th>\n <th id=\"T_e676c_level0_col1\" class=\"col_heading level0 col1\" >P2</th>\n <th id=\"T_e676c_level0_col2\" class=\"col_heading level0 col2\" >P3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th id=\"T_e676c_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n <td id=\"T_e676c_row0_col0\" class=\"data row0 col0\" >1</td>\n <td id=\"T_e676c_row0_col1\" class=\"data row0 col1\" >2</td>\n <td id=\"T_e676c_row0_col2\" class=\"data row0 col2\" >3</td>\n </tr>\n <tr>\n <th id=\"T_e676c_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n <td id=\"T_e676c_row1_col0\" class=\"data row1 col0\" >2</td>\n <td id=\"T_e676c_row1_col1\" class=\"data row1 col1\" >0</td>\n <td id=\"T_e676c_row1_col2\" class=\"data row1 col2\" >0</td>\n </tr>\n </tbody>\n</table>\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show default padding (at C-terminus)\n",
"list_seq = [\"ABC\", \"B\"]\n",
"ALPHABET = \"ABC\"\n",
"X, features = sp.encode_integer(list_seq=list_seq, alphabet=ALPHABET)\n",
"\n",
"# Convert to DataFrame for visualization\n",
"df_encode = pd.DataFrame(X, columns=features)\n",
"aa.display_df(df=df_encode, show_shape=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:39.174009891Z",
"start_time": "2024-06-28T20:36:39.156212641Z"
}
},
"id": "48ff3a51801c9d62"
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataFrame shape: (2, 3)\n"
]
},
{
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<style type=\"text/css\">\n#T_449d7 thead th {\n background-color: white;\n color: black;\n}\n#T_449d7 tbody tr:nth-child(odd) {\n background-color: #f2f2f2;\n}\n#T_449d7 tbody tr:nth-child(even) {\n background-color: white;\n}\n#T_449d7 th {\n padding: 5px;\n white-space: nowrap;\n}\n#T_449d7 td {\n padding: 5px;\n white-space: nowrap;\n}\n</style>\n<table id=\"T_449d7\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n <thead>\n <tr>\n <th class=\"blank level0\" >&nbsp;</th>\n <th id=\"T_449d7_level0_col0\" class=\"col_heading level0 col0\" >P1</th>\n <th id=\"T_449d7_level0_col1\" class=\"col_heading level0 col1\" >P2</th>\n <th id=\"T_449d7_level0_col2\" class=\"col_heading level0 col2\" >P3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th id=\"T_449d7_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n <td id=\"T_449d7_row0_col0\" class=\"data row0 col0\" >1</td>\n <td id=\"T_449d7_row0_col1\" class=\"data row0 col1\" >2</td>\n <td id=\"T_449d7_row0_col2\" class=\"data row0 col2\" >3</td>\n </tr>\n <tr>\n <th id=\"T_449d7_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n <td id=\"T_449d7_row1_col0\" class=\"data row1 col0\" >0</td>\n <td id=\"T_449d7_row1_col1\" class=\"data row1 col1\" >0</td>\n <td id=\"T_449d7_row1_col2\" class=\"data row1 col2\" >2</td>\n </tr>\n </tbody>\n</table>\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show N-terminal padding\n",
"list_seq = [\"ABC\", \"B\"]\n",
"ALPHABET = \"ABC\"\n",
"X, features = sp.encode_integer(list_seq=list_seq, alphabet=ALPHABET, pad_at=\"N\")\n",
"\n",
"# Convert to DataFrame for visualization\n",
"df_encode = pd.DataFrame(X, columns=features)\n",
"aa.display_df(df=df_encode, show_shape=True)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-28T20:36:39.616720327Z",
"start_time": "2024-06-28T20:36:39.477176504Z"
}
},
"id": "43798b8ac9b7e61c"
}
],
"metadata": {
Expand Down
4 changes: 2 additions & 2 deletions examples/data_handling/sp_encode_one_hot.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"import aaanalysis as aa\n",
"import pandas as pd\n",
"\n",
"list_seq = [\"AACDEFGHII\", \"IIHGFECDAA\"]\n",
"list_seq = [\"AACDEFGHIY\", \"IIHGFECDAY\"]\n",
"sp = aa.SequencePreprocessor()"
],
"metadata": {
Expand All @@ -33,7 +33,7 @@
{
"cell_type": "markdown",
"source": [
"Provide the sequence as ``seq`` parameter to obtain a feature matrix (``X``) and the respective ``features``, which are binary representation of amino acid at given residue positions: "
"Provide the sequence as ``seq`` parameter to obtain a feature matrix (``X``) and the respective ``features``, which are binary representation of each amino acid at given residue positions: "
],
"metadata": {
"collapsed": false
Expand Down

0 comments on commit 8d38d41

Please sign in to comment.