diff --git a/aaanalysis/data_handling_pro/_backend/cd_hit.py b/aaanalysis/data_handling_pro/_backend/cd_hit.py index 380cca63..af30aa83 100644 --- a/aaanalysis/data_handling_pro/_backend/cd_hit.py +++ b/aaanalysis/data_handling_pro/_backend/cd_hit.py @@ -1,4 +1,4 @@ -"""This is a script for the backend of the cd-hit method for the filtering_seq function.""" +"""This is a script for the backend of the CH-HIT method for the filter_seq() function.""" import pandas as pd import os @@ -64,7 +64,7 @@ def run_cd_hit(df_seq=None, file_in = os.path.join(temp_dir, f"_{result_prefix}_in") save_entries_to_fasta(df_seq=df_seq, file_path=file_in) file_out = os.path.join(temp_dir, f"_{result_prefix}_out") - # Create CD-hit command + # Create CD-HIT command if word_size is None: word_size = _select_word_size(st=similarity_threshold) cmd = ["cd-hit", "-i", file_in, @@ -87,9 +87,9 @@ def run_cd_hit(df_seq=None, if sort_clusters: cmd.extend(["-sc", "1"]) - # Run CD-Hit command + # Run CD-HIT command if verbose: - ut.print_out("Run CD-Hit filtering") + ut.print_out("Run CD-HIT filtering") run_command(cmd=cmd, verbose=verbose, temp_dir=temp_dir) # Convert CD-Hit output to clustering DataFrame @@ -98,6 +98,3 @@ def run_cd_hit(df_seq=None, # Remove temporary file remove_temp(path=temp_dir) return df_clust - - - diff --git a/aaanalysis/data_handling_pro/_backend/mmseq2.py b/aaanalysis/data_handling_pro/_backend/mmseq2.py index 38f2ac2e..0adbfffe 100644 --- a/aaanalysis/data_handling_pro/_backend/mmseq2.py +++ b/aaanalysis/data_handling_pro/_backend/mmseq2.py @@ -1,4 +1,4 @@ -"""This is a script for the backend of the MMseqs2 method for the filtering_seq function.""" +"""This is a script for the backend of the MMseqs2 method for the filter_seq() function.""" import os import pandas as pd diff --git a/aaanalysis/data_handling_pro/_filter_seq.py b/aaanalysis/data_handling_pro/_filter_seq.py index 198bf5ec..087f63c1 100644 --- a/aaanalysis/data_handling_pro/_filter_seq.py +++ b/aaanalysis/data_handling_pro/_filter_seq.py @@ -1,5 +1,5 @@ """ -This is a script for a wrapper function called filter_seq that provides an Python interface to the +This is a script for a wrapper function called filter_seq that provides a Python interface to the redundancy-reduction algorithms CD-Hit and MMseqs2. """ from typing import Optional, List, Literal @@ -39,7 +39,7 @@ def filter_seq(df_seq: pd.DataFrame = None, verbose: bool = False ) -> pd.DataFrame: """ - UNDER CONSTRUCTION: Redundancy reduction of sequences using clustering-based algorithms. + Redundancy reduction of sequences using clustering-based algorithms. This functions performs redundancy reduction of sequences by clustering and selecting representative sequences using the CD-HIT ([Li06]_) or MMseqs2 ([Steinegger17]_) algorithms locally. It allows for adjustable filtering strictness: @@ -47,7 +47,7 @@ def filter_seq(df_seq: pd.DataFrame = None, * Strict filtering results in smaller, more homogeneous clusters, suitable when high sequence similarity is required. * Non-strict filtering creates larger, more diverse clusters, enhancing sequence representation. - CD-Hit and MMseq2 are standalone software tools, each requiring separate installation. CD-Hit is more + CD-HIT and MMseq2 are standalone software tools, each requiring separate installation. CD-Hit is more resource-efficient and easier to install, while MMseq2 is a larger multi-purpose tool. Pairwise sequence similarities for the MMseq2 clustering results were computed using the Biopython :class:`Bio.Align.PairwiseAligner` class. @@ -64,7 +64,7 @@ def filter_seq(df_seq: pd.DataFrame = None, similarity_threshold : float, default=0.9 Defines the minimum sequence identity [0.4-1.0] for clustering. Higher values increase strictness. word_size : int, optional - The size of the 'word' (in CD-Hit, [2-5]) or 'k-mer' (in MMseqs, [5-7]) used for the initial screening step in clustering. + The size of the 'word' (in CD-HIT, [2-5]) or 'k-mer' (in MMseqs, [5-7]) used for the initial screening step in clustering. Effect on strictness is dataset-dependent. If ``None``, optimized based on ``similarity_threshold`` (CD-Hit). global_identity : bool, default=True Whether to use global (True) or local (False) sequence identity for 'cd-hit' clustering. Global is stricter. @@ -117,7 +117,8 @@ def filter_seq(df_seq: pd.DataFrame = None, Warnings -------- - * CD-Hit and MMseq2 must be installed separately. + * CD-HIT and MMseq2 must be installed separately. + * CD-HIT is not available for Windows. * This function requires `biopython`, which is automatically installed via `pip install aaanalysis[pro]`. Examples diff --git a/examples/data_handling/filter_seq.ipynb b/examples/data_handling/filter_seq.ipynb index 2bfef3e7..e78b3008 100644 --- a/examples/data_handling/filter_seq.ipynb +++ b/examples/data_handling/filter_seq.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "outputs": [], "source": [ "import aaanalysis as aa\n", @@ -22,8 +22,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:43:08.448843861Z", - "start_time": "2024-06-27T13:43:05.447608778Z" + "end_time": "2024-06-27T20:22:29.817583849Z", + "start_time": "2024-06-27T20:22:29.631806177Z" } }, "id": "f6652f89954b8969" @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "outputs": [ { "name": "stdout", @@ -53,7 +53,7 @@ { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1CAPSID_44570100.0000001
2CAPSID_48551100.0000001
3CAPSID_9152100.0000001
4CAPSID_2583100.0000001
5CAPSID_43014100.0000001
6CAPSID_41675100.0000001
7CAPSID_45746100.0000001
8CAPSID_41917100.0000001
9CAPSID_49518100.0000001
10CAPSID_43809100.0000001
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1991CAPSID_1621990100.0000001
1992CAPSID_47751991100.0000001
1993CAPSID_50051992100.0000001
1994CAPSID_47011993100.0000001
1995CAPSID_49621994100.0000001
1996CAPSID_45171995100.0000001
1997CAPSID_45161996100.0000001
1998CAPSID_43001997100.0000001
1999CAPSID_41081998100.0000001
2000CAPSID_49841999100.0000001
\n" }, "metadata": {}, "output_type": "display_data" @@ -67,11 +67,11 @@ } ], "source": [ - "# Filtering using cd-hit (default)\n", + "# Filtering using CD-HIT (default)\n", "df_clust = aa.filter_seq(df_seq=df_seq)\n", "n_clust = df_clust[\"cluster\"].nunique()\n", "print(f\"Number of CD-HIT clusters: {n_clust}\")\n", - "aa.display_df(df_clust, n_rows=10, show_shape=True)\n", + "aa.display_df(df_clust, n_rows=-10, show_shape=True)\n", "\n", "# Filtering using MMSeqs\n", "df_clust = aa.filter_seq(df_seq=df_seq, method=\"mmseqs\")\n", @@ -81,8 +81,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:43:26.967403360Z", - "start_time": "2024-06-27T13:43:13.028873821Z" + "end_time": "2024-06-27T20:22:45.025340662Z", + "start_time": "2024-06-27T20:22:30.433050007Z" } }, "id": "b9e4f7053030fd3f" @@ -90,7 +90,51 @@ { "cell_type": "markdown", "source": [ - "The decrease the number of clusters, you can decrease the sequence ``similarity_threshold`` (default=0.9): " + "You can obtain a redundancy-reduced set of protein sequences by selecting the representative sequence of each cluster:" + ], + "metadata": { + "collapsed": false + }, + "id": "7a2d9bf2bf0b9de6" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame shape: (2000, 4)\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1991CAPSID_47741988100.0000001
1992CAPSID_48061989100.0000001
1993CAPSID_48391990100.0000001
1994CAPSID_48711991100.0000001
1995CAPSID_49041992100.0000001
1996CAPSID_49361993100.0000001
1997CAPSID_49681994100.0000001
1998CAPSID_50021995100.0000001
1999CAPSID_50371996100.0000001
2000CAPSID_50691997100.0000001
\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Select redundancy-reduced sequences\n", + "df_selected = df_clust[df_clust[\"is_representative\"] == 1]\n", + "aa.display_df(df_clust, n_rows=-10, show_shape=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-06-27T20:22:45.110610021Z", + "start_time": "2024-06-27T20:22:45.020134758Z" + } + }, + "id": "50a17ad119d34446" + }, + { + "cell_type": "markdown", + "source": [ + "To reduce the number of clusters, you can decrease the sequence ``similarity_threshold`` (default=0.9): " ], "metadata": { "collapsed": false @@ -99,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "outputs": [ { "name": "stdout", @@ -124,8 +168,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:44:16.530809132Z", - "start_time": "2024-06-27T13:43:33.611598651Z" + "end_time": "2024-06-27T20:23:26.767419508Z", + "start_time": "2024-06-27T20:22:45.081053810Z" } }, "id": "ef5135c86e164027" @@ -142,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "outputs": [ { "name": "stdout", @@ -160,8 +204,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:44:47.307324800Z", - "start_time": "2024-06-27T13:44:25.611748437Z" + "end_time": "2024-06-27T20:23:47.145309577Z", + "start_time": "2024-06-27T20:23:26.755456935Z" } }, "id": "7d55920a4d8a2183" @@ -178,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "outputs": [ { "name": "stdout", @@ -197,8 +241,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:44:55.331701921Z", - "start_time": "2024-06-27T13:44:51.223695188Z" + "end_time": "2024-06-27T20:23:50.951949901Z", + "start_time": "2024-06-27T20:23:47.156300620Z" } }, "id": "62d0351a3c25f270" @@ -215,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "outputs": [ { "name": "stdout", @@ -240,8 +284,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:45:17.853935207Z", - "start_time": "2024-06-27T13:45:02.144846759Z" + "end_time": "2024-06-27T20:24:06.633475548Z", + "start_time": "2024-06-27T20:23:50.959943351Z" } }, "id": "5da69ca3ad47a131" @@ -258,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "outputs": [], "source": [ "# Sort sequences by clusters\n", @@ -267,8 +311,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:45:25.181395528Z", - "start_time": "2024-06-27T13:45:24.395434611Z" + "end_time": "2024-06-27T20:24:07.190489991Z", + "start_time": "2024-06-27T20:24:06.648321841Z" } }, "id": "ce1a5c401c42fe9d" @@ -285,14 +329,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time without multiprocessing: 0.8 seconds\n", - "Time with multiprocessing. 0.91 seconds\n" + "Time without multiprocessing: 0.54 seconds\n", + "Time with multiprocessing. 0.68 seconds\n" ] } ], @@ -314,8 +358,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T13:45:27.414166029Z", - "start_time": "2024-06-27T13:45:25.691803391Z" + "end_time": "2024-06-27T20:24:08.416840033Z", + "start_time": "2024-06-27T20:24:07.208454665Z" } }, "id": "87a6ef861d00ab12" @@ -329,6 +373,16 @@ "collapsed": false }, "id": "9328b8f044d63fc8" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "d0993d1b5f9aa2b" } ], "metadata": {