Update filter_seq function

breimanntools · Jun 27, 2024 · 26ee2c0 · 26ee2c0
1 parent 7b1e3b5
commit 26ee2c0
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 43 deletions.
diff --git a/aaanalysis/data_handling_pro/_backend/cd_hit.py b/aaanalysis/data_handling_pro/_backend/cd_hit.py
@@ -1,4 +1,4 @@
-"""This is a script for the backend of the cd-hit method for the filtering_seq function."""
+"""This is a script for the backend of the CD-HIT method for the filter_seq() function."""
 import pandas as pd
 import os
 
@@ -64,7 +64,7 @@ def run_cd_hit(df_seq=None,
     file_in = os.path.join(temp_dir, f"_{result_prefix}_in")
     save_entries_to_fasta(df_seq=df_seq, file_path=file_in)
     file_out = os.path.join(temp_dir, f"_{result_prefix}_out")
-    # Create CD-hit command
+    # Create CD-HIT command
     if word_size is None:
         word_size = _select_word_size(st=similarity_threshold)
     cmd = ["cd-hit", "-i", file_in,
@@ -87,9 +87,9 @@ def run_cd_hit(df_seq=None,
     if sort_clusters:
         cmd.extend(["-sc", "1"])
 
-    # Run CD-Hit command
+    # Run CD-HIT command
     if verbose:
-        ut.print_out("Run CD-Hit filtering")
+        ut.print_out("Run CD-HIT filtering")
     run_command(cmd=cmd, verbose=verbose, temp_dir=temp_dir)
 
     # Convert CD-Hit output to clustering DataFrame
@@ -98,6 +98,3 @@ def run_cd_hit(df_seq=None,
     # Remove temporary file
     remove_temp(path=temp_dir)
     return df_clust
-
-
-
diff --git a/aaanalysis/data_handling_pro/_backend/mmseq2.py b/aaanalysis/data_handling_pro/_backend/mmseq2.py
@@ -1,4 +1,4 @@
-"""This is a script for the backend of the MMseqs2 method for the filtering_seq function."""
+"""This is a script for the backend of the MMseqs2 method for the filter_seq() function."""
 import os
 import pandas as pd
 

diff --git a/aaanalysis/data_handling_pro/_filter_seq.py b/aaanalysis/data_handling_pro/_filter_seq.py
@@ -1,5 +1,5 @@
 """
-This is a script for a wrapper function called filter_seq that provides an Python interface to the
+This is a script for a wrapper function called filter_seq that provides a Python interface to the
 redundancy-reduction algorithms CD-Hit and MMseqs2.
 """
 from typing import Optional, List, Literal
@@ -39,15 +39,15 @@ def filter_seq(df_seq: pd.DataFrame = None,
                verbose: bool = False
                ) -> pd.DataFrame:
     """
-    UNDER CONSTRUCTION: Redundancy reduction of sequences using clustering-based algorithms.
+    Redundancy reduction of sequences using clustering-based algorithms.
 
     This function performs redundancy reduction of sequences by clustering and selecting representative sequences using
     the CD-HIT ([Li06]_) or MMseqs2 ([Steinegger17]_) algorithms locally. It allows for adjustable filtering strictness:
 
     * Strict filtering results in smaller, more homogeneous clusters, suitable when high sequence similarity is required.
     * Non-strict filtering creates larger, more diverse clusters, enhancing sequence representation.
 
-    CD-Hit and MMseq2 are standalone software tools, each requiring separate installation. CD-Hit is more
+    CD-HIT and MMseqs2 are standalone software tools, each requiring separate installation. CD-HIT is more
     resource-efficient and easier to install, while MMseqs2 is a larger multi-purpose tool. Pairwise sequence similarities
     for the MMseqs2 clustering results were computed using the Biopython :class:`Bio.Align.PairwiseAligner` class.
 
@@ -64,7 +64,7 @@ def filter_seq(df_seq: pd.DataFrame = None,
     similarity_threshold : float, default=0.9
         Defines the minimum sequence identity [0.4-1.0] for clustering. Higher values increase strictness.
     word_size : int, optional
-        The size of the 'word' (in CD-Hit, [2-5]) or 'k-mer' (in MMseqs, [5-7]) used for the initial screening step in clustering.
+        The size of the 'word' (in CD-HIT, [2-5]) or 'k-mer' (in MMseqs, [5-7]) used for the initial screening step in clustering.
         Effect on strictness is dataset-dependent. If ``None``, optimized based on ``similarity_threshold`` (CD-Hit).
     global_identity : bool, default=True
         Whether to use global (True) or local (False) sequence identity for 'cd-hit' clustering. Global is stricter.
@@ -117,7 +117,8 @@ def filter_seq(df_seq: pd.DataFrame = None,
 
     Warnings
     --------
-    * CD-Hit and MMseq2 must be installed separately.
+    * CD-HIT and MMseqs2 must be installed separately.
+    * CD-HIT is not available for Windows.
     * This function requires `biopython`, which is automatically installed via `pip install aaanalysis[pro]`.
 
     Examples