Skip to content

Commit

Permalink
Update filter_seq function
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Jun 27, 2024
1 parent 7b1e3b5 commit 26ee2c0
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 43 deletions.
11 changes: 4 additions & 7 deletions aaanalysis/data_handling_pro/_backend/cd_hit.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""This is a script for the backend of the cd-hit method for the filtering_seq function."""
"""This is a script for the backend of the CD-HIT method for the filter_seq() function."""
import pandas as pd
import os

Expand Down Expand Up @@ -64,7 +64,7 @@ def run_cd_hit(df_seq=None,
file_in = os.path.join(temp_dir, f"_{result_prefix}_in")
save_entries_to_fasta(df_seq=df_seq, file_path=file_in)
file_out = os.path.join(temp_dir, f"_{result_prefix}_out")
# Create CD-hit command
# Create CD-HIT command
if word_size is None:
word_size = _select_word_size(st=similarity_threshold)
cmd = ["cd-hit", "-i", file_in,
Expand All @@ -87,9 +87,9 @@ def run_cd_hit(df_seq=None,
if sort_clusters:
cmd.extend(["-sc", "1"])

# Run CD-Hit command
# Run CD-HIT command
if verbose:
ut.print_out("Run CD-Hit filtering")
ut.print_out("Run CD-HIT filtering")
run_command(cmd=cmd, verbose=verbose, temp_dir=temp_dir)

# Convert CD-HIT output to clustering DataFrame
Expand All @@ -98,6 +98,3 @@ def run_cd_hit(df_seq=None,
# Remove temporary file
remove_temp(path=temp_dir)
return df_clust



2 changes: 1 addition & 1 deletion aaanalysis/data_handling_pro/_backend/mmseq2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""This is a script for the backend of the MMseqs2 method for the filtering_seq function."""
"""This is a script for the backend of the MMseqs2 method for the filter_seq() function."""
import os
import pandas as pd

Expand Down
11 changes: 6 additions & 5 deletions aaanalysis/data_handling_pro/_filter_seq.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for a wrapper function called filter_seq that provides an Python interface to the
This is a script for a wrapper function called filter_seq that provides a Python interface to the
redundancy-reduction algorithms CD-HIT and MMseqs2.
"""
from typing import Optional, List, Literal
Expand Down Expand Up @@ -39,15 +39,15 @@ def filter_seq(df_seq: pd.DataFrame = None,
verbose: bool = False
) -> pd.DataFrame:
"""
UNDER CONSTRUCTION: Redundancy reduction of sequences using clustering-based algorithms.
Redundancy reduction of sequences using clustering-based algorithms.
This function performs redundancy reduction of sequences by clustering and selecting representative sequences using
the CD-HIT ([Li06]_) or MMseqs2 ([Steinegger17]_) algorithms locally. It allows for adjustable filtering strictness:
* Strict filtering results in smaller, more homogeneous clusters, suitable when high sequence similarity is required.
* Non-strict filtering creates larger, more diverse clusters, enhancing sequence representation.
CD-Hit and MMseq2 are standalone software tools, each requiring separate installation. CD-Hit is more
CD-HIT and MMseqs2 are standalone software tools, each requiring separate installation. CD-HIT is more
resource-efficient and easier to install, while MMseqs2 is a larger multi-purpose tool. Pairwise sequence similarities
for the MMseqs2 clustering results are computed using the Biopython :class:`Bio.Align.PairwiseAligner` class.
Expand All @@ -64,7 +64,7 @@ def filter_seq(df_seq: pd.DataFrame = None,
similarity_threshold : float, default=0.9
Defines the minimum sequence identity [0.4-1.0] for clustering. Higher values increase strictness.
word_size : int, optional
The size of the 'word' (in CD-Hit, [2-5]) or 'k-mer' (in MMseqs, [5-7]) used for the initial screening step in clustering.
The size of the 'word' (in CD-HIT, [2-5]) or 'k-mer' (in MMseqs2, [5-7]) used for the initial screening step in clustering.
Effect on strictness is dataset-dependent. If ``None``, optimized based on ``similarity_threshold`` (CD-HIT).
global_identity : bool, default=True
Whether to use global (True) or local (False) sequence identity for 'cd-hit' clustering. Global is stricter.
Expand Down Expand Up @@ -117,7 +117,8 @@ def filter_seq(df_seq: pd.DataFrame = None,
Warnings
--------
* CD-Hit and MMseq2 must be installed separately.
* CD-HIT and MMseqs2 must be installed separately.
* CD-HIT is not available for Windows.
* This function requires `biopython`, which is automatically installed via `pip install aaanalysis[pro]`.
Examples
Expand Down
Loading

0 comments on commit 26ee2c0

Please sign in to comment.