From 259e7e2aa78ca4823d7daf4ff2eaa9c4d0a1cd56 Mon Sep 17 00:00:00 2001
From: stephanbreimann <stephanbreimann@gmail.com>
Date: Thu, 2 May 2024 22:29:51 +0200
Subject: [PATCH] Add read_fasta and to_fasta (not finished)

---
 aaanalysis/__init__.py                        |   6 +-
 aaanalysis/_utils/check_data.py               |   8 ++
 aaanalysis/data_handling/__init__.py          |   5 +-
 aaanalysis/data_handling/_encode_sequences.py |  91 ++++++++++++
 aaanalysis/data_handling/_filter_sequences.py | 106 ++++++++++++++
 aaanalysis/data_handling/_read_fasta.py       | 134 ++++++++++++++++++
 .../{_data_read_write.py => _to_fasta.py}     |  29 ++--
 aaanalysis/utils.py                           |   6 +-
 docs/source/api.rst                           |   1 +
 protocols/protocol0_sequence_analysis.ipynb   |  35 +++++
 10 files changed, 402 insertions(+), 19 deletions(-)
 create mode 100644 aaanalysis/data_handling/_encode_sequences.py
 create mode 100644 aaanalysis/data_handling/_filter_sequences.py
 create mode 100644 aaanalysis/data_handling/_read_fasta.py
 rename aaanalysis/data_handling/{_data_read_write.py => _to_fasta.py} (71%)
 create mode 100644 protocols/protocol0_sequence_analysis.ipynb

diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
index 4d0afd14..7095e12b 100644
--- a/aaanalysis/__init__.py
+++ b/aaanalysis/__init__.py
@@ -1,4 +1,6 @@
-from .data_handling import load_dataset, load_scales, load_features, to_fasta
+from .data_handling import (load_dataset, load_scales, load_features,
+                            to_fasta, read_fasta,
+                            encode_sequences)
 from .feature_engineering import AAclust, AAclustPlot, SequenceFeature, NumericalFeature, CPP, CPPPlot
 from .pu_learning import dPULearn, dPULearnPlot
 from .pertubation import AAMut, AAMutPlot, SeqMut, SeqMutPlot
@@ -14,6 +16,8 @@
     "load_scales",
     "load_features",
     "to_fasta",
+    "read_fasta",
+    "encode_sequences",
     "AAclust",
     "AAclustPlot",
     "SequenceFeature",
diff --git a/aaanalysis/_utils/check_data.py b/aaanalysis/_utils/check_data.py
index 3eb29769..ad2a01b2 100644
--- a/aaanalysis/_utils/check_data.py
+++ b/aaanalysis/_utils/check_data.py
@@ -3,6 +3,7 @@
 """
 import pandas as pd
 import numpy as np
+import os
 from sklearn.utils import check_array
 
 from ._utils import add_str
@@ -282,3 +283,10 @@ def check_df(name="df", df=None, accept_none=False, accept_nan=True, check_all_p
         str_error = add_str(str_error=f"The following columns are duplicated '{cols_duplicated}'.", str_add=str_add)
         raise ValueError(str_error)
 
+
+def check_file(file_path=None):
+    """Check if file is valid string and path exists"""
+    check_type.check_str(name="file_path", val=file_path)
+    if not os.path.isfile(file_path):
+        raise ValueError(f"Following 'file_path' does not exist: '{file_path}'")
+
diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py
index 0fe15394..b1632515 100644
--- a/aaanalysis/data_handling/__init__.py
+++ b/aaanalysis/data_handling/__init__.py
@@ -1,11 +1,14 @@
 from ._load_dataset import load_dataset
 from ._load_scales import load_scales
 from ._load_features import load_features
-from ._data_read_write import to_fasta
+from ._read_fasta import read_fasta
+from ._to_fasta import to_fasta
+from ._encode_sequences import encode_sequences
 
 __all__ = [
     "load_dataset",
     "load_scales",
     "load_features",
     "to_fasta",
+    "encode_sequences",
 ]
diff --git a/aaanalysis/data_handling/_encode_sequences.py b/aaanalysis/data_handling/_encode_sequences.py
new file mode 100644
index 00000000..1ea25885
--- /dev/null
+++ b/aaanalysis/data_handling/_encode_sequences.py
@@ -0,0 +1,91 @@
+"""
+This is a script for creating one-hot-encoding of sequences used as baseline representation.
+"""
+import pandas as pd
+import numpy as np
+
+
+# I Helper Functions
+def _pad_sequences(sequences, pad_at='C'):
+    """
+    Pads all sequences in the list to the length of the longest sequence by adding gaps.
+    """
+    max_length = max(len(seq) for seq in sequences)
+    padded_sequences = []
+    for seq in sequences:
+        gap_length = max_length - len(seq)
+        if pad_at == 'N':
+            padded_seq = '_' * gap_length + seq
+        else:
+            padded_seq = seq + '_' * gap_length
+        padded_sequences.append(padded_seq)
+    return padded_sequences
+
+
+def _one_hot_encode(amino_acid=None, alphabet=None, gap="_"):
+    """
+    Encodes a single amino acid into a one-hot vector based on a specified alphabet.
+    Returns a zero vector for gaps represented as '_'.
+    """
+    index_dict = {aa: i for i, aa in enumerate(alphabet)}
+    vector = np.zeros(len(alphabet), dtype=int)
+    if amino_acid != gap:
+        if amino_acid in index_dict:
+            vector[index_dict[amino_acid]] = 1
+        else:
+            raise ValueError(f"Unrecognized amino acid '{amino_acid}' not in alphabet.")
+    return vector
+
+
+# II Main Functions
+# TODO finish, docu, test, example ...
+def encode_sequences(sequences=None,
+                     alphabet='ARNDCEQGHILKMFPSTWYV',
+                     gap="_",
+                     pad_at='C'):
+    """
+    One-hot-encode a list of protein sequences into a feature matrix, padding shorter sequences
+    with gaps represented as zero vectors.
+
+    Parameters:
+    ----------
+    sequences : list of str
+        List of protein sequences to encode.
+    alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
+        The alphabet of amino acids used for encoding. The gap character is not part of the alphabet.
+    gap : str, default='_'
+        The character used to represent gaps in sequences.
+    pad_at : str, default='C'
+        Specifies where to add the padding:
+        'N' for N-terminus (beginning of the sequence),
+        'C' for C-terminus (end of the sequence).
+
+    Returns:
+    -------
+    np.array
+        A numpy array where each row represents an encoded sequence, and each column represents a feature.
+
+    """
+    # Validate input parameters
+    if pad_at not in ['N', 'C']:
+        raise ValueError(f"pad_at must be 'N' or 'C', got {pad_at}")
+
+    # Validate if all characters in the sequences are within the given alphabet
+    all_chars = set(''.join(sequences))
+    if not all_chars.issubset(set(alphabet + '_')):
+        invalid_chars = all_chars - set(alphabet + '_')
+        raise ValueError(f"Found invalid amino acid(s) {invalid_chars} not in alphabet.")
+
+    # Pad sequences
+    padded_sequences = _pad_sequences(sequences, pad_at=pad_at)
+    max_length = len(padded_sequences[0])
+    num_amino_acids = len(alphabet)
+    feature_matrix = np.zeros((len(padded_sequences), max_length * num_amino_acids), dtype=int)
+    args = dict(alphabet=alphabet, gap=gap)
+    # Create one-hot-encoding
+    for idx, seq in enumerate(padded_sequences):
+        encoded_seq = [_one_hot_encode(amino_acid=aa, **args) for aa in seq]
+        feature_matrix[idx, :] = np.array(encoded_seq).flatten()
+
+    return feature_matrix
+
diff --git a/aaanalysis/data_handling/_filter_sequences.py b/aaanalysis/data_handling/_filter_sequences.py
new file mode 100644
index 00000000..ca2c0647
--- /dev/null
+++ b/aaanalysis/data_handling/_filter_sequences.py
@@ -0,0 +1,106 @@
+import subprocess
+import shutil
+
+# TODO test, adjust, finish (see ChatGPT: Model Performance Correlation Analysis including STD)
+
+
+# I Helper functions
+def _is_tool(name):
+    """Check whether `name` is on PATH and marked as executable."""
+    return shutil.which(name) is not None
+
+
+def _select_longest_representatives(cluster_tsv, all_sequences_file, output_file):
+    seq_dict = {}
+    with open(all_sequences_file, 'r') as file:
+        current_id = None
+        for line in file:
+            if line.startswith('>'):
+                current_id = line.strip().split()[0][1:]
+                seq_dict[current_id] = ""
+            else:
+                seq_dict[current_id] += line.strip()
+
+    clusters = {}
+    with open(cluster_tsv, 'r') as file:
+        for line in file:
+            parts = line.strip().split('\t')
+            cluster_id, seq_id = parts[0], parts[1]
+            if cluster_id not in clusters:
+                clusters[cluster_id] = []
+            clusters[cluster_id].append(seq_id)
+
+    representatives = {}
+    for cluster_id, seq_ids in clusters.items():
+        longest_seq_id = max(seq_ids, key=lambda x: len(seq_dict[x]))
+        representatives[longest_seq_id] = seq_dict[longest_seq_id]
+
+    with open(output_file, 'w') as out_file:
+        for seq_id, sequence in representatives.items():
+            out_file.write(f">{seq_id}\n{sequence}\n")
+
+
+# II Main Functions
+# TODO finish, docu, test, example ...
+def filter_sequences(method, input_file, output_file, similarity_threshold=0.7, word_size=5,
+                     coverage_long=None, coverage_short=None, threads=1, verbose=False):
+    """Perform redundancy-reduction of sequences by calling CD-Hit or MMSeq2 algorithm"""
+    if method not in ['cd-hit', 'mmseq2']:
+        raise ValueError("Invalid method specified. Use 'cd-hit' or 'mmseq2'.")
+
+    if method == 'cd-hit' and not _is_tool('cd-hit'):
+        raise RuntimeError("CD-HIT is not installed or not in the PATH.")
+    if method == 'mmseq2' and not _is_tool('mmseqs'):
+        raise RuntimeError("MMseq2 is not installed or not in the PATH.")
+
+    if method == "cd-hit":
+        cmd = [
+            "cd-hit", "-i", input_file, "-o", output_file,
+            "-c", str(similarity_threshold), "-n", str(word_size),
+            "-T", str(threads)
+        ]
+        if coverage_long:
+            cmd.extend(["-aL", str(coverage_long)])
+        if coverage_short:
+            cmd.extend(["-aS", str(coverage_short)])
+        if verbose:
+            cmd.append("-d")
+            cmd.append("0")
+
+        subprocess.run(cmd, check=True)
+        print("CD-HIT clustering completed. Representatives are saved in:", output_file)
+
+    elif method == "mmseq2":
+        tmp_directory = "tmp"
+        result_prefix = "result_"
+        db_name = result_prefix + "DB"
+        cluster_name = result_prefix + "Clu"
+        subprocess.run(["mmseqs", "createdb", input_file, db_name], check=True)
+        cmd = [
+            "mmseqs", "cluster", db_name, cluster_name, tmp_directory,
+            "--min-seq-id", str(similarity_threshold), "-k", str(word_size),
+            "--threads", str(threads)
+        ]
+        if verbose:
+            cmd.append("-v")
+            cmd.append("3")
+
+        subprocess.run(cmd, check=True)
+        cluster_tsv = result_prefix + "Clu.tsv"
+        subprocess.run([
+            "mmseqs", "createtsv", db_name, db_name, cluster_name, cluster_tsv
+        ], check=True)
+
+        _select_longest_representatives(cluster_tsv, input_file, output_file)
+        print("MMseq2 clustering completed. Representatives are saved in:", output_file)
+
+
+# Example usage
+input_fasta = "your_input_sequences.fasta"
+final_output = "representatives.fasta"
+method = "cd-hit"  # Change to "mmseq2" to use MMseq2
+
+filter_sequences(method, input_fasta, final_output, similarity_threshold=0.7, word_size=5,
+                 coverage_long=0.8, coverage_short=0.8, threads=4, verbose=True)
+
+
diff --git a/aaanalysis/data_handling/_read_fasta.py b/aaanalysis/data_handling/_read_fasta.py
new file mode 100644
index 00000000..c044c8f8
--- /dev/null
+++ b/aaanalysis/data_handling/_read_fasta.py
@@ -0,0 +1,134 @@
+"""
+This is a script for reading FASTA files to DataFrames (df_seq). FASTA files are the most commonly used format
+in computational biology. This function should enable a smooth interaction with the biopython package.
+"""
+import pandas as pd
+from typing import Optional, List
+import warnings
+
+import aaanalysis.utils as ut
+
+
+# I Helper Functions
+# Post check functions
+def post_check_unique_entries(list_entries=None, col_id=None):
+    """Check if entries are unique"""
+    list_duplicates = list(set([x for x in list_entries if list_entries.count(x) > 1]))
+    if len(list_duplicates) > 0:
+        warnings.warn(f"Entries from '{col_id}' should be unique. "
+                      f"\nFollowing entries are duplicated: {list_duplicates}")
+
+
+def _get_entries_from_fasta(file_path, col_id, col_seq, col_db, sep):
+    """Read information from FASTA file and convert to DataFrame"""
+    # Read fasta
+    list_entries = []
+    dict_current_entry = {}
+    with open(file_path, 'r') as file:
+        for line in file:
+            line = line.strip()
+            if line.startswith('>'):
+                if dict_current_entry:
+                    # Save the previous sequence before starting a new one
+                    list_entries.append(dict_current_entry)
+                # Parse the header and prepare a new entry
+                list_info = line[1:].split(sep)
+                if col_db:
+                    dict_current_entry = {col_id: list_info[1], col_seq: "", col_db: list_info[0]}
+                    list_info = list_info[1:]
+                else:
+                    dict_current_entry = {col_id: list_info[0], col_seq: ""}
+                if len(list_info) > 1:
+                    for i in range(1, len(list_info[1:])):
+                        dict_current_entry[f'info{i}'] = list_info[i]
+            else:
+                dict_current_entry[col_seq] += line
+        if dict_current_entry:
+            list_entries.append(dict_current_entry)
+    df = pd.DataFrame(list_entries)
+    return df
+
+
+# II Main Functions
+# TODO test, example ...
+def read_fasta(file_path: str,
+               col_id: str = "entry",
+               col_seq: str = "sequence",
+               col_db: Optional[str] = None,
+               cols_info: Optional[List[str]] = None,
+               sep: str = "|"
+               ) -> pd.DataFrame:
+    """
+    Read a FASTA file and translate it into a sequence DataFrame with optional additional columns.
+
+    Parses FASTA files, splitting each entry header into the main identifier and additional information,
+    based on the specified separator. Sequences and associated information are returned as a DataFrame.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the FASTA file.
+    col_id : str, default='entry'
+        Column name for the sequence identifiers in the resulting DataFrame.
+    col_seq : str, default='sequence'
+        Column name for the sequences in the resulting DataFrame.
+    col_db : str, optional
+        Column name for databases. First entry of FASTA header if given.
+    cols_info : List[str], optional
+        Specifies custom column names for the additional info extracted from headers.
+        If not provided, defaults to 'info1', 'info2', etc.
+    sep : str, default='|'
+        Separator used for splitting identifier and additional information in the FASTA headers.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A DataFrame (``df_seq``) where each row corresponds to a sequence entry from the FASTA file.
+
+    Notes
+    -----
+    Each ``FASTA`` file entry consists of two parts:
+
+    - 'FASTA header': Starting with '>', the header contains the main id and additional information,
+      all separated by a specified separator.
+    - 'Sequence': Sequence of specific entry, directly following the header
+
+    ``df_seq`` includes at least these columns:
+
+    - 'entry': Protein identifier, either the UniProt accession number or an id based on index.
+    - 'sequence': Amino acid sequence.
+
+    See Also
+    --------
+    * See further information on FASTA files and examples in
+     `bioperl FASTA sequence<https://bioperl.org/formats/sequence_formats/FASTA_sequence_format>`_.
+
+    Examples
+    --------
+
+    """
+    # Check input
+    ut.check_file(file_path=file_path)
+    ut.check_str(name="col_id", val=col_id, accept_none=False)
+    ut.check_str(name="col_seq", val=col_seq, accept_none=False)
+    ut.check_str(name="col_db", val=col_db, accept_none=True)
+    cols_info = ut.check_list_like(name="cols_info", val=cols_info, accept_str=True, accept_none=True)
+
+    # Read fasta
+    df_seq = _get_entries_from_fasta(file_path, col_id, col_seq, col_db, sep)
+    # Adjust column names
+    columns = list(df_seq)
+    if cols_info is not None:
+        n_info = len(columns)-2
+        if len(cols_info) >= n_info:
+            cols_info = cols_info[0:n_info]
+        else:
+            cols_info = cols_info + [f"info{i}" for i in range(1, n_info-len(cols_info)+1)]
+        if col_db:
+            columns = [col_id, col_seq, col_db] + cols_info[0:-1]
+        else:
+            columns = [col_id, col_seq] + cols_info
+        df_seq.columns = columns
+    # Post check
+    post_check_unique_entries(list_entries=df_seq[col_id].to_list(), col_id=col_id)
+    return df_seq
diff --git a/aaanalysis/data_handling/_data_read_write.py b/aaanalysis/data_handling/_to_fasta.py
similarity index 71%
rename from aaanalysis/data_handling/_data_read_write.py
rename to aaanalysis/data_handling/_to_fasta.py
index cb9bcabb..b3ed6b07 100644
--- a/aaanalysis/data_handling/_data_read_write.py
+++ b/aaanalysis/data_handling/_to_fasta.py
@@ -1,29 +1,31 @@
 """
 This is a script for reading and writing df_seq to the fasta file format, which is the most commonly used format
 in computational biology. This fasta format enables a smooth interaction with the biopython package.
-See https://bioperl.org/formats/sequence_formats/FASTA_sequence_format for description of input fasta format.
 """
 import time
 import pandas as pd
 import numpy as np
+from typing import Optional, List
 
 import aaanalysis.utils as ut
 
 
-
 # I Helper Functions
-# TODO add more parsers for often used data formats in computational biology (make overview)
-# TODO implement parser from df to df_seq (remove not necessary columns and adjust naming)
 
 
 # II Main Functions
-def read_fasta():
-    """"""
-
-
-def to_fasta(df=None, fasta_name=None, col_id=None, col_seq=None, cols_info=None):
+# TODO finish, docu, test, example ...
+
+def to_fasta(df=None,
+             file_path=None,
+             col_id="entry",
+             col_seq="sequence",
+             col_db=None,
+             cols_info=None,
+             sep="|"):
         """"""
         # Check input
+        ut.check_file(file_path=file_path)
         ut.check_str(name="col_id", val=col_id, accept_none=False)
         ut.check_str(name="col_seq", val=col_seq, accept_none=False)
         cols_info = ut.check_list_like(name="cols_info", val=cols_info, accept_str=True, accept_none=True)
@@ -32,9 +34,9 @@ def to_fasta(df=None, fasta_name=None, col_id=None, col_seq=None, cols_info=None
             cols_requiered += cols_info
         ut.check_df(df=df, name="df", cols_requiered=cols_requiered, accept_none=False, accept_nan=False)
         # Create faste
-        if ".fasta" not in fasta_name:
-            fasta_name += ".fasta"
-        fasta = open(fasta_name, "w")
+        if ".fasta" not in file_path:
+            file_path += ".fasta"
+        fasta = open(file_path, "w")
         for i, row in df.iterrows():
             seq = row[col_seq]
             entry = row[col_id]
@@ -49,6 +51,3 @@ def to_fasta(df=None, fasta_name=None, col_id=None, col_seq=None, cols_info=None
         fasta.close()
 
 
-
-def to_df_seq(df=None):
-    """"""
diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py
index 9255360d..b6691c12 100644
--- a/aaanalysis/utils.py
+++ b/aaanalysis/utils.py
@@ -50,7 +50,8 @@
                                 check_match_list_labels_names_datasets,
                                 check_array_like,
                                 check_superset_subset,
-                                check_df)
+                                check_df,
+                                check_file)
 from ._utils.check_models import (check_mode_class,
                                   check_model_kwargs)
 from ._utils.check_plots import (check_fig,
@@ -86,6 +87,7 @@
                              kullback_leibler_divergence_,
                              bic_score_)
 
+
 # Folder structure
 def _folder_path(super_folder, folder_name):
     """Modification of separator (OS-depending)"""
@@ -260,7 +262,7 @@ def _folder_path(super_folder, folder_name):
 COLOR_FEAT_IMP = '#7F7F7F'  # (127, 127, 127) feature importance
 COLOR_TMD = '#00FA9A'       # (0, 250, 154)
 COLOR_JMD = '#0000FF'       # (0, 0, 255)
-COLOR_POS =  "#389d2b"    # (56, 157, 43)
+COLOR_POS = "#389d2b"    # (56, 157, 43)
 COLOR_UNL = "tab:gray"
 COLOR_NEG = "#ad4570"   # (173,69,112)
 COLOR_REL_NEG = "#ad9745" # (173, 151, 69)
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 7acd4762..6687c0c8 100755
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -26,6 +26,7 @@ Data Handling
     load_dataset
     load_scales
     load_features
+    read_fasta
 
 .. _feature_engineering_api:
 
diff --git a/protocols/protocol0_sequence_analysis.ipynb b/protocols/protocol0_sequence_analysis.ipynb
new file mode 100644
index 00000000..29cd7437
--- /dev/null
+++ b/protocols/protocol0_sequence_analysis.ipynb
@@ -0,0 +1,35 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}