Skip to content

Commit

Permalink
Update cd-hit and mmseq installation6
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Jun 27, 2024
1 parent 133fb53 commit d0edf17
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 8 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,23 @@ jobs:
if: runner.os == 'Windows'
run: |
curl -L -o cd-hit.tar.gz https://github.com/weizhongli/cdhit/releases/download/V4.8.1/cd-hit-v4.8.1-2019-0228.tar.gz
mkdir $Env:USERPROFILE\cdhit
New-Item -ItemType Directory -Force -Path $Env:USERPROFILE\cdhit
tar -xzf cd-hit.tar.gz -C $Env:USERPROFILE\cdhit
echo "$Env:USERPROFILE\cdhit\cd-hit-v4.8.1-2019-0228\bin" | Out-File -FilePath $Env:GITHUB_PATH -Append
echo "$Env:USERPROFILE\cdhit\cd-hit-v4.8.1-2019-0228\bin" >> $Env:GITHUB_PATH
# Windows only: download the MMseqs2 win64 zip, unpack it under the user profile,
# and append the unpacked mmseqs-win64 folder to the file named by GITHUB_PATH.
# NOTE(review): the last two lines of the script append the same path (once via
# Out-File -Append, once via >>); in this diff view they are presumably the old
# and new form of a single line - confirm only one remains in the workflow.
- name: Install mmseqs2 (Windows)
if: runner.os == 'Windows'
run: |
curl -L -o mmseqs2.zip https://mmseqs.com/latest/mmseqs-win64.zip
New-Item -ItemType Directory -Force -Path $Env:USERPROFILE\mmseqs
Expand-Archive -Path mmseqs2.zip -DestinationPath $Env:USERPROFILE\mmseqs
echo "$Env:USERPROFILE\mmseqs\mmseqs-win64" | Out-File -FilePath $Env:GITHUB_PATH -Append
echo "$Env:USERPROFILE\mmseqs\mmseqs-win64" >> $Env:GITHUB_PATH
# Windows only: append the cd-hit and mmseqs folders to PATH for the current process.
# NOTE(review): the variable is set with EnvironmentVariableTarget.Process, i.e. only
# for the shell process running this step. GitHub Actions presumably starts a new
# shell for each step, so the following "Run Tests" step would not see this change;
# the GITHUB_PATH entries written by the install steps are what carry over.
# Confirm whether this step is needed at all.
- name: Update PATH
if: runner.os == 'Windows'
run: |
$env:Path += ";$Env:USERPROFILE\cdhit\cd-hit-v4.8.1-2019-0228\bin;$Env:USERPROFILE\mmseqs\mmseqs-win64"
[System.Environment]::SetEnvironmentVariable('PATH', $env:Path, [System.EnvironmentVariableTarget]::Process)
- name: Run Tests
run: pytest tests
Expand Down
21 changes: 16 additions & 5 deletions aaanalysis/data_handling/_seq_preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,39 @@ class SequencePreprocessor:
"""

@staticmethod
def encode_one_hot(sequence: List[str],
def encode_one_hot(list_seq: Union[List[str], str] = None,
alphabet: str = "ARNDCEQGHILKMFPSTWYV",
gap: str = "_"
gap: str = "_",
pad_at: Literal["C", "N"] = "C",
) -> np.ndarray:
"""
One-hot encodes a list of protein sequences into a feature matrix.
One-hot-encode a list of protein sequences into a feature matrix.
Padding of shorter sequences with gaps represented as zero vectors.
Parameters
----------
sequence : List[str]
list_seq : list of str or str
List of protein sequences to encode.
alphabet : str, default='ARNDCEQGHILKMFPSTWYV'
The alphabet of amino acids used for encoding.
gap : str, default='_'
The character used to represent gaps in sequences.
pad_at : str, default='C'
Specifies where to add the padding:
'N' for N-terminus (beginning of the sequence),
'C' for C-terminus (end of the sequence).
Returns
-------
np.ndarray
A numpy array where each row represents an encoded sequence.
"""
return encode_one_hot(sequence, alphabet, gap)
# Check input: list_seq must not be None; a single string is accepted
list_str = ut.check_list_like(name="list_seq", val=list_seq, check_all_str_or_convertible=True,
accept_none=False, accept_str=True)
# NOTE(review): the checked value is bound to `list_str`, but the call below passes the
# original `list_seq`, and `pad_at` is not forwarded to the backend - confirm whether
# `list_seq = ut.check_list_like(...)` and passing `pad_at` were intended.
return encode_one_hot(list_seq, alphabet, gap)

@staticmethod
def encode_integer(list_seq: List[str], alphabet: str = "ARNDCEQGHILKMFPSTWYV", gap: str = "_", pad_at: Literal["C", "N"] = "C") -> np.ndarray:
Expand Down

0 comments on commit d0edf17

Please sign in to comment.