Skip to content

Commit

Permalink
ENH Add support for HMM matching
Browse files Browse the repository at this point in the history
  • Loading branch information
luispedro committed Jun 28, 2024
1 parent a2ad360 commit 1097871
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Unreleased
* Add support for local match (exact and MMSeqs modes)
* Add support for local searching
* Slightly change output format for AMPSphere matching

Version 1.4.0 2024-06-27
Expand Down
18 changes: 16 additions & 2 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,23 @@ macrel query-ampsphere \
2. MMSeqs2: This mode uses MMSeqs2 to perform approximate matching of the input sequences in the AMPSphere database. This is slower than exact matching but can find more matches. To use this mode, pass the `--query-mode=mmseqs` flag.
3. HMMER: This mode uses HMMER to search for homologs of the input sequences in the AMPSphere database. This is the slowest mode but can find more distant homologs. To use this mode, pass the `--query-mode=hmm` flag.

Note that since these use the API, you do not need to have the AMPSphere database downloaded locally or installed MMSeqs2/HMMER. On the other hand, you need an internet connection to use this feature. At the moment (version 1.5.0), exact matching and MMSeqs can be used with a local database (use the `--local` flag).

As MMSeqs2 is a heuristic method, the results may vary slightly between the local and remote mode (as versions of MMSeqs2 may differ).
#### Remote vs. local

By default, macrel uses the remote API to query the AMPSphere database. This is the fastest way to query the database for a small number of peptides, but it requires an internet connection.

If you want to use a local database use the `--local` flag:

```bash
macrel query-ampsphere \
--fasta example_seqs/pep8.faa \
--query-mode mmseqs \
--local \
--output out_ampsphere
```

This will download the AMPSphere database and use it to query the input sequences. The database will stored in `~/.cache/macrel/ampsphere` (you can change it with the `--cache-dir` flag). To use the MMSeqs2 or HHMER modes, you need to have the respective software installed on your system (`conda install bioconda::mmseqs2` or `conda install bioconda::hmmer`).

As MMSeqs2 is a heuristic method, the results may vary slightly between the local and remote mode (as versions of MMSeqs2 may differ).


55 changes: 53 additions & 2 deletions macrel/ampsphere.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
import tempfile
import requests
import pandas as pd
Expand Down Expand Up @@ -33,7 +34,6 @@ def get_cache_directory(args):

def maybe_download_ampsphere_mmseqs(args):
import tarfile
import shutil
target = path.join(get_cache_directory(args), 'AMPSphere_latest.mmseqsdb')
if path.exists(target):
logging.debug(f'AMPSphere MMSeqs2 database already downloaded to {target}')
Expand All @@ -57,6 +57,58 @@ def maybe_download_ampsphere_mmseqs(args):
return shutil.move(path.join(tmpdir, 'mmseqs_db'), target)


def maybe_download_ampsphere_hmm(args):
target_dir = path.join(get_cache_directory(args), 'hmm_db')
target = path.join(target_dir, 'AMPSphere_latest.hmm')
if path.exists(target):
logging.debug(f'AMPSphere HMM database already downloaded to {target}')
if args.re_download_database:
logging.debug(f'Force redownload enabled, re-downloading AMPSphere HMM database')
shutil.rmtree(target)
else:
return target
HMM_URL = 'https://ampsphere-api.big-data-biology.org/v1/downloads/AMPSphere_latest.hmm'
if not shutil.which('hmmpress'):
logging.error('HMMER not found. Please install it first (you can use `conda install -c bioconda hmmer`)')
sys.exit(1)
with tempfile.TemporaryDirectory() as tmpdir:
tfile = path.join(tmpdir, 'AMPSphere_latest.hmm')
r = requests.get(HMM_URL, stream=True, headers=REQUESTS_HEADER)
with open(tfile, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
logging.debug(f'Downloaded AMPSphere HMM database to {tfile}')
_logged_subprocess_call(['hmmpress', tfile])
logging.debug(f'Indexed AMPSphere HMM database')
logging.debug(f'Moving AMPSphere HMM database to {target}')
# move all files in tmpdir to target
from os import listdir, makedirs
makedirs(target_dir, exist_ok=True)
for f in listdir(tmpdir):
shutil.move(path.join(tmpdir, f), target_dir)
return target

def get_ampsphere_hmmer_match_local(args, seqs):
from macrel.fasta import fasta_iter
hmm = maybe_download_ampsphere_hmm(args)
if not hmm:
logging.error('AMPSphere HMM database not found. Please download it first or provide the path to the database using the --cache-dir flag')
sys.exit(1)
with tempfile.TemporaryDirectory() as tmpdir:
query_file = path.join(tmpdir, 'query.faa')
output_file = path.join(tmpdir, 'output.tsv')
with open(query_file, 'wt') as f:
for (query_name, seq) in seqs:
f.write(f'>{query_name}\n{seq}\n')
_logged_subprocess_call(
['hmmsearch', '--tblout', output_file, hmm, query_file])
return pd.read_csv(output_file, sep='\s+', comment='#', header=None,
names=['target', 'target_accession', 'query_name',
'query_accession', 'evalue', 'score', 'bias',
'domain_score', 'domain_bias', 'exp', 'reg', 'clu',
'ov', 'env', 'dom', 'rep', 'inc', 'description'])\
.set_index('query_name')

MMSEQS2_OUTPUT_FORMAT = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qseq,tseq'

def _logged_subprocess_call(cmd):
Expand All @@ -65,7 +117,6 @@ def _logged_subprocess_call(cmd):
subprocess.check_call(cmd)

def get_ampsphere_mmseqs_match_local(args, seqs):
import shutil
mmseqs_bin = shutil.which('mmseqs')
if not mmseqs_bin:
logging.error('MMSeqs2 not found. Please install it first (you can use `conda install -c bioconda mmseqs2`)')
Expand Down
3 changes: 1 addition & 2 deletions macrel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,10 @@ def do_ampsphere_query(args):
from time import sleep
import pandas as pd
if args.local:
if args.query_mode == 'hmmer':
error_exit(args, 'Local mode does not support HMMER queries')
match_function = {
'exact': ampsphere.get_ampsphere_exact_match_local,
'mmseqs': ampsphere.get_ampsphere_mmseqs_match_local,
'hmmer': ampsphere.get_ampsphere_hmmer_match_local,
}[args.query_mode]
results = match_function(args, fasta_iter(args.fasta_file))
else:
Expand Down

0 comments on commit 1097871

Please sign in to comment.