Skip to content

Commit

Permalink
ENH Add local support for MMSeqs searches
Browse files Browse the repository at this point in the history
  • Loading branch information
luispedro committed Jun 28, 2024
1 parent ea687ee commit 2a1c87c
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 6 deletions.
2 changes: 1 addition & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Unreleased
* Add support for local match (exact only for now)
* Add support for local match (exact and MMSeqs modes)
* Slightly change output format for AMPSphere matching

Version 1.4.0 2024-06-27
Expand Down
6 changes: 5 additions & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,5 +153,9 @@ macrel query-ampsphere \
2. MMSeqs2: This mode uses MMSeqs2 to perform approximate matching of the input sequences in the AMPSphere database. This is slower than exact matching but can find more matches. To use this mode, pass the `--query-mode=mmseqs` flag.
3. HMMER: This mode uses HMMER to search for homologs of the input sequences in the AMPSphere database. This is the slowest mode but can find more distant homologs. To use this mode, pass the `--query-mode=hmm` flag.

Note that since these use the API, you do not need to have the AMPSphere database downloaded locally or installed MMSeqs2/HMMER. On the other hand, you need an internet connection to use this feature. At the moment (version 1.5.0), only exact matching can be used with a local database (use the `--local` flag).
Note that since these use the API, you do not need to have the AMPSphere database downloaded locally or installed MMSeqs2/HMMER. On the other hand, you need an internet connection to use this feature. At the moment (version 1.5.0), exact matching and MMSeqs can be used with a local database (use the `--local` flag).

As MMSeqs2 is a heuristic method, the results may vary slightly between the local and remote mode (as versions of MMSeqs2 may differ).



75 changes: 75 additions & 0 deletions macrel/ampsphere.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import tempfile
import requests
import pandas as pd
from os import path
Expand Down Expand Up @@ -25,6 +26,80 @@ def get_cache_directory(args):
f.write('#\thttp://www.brynosaurus.com/cachedir/\n')
return cache_dir

def maybe_download_ampsphere_mmseqs(args):
import tarfile
import shutil
target = path.join(get_cache_directory(args), 'AMPSphere_latest.mmseqsdb')
if path.exists(target):
logging.debug(f'AMPSphere MMSeqs2 database already downloaded to {target}')
if not args.no_download_database and args.force:
logging.debug(f'Force download enabled, re-downloading AMPSphere MMSeqs2 database')
shutil.rmtree(target)
else:
return target
AMPSPHERE_MMSEQS2_URL = 'https://ampsphere-api.big-data-biology.org/v1/downloads/AMPSphere_latest.mmseqsdb.tar.xz'
with tempfile.TemporaryDirectory() as tmpdir:
tfile = path.join(tmpdir, 'AMPSphere_latest.mmseqsdb.tar.xz')
r = requests.get(AMPSPHERE_MMSEQS2_URL, stream=True)
with open(tfile, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
logging.debug(f'Downloaded AMPSphere MMSeqs2 database to {tfile}')
with tarfile.open(tfile) as tar:
tar.extractall(tmpdir)
logging.debug(f'Extracted AMPSphere MMSeqs2 database to {tmpdir}')
logging.debug(f'Moving AMPSphere MMSeqs2 database to {target}')
return shutil.move(path.join(tmpdir, 'mmseqs_db'), target)


MMSEQS2_OUTPUT_FORMAT = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qseq,tseq'

def _logged_subprocess_call(cmd):
import subprocess
logging.debug(f'Running command: {" ".join(cmd)}')
subprocess.check_call(cmd)

def get_ampsphere_mmseqs_match_local(args, seqs):
import shutil
mmseqs_bin = shutil.which('mmseqs')
if not mmseqs_bin:
logging.error('MMSeqs2 not found. Please install it first (you can use `conda install -c bioconda mmseqs2`)')
sys.exit(1)
logging.debug(f'Using MMSeqs2 binary at {mmseqs_bin}')

mmseqs_db = maybe_download_ampsphere_mmseqs(args)
if not mmseqs_db:
logging.error('AMPSphere MMSeqs2 database not found. Please download it first (use the --download-database flag) or provide the path to the database using the --cache-dir flag')
sys.exit(1)
mmseqs_db = path.join(mmseqs_db, 'AMPSphere_latest.mmseqsdb')
logging.info(f'Using AMPSphere MMSeqs2 database at {mmseqs_db}')
with tempfile.TemporaryDirectory() as tmpdir:
query_file = path.join(tmpdir, 'query.faa')
output_file = path.join(tmpdir, 'output.tsv')
with open(query_file, 'wt') as f:
for (query_name, seq) in seqs:
f.write(f'>{query_name}\n{seq}\n')
_logged_subprocess_call(
['mmseqs', 'createdb', query_file, f'{query_file}.mmseqsdb'])
_logged_subprocess_call(
['mmseqs', 'search',
f'{query_file}.mmseqsdb', mmseqs_db,
output_file + '.mmseqsdb',
tmpdir + '/tmp'])

_logged_subprocess_call(
['mmseqs', 'convertalis',
f'{query_file}.mmseqsdb',
mmseqs_db,
output_file + '.mmseqsdb',
output_file,
'--format-output', MMSEQS2_OUTPUT_FORMAT])

return pd.read_csv(output_file, sep='\t', header=None,
names=MMSEQS2_OUTPUT_FORMAT.split(','))\
.set_index('query') \
.sort_values('evalue').sort_index(kind='stable')

def maybe_download_ampsphere_faa(args):
target = path.join(get_cache_directory(args), 'AMPSphere_v.2022-03.faa.gz')
if path.exists(target):
Expand Down
12 changes: 8 additions & 4 deletions macrel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,9 +352,13 @@ def do_ampsphere_query(args):
from time import sleep
import pandas as pd
if args.local:
if args.query_mode != 'exact':
error_exit(args, 'Local mode only supports exact query mode (for now)')
results = ampsphere.get_ampsphere_exact_match_local(args, fasta_iter(args.fasta_file))
if args.query_mode == 'hmmer':
error_exit(args, 'Local mode does not support HMMER queries')
match_function = {
'exact': ampsphere.get_ampsphere_exact_match_local,
'mmseqs': ampsphere.get_ampsphere_mmseqs_match_local,
}[args.query_mode]
results = match_function(args, fasta_iter(args.fasta_file))
else:
match_function = {
'exact': ampsphere.get_ampsphere_exact_match,
Expand All @@ -370,7 +374,7 @@ def do_ampsphere_query(args):
import sys
if sys.stdout.isatty():
print('Note that to avoid overloading the AMPSphere API, this script will pause a bit after every query')
if args.query_mode == 'exact':
if args.query_mode != 'hmmer':
print('You can use the --local flag to download and use a local version of the AMPSphere database')
results = pd.concat(results)
results.index.name = 'query_name'
Expand Down

0 comments on commit 2a1c87c

Please sign in to comment.