diff --git a/ChangeLog b/ChangeLog index d8b533a..5e0f62e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,5 @@ Unreleased - * Add support for local match (exact only for now) + * Add support for local match (exact and MMSeqs modes) * Slightly change output format for AMPSphere matching Version 1.4.0 2024-06-27 diff --git a/docs/usage.md b/docs/usage.md index 112f17f..381b4b7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -153,5 +153,9 @@ macrel query-ampsphere \ 2. MMSeqs2: This mode uses MMSeqs2 to perform approximate matching of the input sequences in the AMPSphere database. This is slower than exact matching but can find more matches. To use this mode, pass the `--query-mode=mmseqs` flag. 3. HMMER: This mode uses HMMER to search for homologs of the input sequences in the AMPSphere database. This is the slowest mode but can find more distant homologs. To use this mode, pass the `--query-mode=hmm` flag. -Note that since these use the API, you do not need to have the AMPSphere database downloaded locally or installed MMSeqs2/HMMER. On the other hand, you need an internet connection to use this feature. At the moment (version 1.5.0), only exact matching can be used with a local database (use the `--local` flag). +Note that since these use the API, you do not need to have the AMPSphere database downloaded locally or installed MMSeqs2/HMMER. On the other hand, you need an internet connection to use this feature. At the moment (version 1.5.0), exact matching and MMSeqs can be used with a local database (use the `--local` flag). + +As MMSeqs2 is a heuristic method, the results may vary slightly between the local and remote mode (as versions of MMSeqs2 may differ). + + diff --git a/macrel/ampsphere.py b/macrel/ampsphere.py index d726c5d..1b4eda9 100644 --- a/macrel/ampsphere.py +++ b/macrel/ampsphere.py @@ -1,3 +1,4 @@ +import tempfile import requests import pandas as pd from os import path @@ -25,6 +26,80 @@ def get_cache_directory(args): f.write('#\thttp://www.brynosaurus.com/cachedir/\n') return cache_dir +def maybe_download_ampsphere_mmseqs(args): + import tarfile + import shutil + target = path.join(get_cache_directory(args), 'AMPSphere_latest.mmseqsdb') + if path.exists(target): + logging.debug(f'AMPSphere MMSeqs2 database already downloaded to {target}') + if not args.no_download_database and args.force: + logging.debug(f'Force download enabled, re-downloading AMPSphere MMSeqs2 database') + shutil.rmtree(target) + else: + return target + AMPSPHERE_MMSEQS2_URL = 'https://ampsphere-api.big-data-biology.org/v1/downloads/AMPSphere_latest.mmseqsdb.tar.xz' + with tempfile.TemporaryDirectory() as tmpdir: + tfile = path.join(tmpdir, 'AMPSphere_latest.mmseqsdb.tar.xz') + r = requests.get(AMPSPHERE_MMSEQS2_URL, stream=True) + with open(tfile, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + logging.debug(f'Downloaded AMPSphere MMSeqs2 database to {tfile}') + with tarfile.open(tfile) as tar: + tar.extractall(tmpdir) + logging.debug(f'Extracted AMPSphere MMSeqs2 database to {tmpdir}') + logging.debug(f'Moving AMPSphere MMSeqs2 database to {target}') + return shutil.move(path.join(tmpdir, 'mmseqs_db'), target) + + +MMSEQS2_OUTPUT_FORMAT = 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qseq,tseq' + +def _logged_subprocess_call(cmd): + import subprocess + logging.debug(f'Running command: {" ".join(cmd)}') + subprocess.check_call(cmd) + +def get_ampsphere_mmseqs_match_local(args, seqs): + import shutil + mmseqs_bin = shutil.which('mmseqs') + if not mmseqs_bin: + logging.error('MMSeqs2 not found. Please install it first (you can use `conda install -c bioconda mmseqs2`)') + sys.exit(1) + logging.debug(f'Using MMSeqs2 binary at {mmseqs_bin}') + + mmseqs_db = maybe_download_ampsphere_mmseqs(args) + if not mmseqs_db: + logging.error('AMPSphere MMSeqs2 database not found. Please download it first (use the --download-database flag) or provide the path to the database using the --cache-dir flag') + sys.exit(1) + mmseqs_db = path.join(mmseqs_db, 'AMPSphere_latest.mmseqsdb') + logging.info(f'Using AMPSphere MMSeqs2 database at {mmseqs_db}') + with tempfile.TemporaryDirectory() as tmpdir: + query_file = path.join(tmpdir, 'query.faa') + output_file = path.join(tmpdir, 'output.tsv') + with open(query_file, 'wt') as f: + for (query_name, seq) in seqs: + f.write(f'>{query_name}\n{seq}\n') + _logged_subprocess_call( + ['mmseqs', 'createdb', query_file, f'{query_file}.mmseqsdb']) + _logged_subprocess_call( + ['mmseqs', 'search', + f'{query_file}.mmseqsdb', mmseqs_db, + output_file + '.mmseqsdb', + tmpdir + '/tmp']) + + _logged_subprocess_call( + ['mmseqs', 'convertalis', + f'{query_file}.mmseqsdb', + mmseqs_db, + output_file + '.mmseqsdb', + output_file, + '--format-output', MMSEQS2_OUTPUT_FORMAT]) + + return pd.read_csv(output_file, sep='\t', header=None, + names=MMSEQS2_OUTPUT_FORMAT.split(','))\ + .set_index('query') \ + .sort_values('evalue').sort_index(kind='stable') + def maybe_download_ampsphere_faa(args): target = path.join(get_cache_directory(args), 'AMPSphere_v.2022-03.faa.gz') if path.exists(target): diff --git a/macrel/main.py b/macrel/main.py index 46e01d4..49bd574 100644 --- a/macrel/main.py +++ b/macrel/main.py @@ -352,9 +352,13 @@ def do_ampsphere_query(args): from time import sleep import pandas as pd if args.local: - if args.query_mode != 'exact': - error_exit(args, 'Local mode only supports exact query mode (for now)') - results = ampsphere.get_ampsphere_exact_match_local(args, fasta_iter(args.fasta_file)) + if args.query_mode == 'hmmer': + error_exit(args, 'Local mode does not support HMMER queries') + match_function = { + 'exact': ampsphere.get_ampsphere_exact_match_local, + 'mmseqs': ampsphere.get_ampsphere_mmseqs_match_local, + }[args.query_mode] + results = match_function(args, fasta_iter(args.fasta_file)) else: match_function = { 'exact': ampsphere.get_ampsphere_exact_match, @@ -370,7 +374,7 @@ def do_ampsphere_query(args): import sys if sys.stdout.isatty(): print('Note that to avoid overloading the AMPSphere API, this script will pause a bit after every query') - if args.query_mode == 'exact': + if args.query_mode != 'hmmer': print('You can use the --local flag to download and use a local version of the AMPSphere database') results = pd.concat(results) results.index.name = 'query_name'