ENH Add query-ampsphere subcommand

Uses the AMPSphere API to query the databases online
BigDataBiology · Jun 27, 2024 · 76b2c60 · 76b2c60
1 parent 3b7d269
commit 76b2c60
Show file tree

Hide file tree

Showing 13 changed files with 173 additions and 8 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -30,7 +30,7 @@ jobs:
       shell: bash -l {0}
       run : |
         conda install -c bioconda -c conda-forge \
-          ngless pyrodigal megahit paladin pandas atomium tzlocal \
+          ngless pyrodigal megahit paladin pandas requests atomium tzlocal \
           "scikit-learn<1.3.0" "joblib<1.3.0"
         conda install pytest
         pip install .

diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,6 @@
+Unreleased
+	* Query AMPSphere online
+
 Version 1.3.0 2023-12-05
 	* Add compatibility with Pyrodigal ≥3
 	* Add density estimates to output

diff --git a/docs/install.md b/docs/install.md
@@ -23,6 +23,7 @@ conda install -y \
           megahit \
           paladin \
           pandas \
+          requests \
           "scikit-learn<1.3.0" \
           "joblib<1.3.0" \
           atomicwrites \

diff --git a/docs/usage.md b/docs/usage.md
@@ -12,6 +12,7 @@ If you have not yet installed macrel, see [install](install).
 - `reads`: to input reads in fastQ format (Macrel accepts single- and paired-end reads),
 - `abundance`: to measure abundance of a peptides fasta file using a given fastQ file,
 - `get-smorfs`: to predict small genes from a contigs fasta file.
+- `query-ampsphere`: to query the AMPsphere database.
 
 ## Mandatory input flags
 
@@ -97,7 +98,7 @@ Note that the results per contig are only produced when not using the option
 
 Additionally to the prediction table, this mode also produces two files containing
 general gene prediction information in the contigs and a fasta file containing the predicted
-and filtered small genes (<= 100 amino acids).
+and filtered small genes (&le; 100 amino acids).
 
 To run Macrel on paired-end reads, use the `reads` subcommand:
 
@@ -117,7 +118,7 @@ the `-2` argument. An example of outputs using this mode can be found at
 
 Additionally to the prediction table, this mode also produces a contigs fasta file, 
 and the two files containing general gene prediction coordinates and a fasta file
-containing the predicted and filtered small genes (<= 100 amino acids).
+containing the predicted and filtered small genes (&le; 100 amino acids).
 
 To run Macrel to get abundance profiles, you only need the short reads file
 and a reference with peptide sequences. Use the `abundance` subcommand:

diff --git a/docs/whatsnew.md b/docs/whatsnew.md
@@ -1,5 +1,9 @@
 # What's new? (History)
 
+## Unreleased
+
+- Adds `query-ampsphere` command to query the AMPsphere database
+
 ## Version 1.3.0
 
 *Released 5 December 2023*

diff --git a/macrel/ampsphere.py b/macrel/ampsphere.py
@@ -0,0 +1,32 @@
+import requests
+import pandas as pd
+
+
+def get_ampsphere_exact_match(seq, query_name):
+    '''Get exact match from AMPSphere API'''
+    URL = f'https://ampsphere-api.big-data-biology.org/v1/search/sequence-match?query={seq}'
+    response = requests.get(URL)
+    data = response.json()
+    return pd.DataFrame.from_dict({query_name : data}, orient='index')
+
+def get_ampsphere_mmseqs_match(seq, query_name):
+    '''Get MMSeqs2 match from AMPSphere API'''
+    query = f'>{query_name}\n{seq}'
+    URL = f'https://ampsphere-api.big-data-biology.org/v1/search/mmseqs?query={query}'
+    response = requests.get(URL)
+    data = response.json()
+    return pd.DataFrame.from_dict(data)\
+            .drop("alignment_strings", axis=1)\
+            .set_index('query_identifier')
+
+def get_ampsphere_hmmer_match(seq, query_name):
+    '''Get HMMER match from AMPSphere API'''
+    query = f'>{query_name}\n{seq}'
+    URL = f'https://ampsphere-api.big-data-biology.org/v1/search/hmmer?query={query}'
+    response = requests.get(URL)
+    data = response.json()
+    if not data:
+        return pd.DataFrame()
+    return pd.DataFrame.from_dict(data)\
+            .set_index('query_name')
+
diff --git a/macrel/main.py b/macrel/main.py
@@ -26,7 +26,7 @@ def parse_args(args):
     parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                      description='macrel v{}'.format(__version__), epilog=textwrap.dedent('''\
              Examples:
-                 run Macrel on peptides:  
+                 run Macrel on peptides:
                  macrel peptides --fasta example_seqs/expep.faa.gz --output out_peptides
                  
                  run Macrel on contigs:
@@ -35,8 +35,11 @@ def parse_args(args):
                  run Macrel on paired-end reads:
                  macrel reads -1 example_seqs/R1.fq.gz -2 example_seqs/R2.fq.gz --output out_metag --outtag example_metag
                  
-                 run Macrel to get abundance profiles: 
+                 run Macrel to get abundance profiles:
                  macrel abundance -1 example_seqs/R1.fq.gz --fasta example_seqs/ref.faa.gz --output out_abundance --outtag example_abundance
+
+                 Query the AMPSphere database
+                 macrel query-ampsphere --query-mode mmseqs --fasta peptides.faa
                  
                  For more information,please read the docs: https://macrel.readthedocs.io/en/latest/
              '''))
@@ -76,6 +79,9 @@ def parse_args(args):
             default=False, dest='logfile_append',
             help='If set, then the log file is appended to (default: overwrite existing file)')
 
+    parser.add_argument('--query-mode', required=False, default='exact',
+                        help='Query mode to use in the AMPSphere query (options: exact, mmseqs, hhm)', dest='query_mode')
+
     return parser.parse_args()
 
 def validate_args(args):
@@ -106,6 +112,11 @@ def validate_args(args):
     elif args.command == 'get-smorfs':
         if not args.fasta_file:
             error_exit(args, "Fasta file is necessary for 'get-smorfs' command.")
+    elif args.command == 'query-ampsphere':
+        if not args.fasta_file:
+            error_exit(args, "Fasta file is necessary for 'query-ampsphere' command.")
+        if args.query_mode not in ['exact', 'mmseqs', 'hmmer']:
+            error_exit(args, f"Unknown query mode {args.query_mode} (options: exact, mmseqs, hmmer)")
     else:
         error_exit(args, "Unknown command {}".format(args.command))
     if not args.output and not args.output_file:
@@ -326,6 +337,36 @@ def do_get_examples(args):
         print('Retrieving {}...'.format(f))
         urlretrieve(BASEURL + f, 'example_seqs/'+f)
 
+def do_ampsphere_query(args):
+    from . import ampsphere
+    from .fasta import fasta_iter
+    from time import sleep
+    import pandas as pd
+    match_function = {
+            'exact': ampsphere.get_ampsphere_exact_match,
+            'mmseqs': ampsphere.get_ampsphere_mmseqs_match,
+            'hmmer': ampsphere.get_ampsphere_hmmer_match,
+            }[args.query_mode]
+    results = []
+    logging.debug(f'Calling the AMPSphere API in {args.query_mode} mode')
+    for h,seq in fasta_iter(args.fasta_file):
+        results.append(match_function(seq, h))
+        sleep(0.1)
+        if len(results) == 20:
+            import sys
+            if sys.stdout.isatty():
+                print('Note that to avoid overloading the AMPSphere API, this script will pause a bit after every query')
+    results = pd.concat(results)
+    results['result'].fillna('No_Hit', inplace=True)
+    ofile = (args.output_file if args.output_file != '-' else '/dev/stdout')
+    if ofile is None:
+        ofile = path.join(args.output,
+                          f'{args.outtag}.ampsphere_{args.query_mode}.tsv.gz')
+    with open_output(ofile, mode='wb') as raw_out:
+        with gzip.open(raw_out, 'wt') as out:
+            out.write(f'# AMPSphere query results (mode: {args.query_mode})\n')
+            results.to_csv(out, sep='\t', index=True)
+
 def main(args=None):
     if args is None:
         import sys
@@ -344,15 +385,15 @@ def main(args=None):
 
     with tempfile.TemporaryDirectory(dir=args.tmpdir) as tdir:
         from .output import readme_output_abundance_mode,readme_output_contigs_mode,\
-            readme_output_peptides_mode,readme_output_reads_mode
+            readme_output_peptides_mode,readme_output_reads_mode, readme_output_ampsphere_mode
 
         creadme = {'reads': readme_output_reads_mode,
                    'contigs': readme_output_contigs_mode,
                    'get-smorfs': readme_output_contigs_mode,
                    'peptides': readme_output_peptides_mode,
                    'abundance': readme_output_abundance_mode,
+                   'query-ampsphere': readme_output_ampsphere_mode,
                   }
-
         # print readme
         if args.output_file != '-':
             with open_output(os.path.join(args.output, 'README.md')) as ofile:
@@ -370,6 +411,8 @@ def main(args=None):
                 do_density(args, clen, prediction)
         if args.command == 'abundance':
             do_abundance(args, tdir,logfile)
+        if args.command == 'query-ampsphere':
+            do_ampsphere_query(args)
 
 if __name__ == '__main__':
     import sys

diff --git a/macrel/output.py b/macrel/output.py
@@ -142,3 +142,29 @@
 
 {footer}
 """
+
+readme_output_ampsphere_mode = f"""{header}
+
+For results for this mode, please see more information and cite the AMPSphere manuscript:
+
+> **Discovery of antimicrobial peptides in the global microbiome with machine
+> learning** by Célio Dias Santos Júnior, Marcelo D.T. Torres, Yiqian Duan,
+> Álvaro Rodríguez del Río, Thomas S.B. Schmidt, Hui Chong, Anthony Fullam,
+> Michael Kuhn, Chengkai Zhu, Amy Houseman, Jelena Somborski, Anna Vines,
+> Xing-Ming Zhao, Peer Bork, Jaime Huerta-Cepas, Cesar de la Fuente-Nunez, Luis
+> Pedro Coelho in _Cell_
+> [doi:10.1016/j.cell.2024.05.013](https://doi.org/10.1016/j.cell.2024.05.013)
+
+
+## Outputs for `ampsphere` mode
+
+There are three modes:
+
+- `exact`: exact match only
+- `mmseqs`: MMSeqs2 match
+- `hmm`: HMMER match
+
+The exact output format will depend on the mode as it will return results from each tool.
+
+{footer}
+"""
diff --git a/macrel/tests/test_ampsphere.py b/macrel/tests/test_ampsphere.py
@@ -0,0 +1,20 @@
+from macrel import ampsphere
+
+seq1 = 'KRVKSFFKGYMRAIEINAALMYGYRPK'
+seq2 = 'GRVIGKQGRIAKAIRVVMRAAAVRVDEKVLVEID'
+def test_exact():
+    r = ampsphere.get_ampsphere_exact_match(seq1, 'seq1')
+    assert r.index[0] == 'seq1'
+    assert r.iloc[0]['result'] == 'AMP10.000_002'
+    r = ampsphere.get_ampsphere_exact_match(seq1 + 'HELO', 'seq2')
+    assert r.index[0] == 'seq2'
+    assert r.iloc[0]['result'] is None
+
+def test_mmseqs():
+    r = ampsphere.get_ampsphere_mmseqs_match(seq1, 'seq1')
+    assert r.index[0] == 'seq1'
+    assert r.iloc[0]['target_identifier'] == 'AMP10.000_002'
+
+def test_hmmer():
+    r = ampsphere.get_ampsphere_hmmer_match(seq2, 'seq2')
+    assert r.index[0] == 'seq2'
diff --git a/setup.py b/setup.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2019-2023, MACREL Authors
+# Copyright (C) 2019-2024, MACREL Authors
 # vim: set ts=4 sts=4 sw=4 expandtab smartindent:
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -73,6 +73,7 @@
       install_requires=[
           'scikit-learn',
           'pandas',
+          'requests',
           'atomicwrites'
       ],
       entry_points={

diff --git a/tests/ampsphere/command.sh b/tests/ampsphere/command.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+set -e
+
+macrel query-ampsphere \
+    --fasta pep8.faa \
+    --output out
+gunzip out/macrel.out.ampsphere_exact.tsv.gz
diff --git a/tests/ampsphere/expected.ampsphere_exact.tsv b/tests/ampsphere/expected.ampsphere_exact.tsv
@@ -0,0 +1,10 @@
+# AMPSphere query results (mode: exact)
+	query	result
+Query0	KKVKSIFKKALAMMGENEVKAWGIGIK	AMP10.000_000
+Query1	FFGIGQQEMTLEEIGDKFGLTRERVRQIKEKAIRRLRQSNRSKLLKSYLG	AMP10.000_001
+Query2	KRVKSFFKGYMRAIEINAALMYGYRPK	AMP10.000_002
+Query3	GRVIGKQGRIAKAIRVVMRAAAVRVDEKVLVEID	AMP10.000_003
+Query4	KLRKILKSMFNNYCKTFKDVPPGNMFR	AMP10.000_004
+Query5	AIFYVIKHISRKHFVSLQRYKIKEKM	AMP10.000_005
+Query6	LVRIISMVIAGVIIVYLVRWIDNFFSRYRK	AMP10.000_006
+Query7	HELOQFLTHIIFLLNLLKTLINHFS	No_Hit
diff --git a/tests/ampsphere/pep8.faa b/tests/ampsphere/pep8.faa
@@ -0,0 +1,16 @@
+>Query0
+KKVKSIFKKALAMMGENEVKAWGIGIK
+>Query1
+FFGIGQQEMTLEEIGDKFGLTRERVRQIKEKAIRRLRQSNRSKLLKSYLG
+>Query2
+KRVKSFFKGYMRAIEINAALMYGYRPK
+>Query3
+GRVIGKQGRIAKAIRVVMRAAAVRVDEKVLVEID
+>Query4
+KLRKILKSMFNNYCKTFKDVPPGNMFR
+>Query5
+AIFYVIKHISRKHFVSLQRYKIKEKM
+>Query6
+LVRIISMVIAGVIIVYLVRWIDNFFSRYRK
+>Query7
+HELOQFLTHIIFLLNLLKTLINHFS