ENH Add density calculation

Include results per contig and print density per sample when in contigs/reads mode
BigDataBiology · Jul 13, 2023 · a4e24de · a4e24de
1 parent 4167bdf
commit a4e24de
Show file tree

Hide file tree

Showing 18 changed files with 214 additions and 29 deletions.
diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -0,0 +1,58 @@
+# Tutorial for AMP density
+
+If you have not yet installed macrel, see [install](install) and for a 
+detailed usage, please see our [manual](usage).
+
+In this tutorial, we will show how to use the `expected.percontigs` output
+produced in the `contigs` and `reads` mode to obtain the density of AMPs
+per species.
+
+Consider the expected results in the folder `tests/contigs/expected.percontigs`
+as one of the inputs used in this tutorial, and the table for taxonomy of contigs
+available in the `example_seqs/example_taxonomy_contigs.tsv.gz`.
+
+First you will need to load tables into your system:
+
+```
+import pandas as pd
+
+percontigs = pd.read_table('tests/contigs/expected.percontigs', comment='#')
+taxonomy = pd.read_table('example_seqs/example_taxonomy_contigs.tsv.gz')
+```
+
+Then, you will need to merge these tables.
+
+```
+percontigs = percontigs.merge(on='contig',
+                              right=taxonomy,
+                              how='outer')
+```
+
+Now, we will group results and sum values:
+
+```
+percontigs = percontigs.dropna()
+percontigs = percontigs.drop('contig', axis=1)
+percontigs = percontigs.groupby('taxonomy').agg('sum')
+```
+
+By now, you should have a table with the species and the total
+of assembled base pairs per species as well as their total number of ORFs,
+smORFs and redundant AMPs. Now you just calculate the density as follows:
+
+```
+percontigs['AMP_density'] = percontigs.AMPs * 1e6 / percontigs.length
+percontigs.to_csv('expected.density',
+                  sep='\t',
+                  header=True,
+                  index=None)
+```
+
+The resulting `expected.density` table should be as follows:
+
+| **taxonomy** | **length** | **ORFs** | **smORFs** | **AMPs** | **AMP_density** |
+| :---: | :---: | :---: | :---: | :---: | :---: | 
+| speciesA | 4208 | 11 | 10 | 0 | 0.000 | 
+| speciesB | 11876 | 15 | 8 | 1 | 84.203 |
+| speciesC | 5679 | 8 | 6 | 0 | 0.000 |
+
diff --git a/docs/usage.md b/docs/usage.md
@@ -89,10 +89,15 @@ macrel contigs \
 
 In this example, we use the example file `excontigs.fna.gz` which is a FASTA
 file with nucleotide sequences, writing the output to `out_contigs`.
-An example of output using this mode can be found at `test/contigs/expected.prediction`.
+An example of output using this mode can be found at `test/contigs/expected.prediction`
+and `test/contigs/expected.percontigs`, with the latter you can also calculate AMP density.
+
+Note that the results per contig are only produced when not using the option 
+`cluster`.
+
 Additionally to the prediction table, this mode also produces two files containing
-general gene prediction information in the contigs and a fasta file containing the
-predicted and filtered small genes (<= 100 amino acids).
+general gene prediction information in the contigs and a fasta file containing the predicted
+and filtered small genes (<= 100 amino acids).
 
 To run Macrel on paired-end reads, use the `reads` subcommand:
 
@@ -107,7 +112,9 @@ macrel reads \
 The paired-end reads are given as paired files (here, `example_seqs/R1.fq.gz`
 and `example_seqs/R2.fq.gz`). If you only have single-end reads, you can omit
 the `-2` argument. An example of outputs using this mode can be found at
-`test/reads/expected.prediction` and `test/reads/expected.smorfs.faa`.
+`test/reads/expected.prediction`, `test/reads/expected.smorfs.faa` and
+`test/contigs/expected.percontigs`, with the latter you can also calculate AMP density.
+
 Additionally to the prediction table, this mode also produces a contigs fasta file, 
 and the two files containing general gene prediction coordinates and a fasta file
 containing the predicted and filtered small genes (<= 100 amino acids).

diff --git a/example_seqs/example_taxonomy_contigs.tsv.gz b/example_seqs/example_taxonomy_contigs.tsv.gz
diff --git a/install.sh b/install.sh
@@ -43,11 +43,12 @@ fi
 ${CONDA_INSTALL_CMD} install -y \
           --prefix $BASEDIR/envs/Macrel_env \
           ngless \
-          pyrodigal>=0.7.3 \
+          "pyrodigal>=0.7.3" \
           megahit \
           paladin \
           pandas \
-          scikit-learn \
+          "scikit-learn<1.3.0" \
+          "joblib<1.3.0" \
           atomicwrites \
           tzlocal
 

diff --git a/macrel/ORFs_prediction.py b/macrel/ORFs_prediction.py
@@ -31,16 +31,22 @@ def predict_genes(infile, ofile):
     import pandas as pd
     from .fasta import fasta_iter
     from atomicwrites import atomic_write
+
+    clen = []
     gorf, morf_finder = create_pyrodigal_orffinder()
+
     # predict genes
     with atomic_write(ofile, overwrite=True) as odb:
         for idx, (h, s) in enumerate(fasta_iter(infile)):
+            orfs, smorfs = [0, 0]
             if len(s) <= 100_000:
                 # if contig length less than 100kbp then not suitable for training
                 # predict genes using metagenome pretrained models
                 for i, pred in enumerate(morf_finder.find_genes(s)):
                     t = ppyrodigal_out(h, i+1, idx+1, pred)
                     odb.write(t)
+                    orfs += 1
+                    if len(pred.translate()) <= 100: smorfs += 1
             else:
                 # if contig length is above or 100kbp then suitable for training of
                 # its own model, therefore proceed in a genome wise way
@@ -49,4 +55,10 @@ def predict_genes(infile, ofile):
                 for i, pred in enumerate(gorf.find_genes(s)):
                     t = ppyrodigal_out(h, i+1, idx+1, pred)
                     odb.write(t)
+                    orfs += 1
+                    if len(pred.translate()) <= 100: smorfs += 1
+
+            clen.append([h, len(s), orfs, smorfs]) 
+
+    return pd.DataFrame(clen, columns=['contig', 'length', 'ORFs', 'smORFs'])
 
diff --git a/macrel/data/models/AMP.pkl.gz b/macrel/data/models/AMP.pkl.gz
diff --git a/macrel/data/models/Hemo.pkl.gz b/macrel/data/models/Hemo.pkl.gz
diff --git a/macrel/main.py b/macrel/main.py
@@ -143,9 +143,10 @@ def do_smorfs(args, tdir,logfile):
         peptide_file = (args.output_file if args.output_file != '-' else '/dev/stdout')
 
     # predict genes with pyrodigal
-    predict_genes(args.fasta_file, all_peptide_file)
+    clen = predict_genes(args.fasta_file, all_peptide_file)
     filter_smorfs(all_peptide_file, peptide_file, args.cluster, args.keep_fasta_headers)
     args.fasta_file = peptide_file
+    return clen
 
 
 def link_or_uncompress_fasta_file(orig, dest):
@@ -271,16 +272,37 @@ def do_predict(args, tdir):
     import gzip
     fs = fasta_features(args.fasta_file)
     prediction = predict(
-                    data_file("models/AMP.pkl.gz"),
-                    data_file("models/Hemo.pkl.gz"),
-                    fs,
-                    args.keep_negatives)
+                         data_file("models/AMP.pkl.gz"),
+                         data_file("models/Hemo.pkl.gz"),
+                         fs,
+                         args.keep_negatives)
     ofile = path.join(args.output, args.outtag + '.prediction.gz')
     with open_output(ofile, mode='wb') as raw_out:
         with gzip.open(raw_out, 'wt') as out:
             from .macrel_version import __version__
             out.write('# Prediction from macrel v{}\n'.format(__version__))
             prediction.to_csv(out, sep='\t', index_label='Access', float_format="%.3f")
+    return prediction
+
+def do_density(args, clen, prediction):
+    tpred = prediction.reset_index()
+    tpred['contig'] = tpred['index'].apply(lambda x: '_'.join(x.split('_')[:-1]))
+    tpred = tpred[tpred['AMP_probability'] > 0.5]
+    tpred = tpred.groupby('contig').agg('size')
+    tpred = tpred.reset_index()
+    tpred = tpred.rename({0: 'AMPs'}, axis=1)
+    clen = clen.merge(on='contig', right=tpred, how='outer').fillna(0)
+    clen[clen.columns[1:]] = clen[clen.columns[1:]].astype(int)   
+    ofile = path.join(args.output, args.outtag + '.percontigs.gz')
+    sample = clen.set_index('contig').sum(axis=0).tolist()
+    sample_density = sample[-1] * 1e6 / sample[0]
+    with open_output(ofile, mode='wb') as raw_out:
+        with gzip.open(raw_out, 'wt') as out:
+            from .macrel_version import __version__
+            out.write('# Prediction from macrel v{}\n'.format(__version__))
+            out.write(f'# Macrel calculated for the sample a density of {sample_density:.3f} AMPs / Mbp.\n')
+            clen.to_csv(out, sep='\t', index=False, float_format="%.3f")
+    print(f'Macrel processed the sample and verified a density of {sample_density:.3f} AMPs / Mbp.')
 
 def do_get_examples(args):
     try:
@@ -303,7 +325,6 @@ def do_get_examples(args):
         print('Retrieving {}...'.format(f))
         urlretrieve(BASEURL + f, 'example_seqs/'+f)
 
-
 def main(args=None):
     if args is None:
         import sys
@@ -323,23 +344,31 @@ def main(args=None):
     with tempfile.TemporaryDirectory(dir=args.tmpdir) as tdir:
         from .output import readme_output_abundance_mode,readme_output_contigs_mode,\
             readme_output_peptides_mode,readme_output_reads_mode
+
+        creadme = {'reads': readme_output_reads_mode,
+                   'contigs': readme_output_contigs_mode,
+                   'get-smorfs': readme_output_contigs_mode,
+                   'peptides': readme_output_peptides_mode,
+                   'abundance': readme_output_abundance_mode,
+                  }
+
+        # print readme
+        if args.output_file != '-':
+            with open_output(os.path.join(args.output, 'README.md')) as ofile:
+                ofile.write(creadme[args.command])
+
+        # commands
         if args.command == 'reads':
             do_assembly(args, tdir,logfile)
-            with open_output(os.path.join(args.output, 'README.md')) as ofile:
-                ofile.write(readme_output_reads_mode)
         if args.command in ['reads', 'contigs', 'get-smorfs']:
-            do_smorfs(args, tdir,logfile)
-            if args.output:
-                with open_output(os.path.join(args.output, 'README.md')) as ofile:
-                    ofile.write(readme_output_contigs_mode)
+            clen = do_smorfs(args, tdir,logfile)
         if args.command in ['reads', 'contigs', 'peptides']:
-            do_predict(args, tdir)
-            with open_output(os.path.join(args.output, 'README.md')) as ofile:
-                ofile.write(readme_output_peptides_mode)
+            prediction = do_predict(args, tdir)
+        if args.command in ['reads', 'contigs']:
+            if not args.cluster:
+                do_density(args, clen, prediction)
         if args.command == 'abundance':
             do_abundance(args, tdir,logfile)
-            with open_output(os.path.join(args.output, 'README.md')) as ofile:
-                ofile.write(readme_output_abundance_mode)
 
 if __name__ == '__main__':
     import sys

diff --git a/macrel/output.py b/macrel/output.py
@@ -80,6 +80,23 @@
 
 This folder contains the full outputs of running megahit for assembly."""
 
+percontigs_table_doc = f"""- `macrel.out.percontigs.gz`
+
+Compressed tab-separated table with the following columns
+
+1. `contig`: Identififiers (same in the fasta or assembled contigs, also include Sample - the sum of all results)
+2. `length`: The length in base pairs
+3. `ORFs`: Number of ORFs found
+4. `smORFs`: Number of small ORFs found
+5. `AMPs`: Number of sequences predicted as AMPs
+
+The table contains a header (as comments marked with the `#` character at the
+start of the line) identifying the version of macrel used to generate these
+results.
+
+Note that, it is only printed when not using cluster option or get-smorfs mode.
+"""
+
 readme_output_abundance_mode = f"""{header}
 
 ## Outputs for `abundance` mode
@@ -100,11 +117,11 @@
 
 {prediction_table_doc}
 {predicted_faas_doc}
+{percontigs_table_doc}
 
 {footer}
 """
 
-
 readme_output_peptides_mode = f"""{header}
 
 ## Outputs for `peptides` mode
@@ -121,7 +138,7 @@
 {prediction_table_doc}
 {predicted_faas_doc}
 {megahit_output_doc}
+{percontigs_table_doc}
 
 {footer}
 """
-
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -23,6 +23,7 @@ nav:
     - 'Macrel': index.md
     - 'Install': install.md
     - 'Usage': usage.md
+    - 'Tutorial': tutorial.md
     - "What's New (history)": whatsnew.md
     - 'Contacts': contact.md
     - 'FAQ': faq.md

diff --git a/tests/contigs.nosmorfs/command.sh b/tests/contigs.nosmorfs/command.sh
@@ -6,4 +6,4 @@ macrel contigs \
     -o out
 
 gunzip out/macrel.out.prediction.gz
-
+gunzip out/macrel.out.percontigs.gz
diff --git a/tests/contigs.nosmorfs/expected.percontigs b/tests/contigs.nosmorfs/expected.percontigs
@@ -0,0 +1,4 @@
+# Prediction from macrel v1.2.0
+# Macrel calculated for the sample a density of 0.000 AMPs / Mbp.
+contig	length	ORFs	smORFs	AMPs
+scaffold2530_2_MH0058	1324	1	0	0
diff --git a/tests/contigs/command.sh b/tests/contigs/command.sh
@@ -7,4 +7,4 @@ macrel contigs \
     --log-file log.txt
 
 gunzip out/macrel.out.prediction.gz
-
+gunzip out/macrel.out.percontigs.gz
diff --git a/tests/contigs/expected.percontigs b/tests/contigs/expected.percontigs
@@ -0,0 +1,27 @@
+# Prediction from macrel v1.2.0
+# Macrel calculated for the sample a density of 45.062 AMPs / Mbp.
+contig	length	ORFs	smORFs	AMPs
+scaffold2530_2_MH0058	717	2	2	0
+scaffold75334_1_MH0058	3424	1	1	1
+scaffold54112_5_MH0058	728	1	1	0
+scaffold24504_2_MH0058	505	1	1	0
+scaffold95393_2_MH0058	995	2	1	0
+scaffold8449_1_MH0058	1037	2	1	0
+scaffold10455_3_MH0058	808	3	3	0
+scaffold98598_5_MH0058	3426	4	2	0
+scaffold76045_5_MH0058	960	3	3	0
+scaffold106190_2_MH0058	688	1	1	0
+C4067509_1_MH0058	534	1	1	0
+scaffold90770_1_MH0058	1031	2	2	0
+scaffold33693_17_MH0058	3481	2	1	1
+scaffold77554_3_MH0058	6086	9	4	0
+scaffold34596_7_MH0058	1345	1	0	0
+scaffold75223_9_MH0058	3597	2	2	0
+scaffold30291_4_MH0058	824	2	1	0
+scaffold107406_2_MH0058	1401	2	1	0
+scaffold16564_1_MH0058	992	2	2	0
+scaffold7019_2_MH0058	5218	6	3	0
+C4060843_1_MH0058	518	1	1	0
+scaffold20234_2_MH0058	2926	4	1	0
+C4193751_1_MH0058	1820	3	1	0
+C4177507_1_MH0058	1322	2	1	0
diff --git a/tests/reads.se/command.sh b/tests/reads.se/command.sh
@@ -6,4 +6,4 @@ macrel reads \
     -o out
 
 gunzip out/macrel.out.prediction.gz
-
+gunzip out/macrel.out.percontigs.gz
diff --git a/tests/reads.se/expected.percontigs b/tests/reads.se/expected.percontigs
@@ -0,0 +1,14 @@
+# Prediction from macrel v1.2.0
+# Macrel calculated for the sample a density of 59.743 AMPs / Mbp.
+contig	length	ORFs	smORFs	AMPs
+k47_0	3379	4	2	0
+k47_1	6046	9	4	0
+k47_3	5202	6	3	0
+k47_7	3483	2	2	0
+k47_8	2877	3	1	0
+k47_10	3374	1	1	1
+k47_11	3415	2	1	1
+k47_12	1307	1	0	0
+k47_16	1790	3	1	0
+k47_17	1376	2	1	0
+k47_24	1228	2	1	0
diff --git a/tests/reads/command.sh b/tests/reads/command.sh
@@ -7,4 +7,4 @@ macrel reads \
     -o out
 
 gunzip out/macrel.out.prediction.gz
-
+gunzip out/macrel.out.percontigs.gz
diff --git a/tests/reads/expected.percontigs b/tests/reads/expected.percontigs
@@ -0,0 +1,15 @@
+# Prediction from macrel v1.2.0
+# Macrel calculated for the sample a density of 57.627 AMPs / Mbp.
+contig	length	ORFs	smORFs	AMPs
+k77_3	1270	1	0	0
+k77_5	5202	6	3	0
+k77_6	3527	2	2	0
+k77_7	6058	9	4	0
+k77_8	1009	2	1	0
+k77_11	1303	2	1	0
+k77_12	3374	1	1	1
+k77_13	2920	4	1	0
+k77_15	3457	2	1	1
+k77_16	1790	3	1	0
+k77_20	1380	2	1	0
+k77_23	3416	4	2	0