diff --git a/docs/tutorial.md b/docs/tutorial.md new file mode 100644 index 00000000..43ccc398 --- /dev/null +++ b/docs/tutorial.md @@ -0,0 +1,58 @@ +# Tutorial for AMP density + +If you have not yet installed macrel, see [install](install) and for a +detailed usage, please see our [manual](usage). + +In this tutorial, we will show how to use the `expected.percontigs` output +produced in the `contigs` and `reads` mode to obtain the density of AMPs +per species. + +Consider the expected results in the folder `tests/contigs/expected.percontigs` +as one of the inputs used in this tutorial, and the table for taxonomy of contigs +available in the `example_seqs/example_taxonomy_contigs.tsv.gz`. + +First you will need to load tables into your system: + +``` +import pandas as pd + +percontigs = pd.read_table('tests/contigs/expected.percontigs', comment='#') +taxonomy = pd.read_table('example_seqs/example_taxonomy_contigs.tsv.gz') +``` + +Then, you will need to merge these tables. + +``` +percontigs = percontigs.merge(on='contig', + right=taxonomy, + how='outer') +``` + +Now, we will group results and sum values: + +``` +percontigs = percontigs.dropna() +percontigs = percontigs.drop('contig', axis=1) +percontigs = percontigs.groupby('taxonomy').agg('sum') +``` + +By now, you should have a table with the species and the total +of assembled base pairs per species as well as their total number of ORFs, +smORFs and redundant AMPs. Now you just calculate the density as follows: + +``` +percontigs['AMP_density'] = percontigs.AMPs * 1e6 / percontigs.length +percontigs.to_csv('expected.density', + sep='\t', + header=True, + index=None) +``` + +The resulting `expected.density` table should be as follows: + +| **taxonomy** | **length** | **ORFs** | **smORFs** | **AMPs** | **AMP_density** | +| :---: | :---: | :---: | :---: | :---: | :---: | +| speciesA | 4208 | 11 | 10 | 0 | 0.000 | +| speciesB | 11876 | 15 | 8 | 1 | 84.203 | +| speciesC | 5679 | 8 | 6 | 0 | 0.000 | + diff --git a/docs/usage.md b/docs/usage.md index 3ddc9b63..3dab8c39 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -89,10 +89,15 @@ macrel contigs \ In this example, we use the example file `excontigs.fna.gz` which is a FASTA file with nucleotide sequences, writing the output to `out_contigs`. -An example of output using this mode can be found at `test/contigs/expected.prediction`. +An example of output using this mode can be found at `test/contigs/expected.prediction` +and `test/contigs/expected.percontigs`, with the latter you can also calculate AMP density. + +Note that the results per contig are only produced when not using the option +`cluster`. + Additionally to the prediction table, this mode also produces two files containing -general gene prediction information in the contigs and a fasta file containing the -predicted and filtered small genes (<= 100 amino acids). +general gene prediction information in the contigs and a fasta file containing the predicted +and filtered small genes (<= 100 amino acids). To run Macrel on paired-end reads, use the `reads` subcommand: @@ -107,7 +112,9 @@ macrel reads \ The paired-end reads are given as paired files (here, `example_seqs/R1.fq.gz` and `example_seqs/R2.fq.gz`). If you only have single-end reads, you can omit the `-2` argument. An example of outputs using this mode can be found at -`test/reads/expected.prediction` and `test/reads/expected.smorfs.faa`. +`test/reads/expected.prediction`, `test/reads/expected.smorfs.faa` and +`test/contigs/expected.percontigs`, with the latter you can also calculate AMP density. + Additionally to the prediction table, this mode also produces a contigs fasta file, and the two files containing general gene prediction coordinates and a fasta file containing the predicted and filtered small genes (<= 100 amino acids). diff --git a/example_seqs/example_taxonomy_contigs.tsv.gz b/example_seqs/example_taxonomy_contigs.tsv.gz new file mode 100644 index 00000000..e8bfaa54 Binary files /dev/null and b/example_seqs/example_taxonomy_contigs.tsv.gz differ diff --git a/install.sh b/install.sh index be31f2cd..ad8b0a6e 100755 --- a/install.sh +++ b/install.sh @@ -43,11 +43,12 @@ fi ${CONDA_INSTALL_CMD} install -y \ --prefix $BASEDIR/envs/Macrel_env \ ngless \ - pyrodigal>=0.7.3 \ + "pyrodigal>=0.7.3" \ megahit \ paladin \ pandas \ - scikit-learn \ + "scikit-learn<1.3.0" \ + "joblib<1.3.0" \ atomicwrites \ tzlocal diff --git a/macrel/ORFs_prediction.py b/macrel/ORFs_prediction.py index 25369748..690cea15 100644 --- a/macrel/ORFs_prediction.py +++ b/macrel/ORFs_prediction.py @@ -31,16 +31,22 @@ def predict_genes(infile, ofile): import pandas as pd from .fasta import fasta_iter from atomicwrites import atomic_write + + clen = [] gorf, morf_finder = create_pyrodigal_orffinder() + # predict genes with atomic_write(ofile, overwrite=True) as odb: for idx, (h, s) in enumerate(fasta_iter(infile)): + orfs, smorfs = [0, 0] if len(s) <= 100_000: # if contig length less than 100kbp then not suitable for training # predict genes using metagenome pretrained models for i, pred in enumerate(morf_finder.find_genes(s)): t = ppyrodigal_out(h, i+1, idx+1, pred) odb.write(t) + orfs += 1 + if len(pred.translate()) <= 100: smorfs += 1 else: # if contig length is above or 100kbp then suitable for training of # its own model, therefore proceed in a genome wise way @@ -49,4 +55,10 @@ def predict_genes(infile, ofile): for i, pred in enumerate(gorf.find_genes(s)): t = ppyrodigal_out(h, i+1, idx+1, pred) odb.write(t) + orfs += 1 + if len(pred.translate()) <= 100: smorfs += 1 + + clen.append([h, len(s), orfs, smorfs]) + + return pd.DataFrame(clen, columns=['contig', 'length', 'ORFs', 'smORFs']) diff --git a/macrel/data/models/AMP.pkl.gz b/macrel/data/models/AMP.pkl.gz index 32b600c2..97360a23 100644 Binary files a/macrel/data/models/AMP.pkl.gz and b/macrel/data/models/AMP.pkl.gz differ diff --git a/macrel/data/models/Hemo.pkl.gz b/macrel/data/models/Hemo.pkl.gz index 10f6ccfc..21a01b27 100644 Binary files a/macrel/data/models/Hemo.pkl.gz and b/macrel/data/models/Hemo.pkl.gz differ diff --git a/macrel/main.py b/macrel/main.py index 8dbe6151..5e2f9796 100644 --- a/macrel/main.py +++ b/macrel/main.py @@ -143,9 +143,10 @@ def do_smorfs(args, tdir,logfile): peptide_file = (args.output_file if args.output_file != '-' else '/dev/stdout') # predict genes with pyrodigal - predict_genes(args.fasta_file, all_peptide_file) + clen = predict_genes(args.fasta_file, all_peptide_file) filter_smorfs(all_peptide_file, peptide_file, args.cluster, args.keep_fasta_headers) args.fasta_file = peptide_file + return clen def link_or_uncompress_fasta_file(orig, dest): @@ -271,16 +272,37 @@ def do_predict(args, tdir): import gzip fs = fasta_features(args.fasta_file) prediction = predict( - data_file("models/AMP.pkl.gz"), - data_file("models/Hemo.pkl.gz"), - fs, - args.keep_negatives) + data_file("models/AMP.pkl.gz"), + data_file("models/Hemo.pkl.gz"), + fs, + args.keep_negatives) ofile = path.join(args.output, args.outtag + '.prediction.gz') with open_output(ofile, mode='wb') as raw_out: with gzip.open(raw_out, 'wt') as out: from .macrel_version import __version__ out.write('# Prediction from macrel v{}\n'.format(__version__)) prediction.to_csv(out, sep='\t', index_label='Access', float_format="%.3f") + return prediction + +def do_density(args, clen, prediction): + tpred = prediction.reset_index() + tpred['contig'] = tpred['index'].apply(lambda x: '_'.join(x.split('_')[:-1])) + tpred = tpred[tpred['AMP_probability'] > 0.5] + tpred = tpred.groupby('contig').agg('size') + tpred = tpred.reset_index() + tpred = tpred.rename({0: 'AMPs'}, axis=1) + clen = clen.merge(on='contig', right=tpred, how='outer').fillna(0) + clen[clen.columns[1:]] = clen[clen.columns[1:]].astype(int) + ofile = path.join(args.output, args.outtag + '.percontigs.gz') + sample = clen.set_index('contig').sum(axis=0).tolist() + sample_density = sample[-1] * 1e6 / sample[0] + with open_output(ofile, mode='wb') as raw_out: + with gzip.open(raw_out, 'wt') as out: + from .macrel_version import __version__ + out.write('# Prediction from macrel v{}\n'.format(__version__)) + out.write(f'# Macrel calculated for the sample a density of {sample_density:.3f} AMPs / Mbp.\n') + clen.to_csv(out, sep='\t', index=False, float_format="%.3f") + print(f'Macrel processed the sample and verified a density of {sample_density:.3f} AMPs / Mbp.') def do_get_examples(args): try: @@ -303,7 +325,6 @@ def do_get_examples(args): print('Retrieving {}...'.format(f)) urlretrieve(BASEURL + f, 'example_seqs/'+f) - def main(args=None): if args is None: import sys @@ -323,23 +344,31 @@ def main(args=None): with tempfile.TemporaryDirectory(dir=args.tmpdir) as tdir: from .output import readme_output_abundance_mode,readme_output_contigs_mode,\ readme_output_peptides_mode,readme_output_reads_mode + + creadme = {'reads': readme_output_reads_mode, + 'contigs': readme_output_contigs_mode, + 'get-smorfs': readme_output_contigs_mode, + 'peptides': readme_output_peptides_mode, + 'abundance': readme_output_abundance_mode, + } + + # print readme + if args.output_file != '-': + with open_output(os.path.join(args.output, 'README.md')) as ofile: + ofile.write(creadme[args.command]) + + # commands if args.command == 'reads': do_assembly(args, tdir,logfile) - with open_output(os.path.join(args.output, 'README.md')) as ofile: - ofile.write(readme_output_reads_mode) if args.command in ['reads', 'contigs', 'get-smorfs']: - do_smorfs(args, tdir,logfile) - if args.output: - with open_output(os.path.join(args.output, 'README.md')) as ofile: - ofile.write(readme_output_contigs_mode) + clen = do_smorfs(args, tdir,logfile) if args.command in ['reads', 'contigs', 'peptides']: - do_predict(args, tdir) - with open_output(os.path.join(args.output, 'README.md')) as ofile: - ofile.write(readme_output_peptides_mode) + prediction = do_predict(args, tdir) + if args.command in ['reads', 'contigs']: + if not args.cluster: + do_density(args, clen, prediction) if args.command == 'abundance': do_abundance(args, tdir,logfile) - with open_output(os.path.join(args.output, 'README.md')) as ofile: - ofile.write(readme_output_abundance_mode) if __name__ == '__main__': import sys diff --git a/macrel/output.py b/macrel/output.py index ee744c42..0903784e 100644 --- a/macrel/output.py +++ b/macrel/output.py @@ -80,6 +80,23 @@ This folder contains the full outputs of running megahit for assembly.""" +percontigs_table_doc = f"""- `macrel.out.percontigs.gz` + +Compressed tab-separated table with the following columns + +1. `contig`: Identififiers (same in the fasta or assembled contigs, also include Sample - the sum of all results) +2. `length`: The length in base pairs +3. `ORFs`: Number of ORFs found +4. `smORFs`: Number of small ORFs found +5. `AMPs`: Number of sequences predicted as AMPs + +The table contains a header (as comments marked with the `#` character at the +start of the line) identifying the version of macrel used to generate these +results. + +Note that, it is only printed when not using cluster option or get-smorfs mode. +""" + readme_output_abundance_mode = f"""{header} ## Outputs for `abundance` mode @@ -100,11 +117,11 @@ {prediction_table_doc} {predicted_faas_doc} +{percontigs_table_doc} {footer} """ - readme_output_peptides_mode = f"""{header} ## Outputs for `peptides` mode @@ -121,7 +138,7 @@ {prediction_table_doc} {predicted_faas_doc} {megahit_output_doc} +{percontigs_table_doc} {footer} """ - diff --git a/mkdocs.yml b/mkdocs.yml index 1d17dc2d..1ed90bc7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -23,6 +23,7 @@ nav: - 'Macrel': index.md - 'Install': install.md - 'Usage': usage.md + - 'Tutorial': tutorial.md - "What's New (history)": whatsnew.md - 'Contacts': contact.md - 'FAQ': faq.md diff --git a/tests/contigs.nosmorfs/command.sh b/tests/contigs.nosmorfs/command.sh index 31cf0cad..3b1ca9ca 100755 --- a/tests/contigs.nosmorfs/command.sh +++ b/tests/contigs.nosmorfs/command.sh @@ -6,4 +6,4 @@ macrel contigs \ -o out gunzip out/macrel.out.prediction.gz - +gunzip out/macrel.out.percontigs.gz diff --git a/tests/contigs.nosmorfs/expected.percontigs b/tests/contigs.nosmorfs/expected.percontigs new file mode 100644 index 00000000..138570d7 --- /dev/null +++ b/tests/contigs.nosmorfs/expected.percontigs @@ -0,0 +1,4 @@ +# Prediction from macrel v1.2.0 +# Macrel calculated for the sample a density of 0.000 AMPs / Mbp. +contig length ORFs smORFs AMPs +scaffold2530_2_MH0058 1324 1 0 0 diff --git a/tests/contigs/command.sh b/tests/contigs/command.sh index 7102ae37..a2c301fa 100755 --- a/tests/contigs/command.sh +++ b/tests/contigs/command.sh @@ -7,4 +7,4 @@ macrel contigs \ --log-file log.txt gunzip out/macrel.out.prediction.gz - +gunzip out/macrel.out.percontigs.gz diff --git a/tests/contigs/expected.percontigs b/tests/contigs/expected.percontigs new file mode 100644 index 00000000..5c6b96ac --- /dev/null +++ b/tests/contigs/expected.percontigs @@ -0,0 +1,27 @@ +# Prediction from macrel v1.2.0 +# Macrel calculated for the sample a density of 45.062 AMPs / Mbp. +contig length ORFs smORFs AMPs +scaffold2530_2_MH0058 717 2 2 0 +scaffold75334_1_MH0058 3424 1 1 1 +scaffold54112_5_MH0058 728 1 1 0 +scaffold24504_2_MH0058 505 1 1 0 +scaffold95393_2_MH0058 995 2 1 0 +scaffold8449_1_MH0058 1037 2 1 0 +scaffold10455_3_MH0058 808 3 3 0 +scaffold98598_5_MH0058 3426 4 2 0 +scaffold76045_5_MH0058 960 3 3 0 +scaffold106190_2_MH0058 688 1 1 0 +C4067509_1_MH0058 534 1 1 0 +scaffold90770_1_MH0058 1031 2 2 0 +scaffold33693_17_MH0058 3481 2 1 1 +scaffold77554_3_MH0058 6086 9 4 0 +scaffold34596_7_MH0058 1345 1 0 0 +scaffold75223_9_MH0058 3597 2 2 0 +scaffold30291_4_MH0058 824 2 1 0 +scaffold107406_2_MH0058 1401 2 1 0 +scaffold16564_1_MH0058 992 2 2 0 +scaffold7019_2_MH0058 5218 6 3 0 +C4060843_1_MH0058 518 1 1 0 +scaffold20234_2_MH0058 2926 4 1 0 +C4193751_1_MH0058 1820 3 1 0 +C4177507_1_MH0058 1322 2 1 0 diff --git a/tests/reads.se/command.sh b/tests/reads.se/command.sh index 9b34dfa5..8608c1b7 100755 --- a/tests/reads.se/command.sh +++ b/tests/reads.se/command.sh @@ -6,4 +6,4 @@ macrel reads \ -o out gunzip out/macrel.out.prediction.gz - +gunzip out/macrel.out.percontigs.gz diff --git a/tests/reads.se/expected.percontigs b/tests/reads.se/expected.percontigs new file mode 100644 index 00000000..3565512c --- /dev/null +++ b/tests/reads.se/expected.percontigs @@ -0,0 +1,14 @@ +# Prediction from macrel v1.2.0 +# Macrel calculated for the sample a density of 59.743 AMPs / Mbp. +contig length ORFs smORFs AMPs +k47_0 3379 4 2 0 +k47_1 6046 9 4 0 +k47_3 5202 6 3 0 +k47_7 3483 2 2 0 +k47_8 2877 3 1 0 +k47_10 3374 1 1 1 +k47_11 3415 2 1 1 +k47_12 1307 1 0 0 +k47_16 1790 3 1 0 +k47_17 1376 2 1 0 +k47_24 1228 2 1 0 diff --git a/tests/reads/command.sh b/tests/reads/command.sh index 661eb64f..29f7d39b 100755 --- a/tests/reads/command.sh +++ b/tests/reads/command.sh @@ -7,4 +7,4 @@ macrel reads \ -o out gunzip out/macrel.out.prediction.gz - +gunzip out/macrel.out.percontigs.gz diff --git a/tests/reads/expected.percontigs b/tests/reads/expected.percontigs new file mode 100644 index 00000000..62e1c85d --- /dev/null +++ b/tests/reads/expected.percontigs @@ -0,0 +1,15 @@ +# Prediction from macrel v1.2.0 +# Macrel calculated for the sample a density of 57.627 AMPs / Mbp. +contig length ORFs smORFs AMPs +k77_3 1270 1 0 0 +k77_5 5202 6 3 0 +k77_6 3527 2 2 0 +k77_7 6058 9 4 0 +k77_8 1009 2 1 0 +k77_11 1303 2 1 0 +k77_12 3374 1 1 1 +k77_13 2920 4 1 0 +k77_15 3457 2 1 1 +k77_16 1790 3 1 0 +k77_20 1380 2 1 0 +k77_23 3416 4 2 0