diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..7157760 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,13 @@ +cff-version: 1.2.0 +message: "If you use Bactopia, please cite it as below." +authors: +- family-names: "Petit III" + given-names: "Robert A. " + orcid: "https://orcid.org/0000-0002-1350-9426" +- family-names: "Read" + given-names: "Timothy D." + orcid: "https://orcid.org/0000-0001-8966-9680" +title: "Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes. mSystems. 5 (2020)" +doi: 10.1128/mSystems.00190-20 +url: "https://github.com/bactopia/bactopia" +version: 1.4.1 diff --git a/bin/bactopia-variants b/bin/bactopia-variants new file mode 100755 index 0000000..5651699 --- /dev/null +++ b/bin/bactopia-variants @@ -0,0 +1,53 @@ +#! /bin/bash +VERSION=1.0.0 +PREFIX=$1 + +# If no user input, print usage +if [[ $# == 0 ]]; then + echo "bactopia-variants - v${VERSION}" + echo "" + echo "bactopia-variants ... " + echo "" + exit +fi + +if [ "$is_compressed" == "true" ]; then + gzip -c -d $reference > $final_reference +fi +snippy \\ + $options.args \\ + --cpus $task.cpus \\ + --outdir $prefix \\ + --reference $final_reference \\ + --prefix $prefix \\ + $read_inputs + +# Add GenBank annotations +vcf-annotator ${prefix}/${prefix}.vcf ${final_reference} > ${prefix}/${prefix}.annotated.vcf + +# Get per-base coverage +grep "^##contig" ${prefix}/${prefix}.vcf > ${prefix}/${prefix}.full-coverage.txt +genomeCoverageBed -ibam ${prefix}/${prefix}.bam -d >> ${prefix}/${prefix}.full-coverage.txt +cleanup-coverage.py ${prefix}/${prefix}.full-coverage.txt > ${prefix}/${prefix}.coverage.txt +rm ${prefix}/${prefix}.full-coverage.txt + +# Mask low coverage regions +mask-consensus.py \\ + ${prefix} \\ + ${reference_name} \\ + ${prefix}/${prefix}.consensus.subs.fa \\ + ${prefix}/${prefix}.subs.vcf \\ + ${prefix}/${prefix}.coverage.txt \\ + --mincov ${params.mincov} > ${prefix}/${prefix}.consensus.subs.masked.fa + +# Clean Up +rm -rf ${prefix}/reference ${prefix}/ref.fa* ${prefix}/${prefix}.vcf.gz* +if [[ ${params.skip_compression} == "false" ]]; then + find ${prefix}/ -type f | \ + grep -v -E "\\.bam\$|\\.bai\$|\\.log\$|\\.txt\$|\\.html\$|\\.tab\$" | \ + xargs -I {} pigz -n --best -p ${task.cpus} {} + pigz -n --best -p ${task.cpus} ${prefix}/${prefix}.coverage.txt +fi + +mv ${prefix}/ results/ +mv results/*.log ./ diff --git a/bin/cleanup-coverage.py b/bin/cleanup-coverage.py new file mode 100755 index 0000000..73bca93 --- /dev/null +++ b/bin/cleanup-coverage.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python3 +""" +usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE + +cleanup-coverage - Reduce redundancy in per-base coverage. + +positional arguments: + COVERAGE Output from genomeBedCoverage + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit +""" +PROGRAM = "cleanup-coverage" +VERSION = "2.2.0" +import sys + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + accession, position, coverage = line.split('\t') + coverages[accession]['positions'].append(int(coverage)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' + ) + ) + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Directory where BLAST databases are stored') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + for accession, vals in coverages.items(): + print(f'##contig=') + for cov in vals['positions']: + print(cov) diff --git a/bin/mask-consensus.py b/bin/mask-consensus.py new file mode 100755 index 0000000..105795e --- /dev/null +++ b/bin/mask-consensus.py @@ -0,0 +1,173 @@ +#! /usr/bin/env python3 +""" +usage: mask-consensus [-h] [--mincov INT] [--version] + SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE + +mask-consensus - Snippy consensus (subs) with coverage masking. + +positional arguments: + SAMPLE Sample name + REFERENCE Reference name + SUBS_FASTA Input "consensus.subs.fa" FASTA file + SUBS_VCF Input ".subs.vcf" VCF file + COVERAGE Per-base coverage of alignment + +optional arguments: + -h, --help show this help message and exit + --mincov INT Minimum required coverage to not mask. + --version show program's version number and exit +""" +PROGRAM = "mask-consensus" +VERSION = "2.2.0" +import sys + + +def read_coverage(coverage): + """Read the per-base coverage input.""" + import re + accession = None + length = None + first_line = True + coverages = {} + with open(coverage, 'rt') as coverage_fh: + for line in coverage_fh: + line = line.rstrip() + if line.startswith('##'): + # ##contig= + contig = re.search(r'contig=', line) + if contig: + accession = contig.group(1) + length = contig.group(2) + coverages[accession] = {'length':int(length), 'positions': []} + else: + print(f'{line} is an unexpected format.', file=sys.stderr) + sys.exit(1) + else: + if line: + coverages[accession]['positions'].append(int(line)) + + for accession, vals in coverages.items(): + if len(vals['positions']) != vals['length']: + print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) + sys.exit(1) + + return coverages + + +def read_vcf(vcf): + """Get positions with a substitution.""" + subs = {} + with open(vcf, 'rt') as vcf_fh: + for line in vcf_fh: + if not line.startswith("#"): + line = line.split('\t') + # 0 = accession, 1 = position + if line[0] not in subs: + subs[line[0]] = {} + subs[line[0]][line[1]] = True + return subs + + +def read_fasta(fasta): + """Parse the input FASTA file.""" + from Bio import SeqIO + seqs = {} + with open(fasta, 'r') as fasta_fh: + for record in SeqIO.parse(fasta_fh,'fasta'): + seqs[record.name] = str(record.seq) + return seqs + + +def mask_sequence(sequence, coverages, subs, mincov): + """Mask positions with low or no coverage in the input FASTA.""" + masked_seqs = {} + + for accession, vals in coverages.items(): + bases = [] + coverage = vals['positions'] + for i, cov in enumerate(coverage): + if cov >= mincov: + # Passes + if accession in subs: + if str(i+1) in subs[accession]: + # Substitution + bases.append(sequence[accession][i].lower()) + else: + # Same as reference + bases.append(sequence[accession][i]) + else: + # No SNPs, Same as reference + bases.append(sequence[accession][i]) + elif cov: + # Low coverage + bases.append("N") + else: + # 0 coverage + bases.append('n') + + if len(bases) != len(sequence[accession]): + print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', + file=sys.stderr) + sys.exit(1) + else: + masked_seqs[accession] = bases + + return masked_seqs + + +def format_header(sample, reference, accession, length): + """Return a newly formatted header.""" + title = f'Pseudo-seq with called substitutions and low coverage masked' + return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' + + +def chunks(s, n): + """ + Produce `n`-character chunks from `s`. + https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters + """ + for start in range(0, len(s), n): + yield s[start:start+n] + + +if __name__ == '__main__': + import argparse as ap + import sys + + parser = ap.ArgumentParser( + prog=PROGRAM, + conflict_handler='resolve', + description=( + f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' + ) + ) + parser.add_argument('sample', metavar="SAMPLE", type=str, + help='Sample name') + parser.add_argument('reference', metavar="REFERENCE", type=str, + help='Reference name') + parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, + help='Input "consensus.subs.fa" FASTA file') + parser.add_argument('vcf', metavar="SUBS_VCF", type=str, + help='Input ".subs.vcf" VCF file') + parser.add_argument('coverage', metavar="COVERAGE", type=str, + help='Per-base coverage of alignment') + parser.add_argument('--mincov', metavar='INT', type=int, default=10, + help='Minimum required coverage to not mask.') + parser.add_argument('--version', action='version', + version=f'{PROGRAM} {VERSION}') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + coverages = read_coverage(args.coverage) + sub_positions = read_vcf(args.vcf) + seqs = read_fasta(args.fasta) + masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) + for accession, seq in masked_seqs.items(): + header = format_header(args.sample, args.reference, accession, len(seq)) + print(header) + for chunk in chunks(seq, 60): + print("".join(chunk))