Commit bde736f

add scripts

rpetit3 committed Feb 14, 2023
1 parent 657fff5
Showing 4 changed files with 314 additions and 0 deletions.
13 changes: 13 additions & 0 deletions CITATION.cff
@@ -0,0 +1,13 @@
cff-version: 1.2.0
message: "If you use Bactopia, please cite it as below."
authors:
- family-names: "Petit III"
given-names: "Robert A. "
orcid: "https://orcid.org/0000-0002-1350-9426"
- family-names: "Read"
given-names: "Timothy D."
orcid: "https://orcid.org/0000-0001-8966-9680"
title: "Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes. mSystems. 5 (2020)"
doi: 10.1128/mSystems.00190-20
url: "https://github.com/bactopia/bactopia"
version: 1.4.1
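
The citation metadata above follows the Citation File Format (CFF) 1.2.0, which GitHub uses for its "Cite this repository" feature. As a quick local check, the file can also be validated and converted with the third-party cffconvert tool; the commands below are a sketch and assume cffconvert has been installed from PyPI (it is not part of this commit).

pip install cffconvert
cffconvert --validate -i CITATION.cff
cffconvert -f bibtex -i CITATION.cff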
53 changes: 53 additions & 0 deletions bin/bactopia-variants
@@ -0,0 +1,53 @@
#! /bin/bash
# bactopia-variants - Snippy-based variant calling with coverage-aware consensus masking.
#
# NOTE: This script is adapted from a Nextflow process template. Variables such as
# $options.args, $task.cpus, $prefix, $reference, $final_reference, $is_compressed,
# $read_inputs, $reference_name, ${params.mincov} and ${params.skip_compression}
# are interpolated by the calling Nextflow process and are not defined in this file.
VERSION=1.0.0
PREFIX=$1

# If no user input, print usage
if [[ $# == 0 ]]; then
    echo "bactopia-variants - v${VERSION}"
    echo ""
    echo "bactopia-variants <OPT1> ... <OPTN>"
    echo ""
    exit
fi

# Decompress the reference if it was provided gzipped
if [ "$is_compressed" == "true" ]; then
    gzip -c -d $reference > $final_reference
fi

# Call variants against the reference with Snippy
snippy \
    $options.args \
    --cpus $task.cpus \
    --outdir $prefix \
    --reference $final_reference \
    --prefix $prefix \
    $read_inputs

# Add GenBank annotations
vcf-annotator ${prefix}/${prefix}.vcf ${final_reference} > ${prefix}/${prefix}.annotated.vcf

# Get per-base coverage
grep "^##contig" ${prefix}/${prefix}.vcf > ${prefix}/${prefix}.full-coverage.txt
genomeCoverageBed -ibam ${prefix}/${prefix}.bam -d >> ${prefix}/${prefix}.full-coverage.txt
cleanup-coverage.py ${prefix}/${prefix}.full-coverage.txt > ${prefix}/${prefix}.coverage.txt
rm ${prefix}/${prefix}.full-coverage.txt

# Mask low coverage regions
mask-consensus.py \
    ${prefix} \
    ${reference_name} \
    ${prefix}/${prefix}.consensus.subs.fa \
    ${prefix}/${prefix}.subs.vcf \
    ${prefix}/${prefix}.coverage.txt \
    --mincov ${params.mincov} > ${prefix}/${prefix}.consensus.subs.masked.fa

# Clean Up
rm -rf ${prefix}/reference ${prefix}/ref.fa* ${prefix}/${prefix}.vcf.gz*
if [[ ${params.skip_compression} == "false" ]]; then
    find ${prefix}/ -type f | \
        grep -v -E "\\.bam\$|\\.bai\$|\\.log\$|\\.txt\$|\\.html\$|\\.tab\$" | \
        xargs -I {} pigz -n --best -p ${task.cpus} {}
    pigz -n --best -p ${task.cpus} ${prefix}/${prefix}.coverage.txt
fi

mv ${prefix}/ results/
mv results/*.log ./
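
For context, a minimal standalone sketch of the same sequence of steps, run outside the Nextflow process that normally interpolates the variables above. The sample name (SRX0000000), reference name, reference file (ref.gbk), read files (r1.fastq.gz, r2.fastq.gz), thread count, and --mincov value are hypothetical placeholders, not values from this commit.

SAMPLE=SRX0000000
snippy --cpus 4 --outdir ${SAMPLE} --reference ref.gbk --prefix ${SAMPLE} \
    --R1 r1.fastq.gz --R2 r2.fastq.gz
vcf-annotator ${SAMPLE}/${SAMPLE}.vcf ref.gbk > ${SAMPLE}/${SAMPLE}.annotated.vcf
grep "^##contig" ${SAMPLE}/${SAMPLE}.vcf > ${SAMPLE}/${SAMPLE}.full-coverage.txt
genomeCoverageBed -ibam ${SAMPLE}/${SAMPLE}.bam -d >> ${SAMPLE}/${SAMPLE}.full-coverage.txt
cleanup-coverage.py ${SAMPLE}/${SAMPLE}.full-coverage.txt > ${SAMPLE}/${SAMPLE}.coverage.txt
mask-consensus.py ${SAMPLE} ref ${SAMPLE}/${SAMPLE}.consensus.subs.fa \
    ${SAMPLE}/${SAMPLE}.subs.vcf ${SAMPLE}/${SAMPLE}.coverage.txt \
    --mincov 10 > ${SAMPLE}/${SAMPLE}.consensus.subs.masked.fa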
75 changes: 75 additions & 0 deletions bin/cleanup-coverage.py
@@ -0,0 +1,75 @@
#! /usr/bin/env python3
"""
usage: cleanup-coverage [-h] [--version] COVERAGE

cleanup-coverage - Reduce redundancy in per-base coverage.

positional arguments:
  COVERAGE    Output from genomeCoverageBed

optional arguments:
  -h, --help  show this help message and exit
  --version   show program's version number and exit
"""
PROGRAM = "cleanup-coverage"
VERSION = "2.2.0"
import re
import sys


def read_coverage(coverage):
    """Read the per-base coverage input."""
    accession = None
    length = None
    coverages = {}
    with open(coverage, 'rt') as coverage_fh:
        for line in coverage_fh:
            line = line.rstrip()
            if line.startswith('##'):
                # Example: ##contig=<ID=NZ_CP020108,length=5407749>
                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
                if contig:
                    accession = contig.group(1)
                    length = contig.group(2)
                    coverages[accession] = {'length': int(length), 'positions': []}
                else:
                    print(f'{line} is an unexpected format.', file=sys.stderr)
                    sys.exit(1)
            else:
                # Per-base rows from genomeCoverageBed: accession<TAB>position<TAB>coverage
                accession, position, position_coverage = line.split('\t')
                coverages[accession]['positions'].append(int(position_coverage))

    for accession, vals in coverages.items():
        if len(vals['positions']) != vals['length']:
            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
            sys.exit(1)

    return coverages


if __name__ == '__main__':
    import argparse as ap

    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
        )
    )
    parser.add_argument('coverage', metavar="COVERAGE", type=str,
                        help='Per-base coverage output from genomeCoverageBed')
    parser.add_argument('--version', action='version',
                        version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    coverages = read_coverage(args.coverage)
    for accession, vals in coverages.items():
        print(f'##contig=<ID={accession},length={vals["length"]}>')
        for cov in vals['positions']:
            print(cov)
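
A short usage sketch for cleanup-coverage.py, mirroring how bin/bactopia-variants builds its input (the ##contig lines from the Snippy VCF followed by per-base genomeCoverageBed rows). The file names and the coverage values shown in the comment are illustrative only.

grep "^##contig" sample.vcf > full-coverage.txt
genomeCoverageBed -ibam sample.bam -d >> full-coverage.txt
cleanup-coverage.py full-coverage.txt > coverage.txt
# coverage.txt keeps the ##contig header but drops the redundant accession and
# position columns, leaving one coverage value per line, e.g.:
#   ##contig=<ID=NZ_CP020108,length=5407749>
#   18
#   17
#   19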
173 changes: 173 additions & 0 deletions bin/mask-consensus.py
@@ -0,0 +1,173 @@
#! /usr/bin/env python3
"""
usage: mask-consensus [-h] [--mincov INT] [--version]
                      SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE

mask-consensus - Snippy consensus (subs) with coverage masking.

positional arguments:
  SAMPLE        Sample name
  REFERENCE     Reference name
  SUBS_FASTA    Input "consensus.subs.fa" FASTA file
  SUBS_VCF      Input ".subs.vcf" VCF file
  COVERAGE      Per-base coverage of alignment

optional arguments:
  -h, --help    show this help message and exit
  --mincov INT  Minimum required coverage to not mask.
  --version     show program's version number and exit
"""
PROGRAM = "mask-consensus"
VERSION = "2.2.0"
import re
import sys

from Bio import SeqIO


def read_coverage(coverage):
    """Read the per-base coverage input."""
    accession = None
    length = None
    coverages = {}
    with open(coverage, 'rt') as coverage_fh:
        for line in coverage_fh:
            line = line.rstrip()
            if line.startswith('##'):
                # Example: ##contig=<ID=NZ_CP020108,length=5407749>
                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
                if contig:
                    accession = contig.group(1)
                    length = contig.group(2)
                    coverages[accession] = {'length': int(length), 'positions': []}
                else:
                    print(f'{line} is an unexpected format.', file=sys.stderr)
                    sys.exit(1)
            else:
                if line:
                    coverages[accession]['positions'].append(int(line))

    for accession, vals in coverages.items():
        if len(vals['positions']) != vals['length']:
            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
            sys.exit(1)

    return coverages


def read_vcf(vcf):
    """Get positions with a substitution."""
    subs = {}
    with open(vcf, 'rt') as vcf_fh:
        for line in vcf_fh:
            if not line.startswith("#"):
                line = line.split('\t')
                # 0 = accession, 1 = position
                if line[0] not in subs:
                    subs[line[0]] = {}
                subs[line[0]][line[1]] = True
    return subs


def read_fasta(fasta):
    """Parse the input FASTA file."""
    seqs = {}
    with open(fasta, 'r') as fasta_fh:
        for record in SeqIO.parse(fasta_fh, 'fasta'):
            seqs[record.name] = str(record.seq)
    return seqs


def mask_sequence(sequence, coverages, subs, mincov):
    """Mask positions with low or no coverage in the input FASTA."""
    masked_seqs = {}

    for accession, vals in coverages.items():
        bases = []
        coverage = vals['positions']
        for i, cov in enumerate(coverage):
            if cov >= mincov:
                # Passes the coverage threshold
                if accession in subs:
                    if str(i + 1) in subs[accession]:
                        # Substitution
                        bases.append(sequence[accession][i].lower())
                    else:
                        # Same as reference
                        bases.append(sequence[accession][i])
                else:
                    # No SNPs, same as reference
                    bases.append(sequence[accession][i])
            elif cov:
                # Low coverage
                bases.append("N")
            else:
                # 0 coverage
                bases.append('n')

        if len(bases) != len(sequence[accession]):
            print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).',
                  file=sys.stderr)
            sys.exit(1)
        else:
            masked_seqs[accession] = bases

    return masked_seqs


def format_header(sample, reference, accession, length):
    """Return a newly formatted header."""
    title = 'Pseudo-seq with called substitutions and low coverage masked'
    return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]'


def chunks(s, n):
    """
    Produce `n`-character chunks from `s`.
    https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters
    """
    for start in range(0, len(s), n):
        yield s[start:start + n]


if __name__ == '__main__':
    import argparse as ap

    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
        )
    )
    parser.add_argument('sample', metavar="SAMPLE", type=str,
                        help='Sample name')
    parser.add_argument('reference', metavar="REFERENCE", type=str,
                        help='Reference name')
    parser.add_argument('fasta', metavar="SUBS_FASTA", type=str,
                        help='Input "consensus.subs.fa" FASTA file')
    parser.add_argument('vcf', metavar="SUBS_VCF", type=str,
                        help='Input ".subs.vcf" VCF file')
    parser.add_argument('coverage', metavar="COVERAGE", type=str,
                        help='Per-base coverage of alignment')
    parser.add_argument('--mincov', metavar='INT', type=int, default=10,
                        help='Minimum required coverage to not mask.')
    parser.add_argument('--version', action='version',
                        version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    coverages = read_coverage(args.coverage)
    sub_positions = read_vcf(args.vcf)
    seqs = read_fasta(args.fasta)
    masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov)
    for accession, seq in masked_seqs.items():
        header = format_header(args.sample, args.reference, accession, len(seq))
        print(header)
        for chunk in chunks(seq, 60):
            print("".join(chunk))
