-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
314 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
cff-version: 1.2.0 | ||
message: "If you use Bactopia, please cite it as below." | ||
authors: | ||
- family-names: "Petit III" | ||
given-names: "Robert A. " | ||
orcid: "https://orcid.org/0000-0002-1350-9426" | ||
- family-names: "Read" | ||
given-names: "Timothy D." | ||
orcid: "https://orcid.org/0000-0001-8966-9680" | ||
title: "Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes. mSystems. 5 (2020)" | ||
doi: 10.1128/mSystems.00190-20 | ||
url: "https://github.com/bactopia/bactopia" | ||
version: 1.4.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#! /bin/bash | ||
VERSION=1.0.0 | ||
PREFIX=$1 | ||
|
||
# If no user input, print usage | ||
if [[ $# == 0 ]]; then | ||
echo "bactopia-variants - v${VERSION}" | ||
echo "" | ||
echo "bactopia-variants <OPT1> ... <OPTN>" | ||
echo "" | ||
exit | ||
fi | ||
|
||
if [ "$is_compressed" == "true" ]; then | ||
gzip -c -d $reference > $final_reference | ||
fi | ||
snippy \\ | ||
$options.args \\ | ||
--cpus $task.cpus \\ | ||
--outdir $prefix \\ | ||
--reference $final_reference \\ | ||
--prefix $prefix \\ | ||
$read_inputs | ||
|
||
# Add GenBank annotations | ||
vcf-annotator ${prefix}/${prefix}.vcf ${final_reference} > ${prefix}/${prefix}.annotated.vcf | ||
|
||
# Get per-base coverage | ||
grep "^##contig" ${prefix}/${prefix}.vcf > ${prefix}/${prefix}.full-coverage.txt | ||
genomeCoverageBed -ibam ${prefix}/${prefix}.bam -d >> ${prefix}/${prefix}.full-coverage.txt | ||
cleanup-coverage.py ${prefix}/${prefix}.full-coverage.txt > ${prefix}/${prefix}.coverage.txt | ||
rm ${prefix}/${prefix}.full-coverage.txt | ||
|
||
# Mask low coverage regions | ||
mask-consensus.py \\ | ||
${prefix} \\ | ||
${reference_name} \\ | ||
${prefix}/${prefix}.consensus.subs.fa \\ | ||
${prefix}/${prefix}.subs.vcf \\ | ||
${prefix}/${prefix}.coverage.txt \\ | ||
--mincov ${params.mincov} > ${prefix}/${prefix}.consensus.subs.masked.fa | ||
|
||
# Clean Up | ||
rm -rf ${prefix}/reference ${prefix}/ref.fa* ${prefix}/${prefix}.vcf.gz* | ||
if [[ ${params.skip_compression} == "false" ]]; then | ||
find ${prefix}/ -type f | \ | ||
grep -v -E "\\.bam\$|\\.bai\$|\\.log\$|\\.txt\$|\\.html\$|\\.tab\$" | \ | ||
xargs -I {} pigz -n --best -p ${task.cpus} {} | ||
pigz -n --best -p ${task.cpus} ${prefix}/${prefix}.coverage.txt | ||
fi | ||
|
||
mv ${prefix}/ results/ | ||
mv results/*.log ./ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#! /usr/bin/env python3 | ||
""" | ||
usage: cleanup-coverage [-h] [--mincov INT] [--version] COVERAGE | ||
cleanup-coverage - Reduce redundancy in per-base coverage. | ||
positional arguments: | ||
COVERAGE Output from genomeBedCoverage | ||
optional arguments: | ||
-h, --help show this help message and exit | ||
--version show program's version number and exit | ||
""" | ||
PROGRAM = "cleanup-coverage" | ||
VERSION = "2.2.0" | ||
import sys | ||
|
||
def read_coverage(coverage): | ||
"""Read the per-base coverage input.""" | ||
import re | ||
accession = None | ||
length = None | ||
first_line = True | ||
coverages = {} | ||
with open(coverage, 'rt') as coverage_fh: | ||
for line in coverage_fh: | ||
line = line.rstrip() | ||
if line.startswith('##'): | ||
# ##contig=<ID=NZ_CP020108,length=5407749> | ||
contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line) | ||
if contig: | ||
accession = contig.group(1) | ||
length = contig.group(2) | ||
coverages[accession] = {'length':int(length), 'positions': []} | ||
else: | ||
print(f'{line} is an unexpected format.', file=sys.stderr) | ||
sys.exit(1) | ||
else: | ||
accession, position, coverage = line.split('\t') | ||
coverages[accession]['positions'].append(int(coverage)) | ||
|
||
for accession, vals in coverages.items(): | ||
if len(vals['positions']) != vals['length']: | ||
print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) | ||
sys.exit(1) | ||
|
||
return coverages | ||
|
||
if __name__ == '__main__': | ||
import argparse as ap | ||
import sys | ||
|
||
parser = ap.ArgumentParser( | ||
prog=PROGRAM, | ||
conflict_handler='resolve', | ||
description=( | ||
f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' | ||
) | ||
) | ||
parser.add_argument('coverage', metavar="COVERAGE", type=str, | ||
help='Directory where BLAST databases are stored') | ||
parser.add_argument('--version', action='version', | ||
version=f'{PROGRAM} {VERSION}') | ||
|
||
if len(sys.argv) == 1: | ||
parser.print_help() | ||
sys.exit(0) | ||
|
||
args = parser.parse_args() | ||
|
||
coverages = read_coverage(args.coverage) | ||
for accession, vals in coverages.items(): | ||
print(f'##contig=<ID={accession},length={vals["length"]}>') | ||
for cov in vals['positions']: | ||
print(cov) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
#! /usr/bin/env python3 | ||
""" | ||
usage: mask-consensus [-h] [--mincov INT] [--version] | ||
SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE | ||
mask-consensus - Snippy consensus (subs) with coverage masking. | ||
positional arguments: | ||
SAMPLE Sample name | ||
REFERENCE Reference name | ||
SUBS_FASTA Input "consensus.subs.fa" FASTA file | ||
SUBS_VCF Input ".subs.vcf" VCF file | ||
COVERAGE Per-base coverage of alignment | ||
optional arguments: | ||
-h, --help show this help message and exit | ||
--mincov INT Minimum required coverage to not mask. | ||
--version show program's version number and exit | ||
""" | ||
PROGRAM = "mask-consensus" | ||
VERSION = "2.2.0" | ||
import sys | ||
|
||
|
||
def read_coverage(coverage): | ||
"""Read the per-base coverage input.""" | ||
import re | ||
accession = None | ||
length = None | ||
first_line = True | ||
coverages = {} | ||
with open(coverage, 'rt') as coverage_fh: | ||
for line in coverage_fh: | ||
line = line.rstrip() | ||
if line.startswith('##'): | ||
# ##contig=<ID=NZ_CP020108,length=5407749> | ||
contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line) | ||
if contig: | ||
accession = contig.group(1) | ||
length = contig.group(2) | ||
coverages[accession] = {'length':int(length), 'positions': []} | ||
else: | ||
print(f'{line} is an unexpected format.', file=sys.stderr) | ||
sys.exit(1) | ||
else: | ||
if line: | ||
coverages[accession]['positions'].append(int(line)) | ||
|
||
for accession, vals in coverages.items(): | ||
if len(vals['positions']) != vals['length']: | ||
print(f'Observed bases ({len(vals["positions"])} in {accession} not expected length ({vals["length"]}).', file=sys.stderr) | ||
sys.exit(1) | ||
|
||
return coverages | ||
|
||
|
||
def read_vcf(vcf): | ||
"""Get positions with a substitution.""" | ||
subs = {} | ||
with open(vcf, 'rt') as vcf_fh: | ||
for line in vcf_fh: | ||
if not line.startswith("#"): | ||
line = line.split('\t') | ||
# 0 = accession, 1 = position | ||
if line[0] not in subs: | ||
subs[line[0]] = {} | ||
subs[line[0]][line[1]] = True | ||
return subs | ||
|
||
|
||
def read_fasta(fasta): | ||
"""Parse the input FASTA file.""" | ||
from Bio import SeqIO | ||
seqs = {} | ||
with open(fasta, 'r') as fasta_fh: | ||
for record in SeqIO.parse(fasta_fh,'fasta'): | ||
seqs[record.name] = str(record.seq) | ||
return seqs | ||
|
||
|
||
def mask_sequence(sequence, coverages, subs, mincov): | ||
"""Mask positions with low or no coverage in the input FASTA.""" | ||
masked_seqs = {} | ||
|
||
for accession, vals in coverages.items(): | ||
bases = [] | ||
coverage = vals['positions'] | ||
for i, cov in enumerate(coverage): | ||
if cov >= mincov: | ||
# Passes | ||
if accession in subs: | ||
if str(i+1) in subs[accession]: | ||
# Substitution | ||
bases.append(sequence[accession][i].lower()) | ||
else: | ||
# Same as reference | ||
bases.append(sequence[accession][i]) | ||
else: | ||
# No SNPs, Same as reference | ||
bases.append(sequence[accession][i]) | ||
elif cov: | ||
# Low coverage | ||
bases.append("N") | ||
else: | ||
# 0 coverage | ||
bases.append('n') | ||
|
||
if len(bases) != len(sequence[accession]): | ||
print(f'Masked sequence ({len(bases)} for {accession} not expected length ({len(sequence[accession])}).', | ||
file=sys.stderr) | ||
sys.exit(1) | ||
else: | ||
masked_seqs[accession] = bases | ||
|
||
return masked_seqs | ||
|
||
|
||
def format_header(sample, reference, accession, length): | ||
"""Return a newly formatted header.""" | ||
title = f'Pseudo-seq with called substitutions and low coverage masked' | ||
return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]' | ||
|
||
|
||
def chunks(s, n): | ||
""" | ||
Produce `n`-character chunks from `s`. | ||
https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters | ||
""" | ||
for start in range(0, len(s), n): | ||
yield s[start:start+n] | ||
|
||
|
||
if __name__ == '__main__': | ||
import argparse as ap | ||
import sys | ||
|
||
parser = ap.ArgumentParser( | ||
prog=PROGRAM, | ||
conflict_handler='resolve', | ||
description=( | ||
f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.' | ||
) | ||
) | ||
parser.add_argument('sample', metavar="SAMPLE", type=str, | ||
help='Sample name') | ||
parser.add_argument('reference', metavar="REFERENCE", type=str, | ||
help='Reference name') | ||
parser.add_argument('fasta', metavar="SUBS_FASTA", type=str, | ||
help='Input "consensus.subs.fa" FASTA file') | ||
parser.add_argument('vcf', metavar="SUBS_VCF", type=str, | ||
help='Input ".subs.vcf" VCF file') | ||
parser.add_argument('coverage', metavar="COVERAGE", type=str, | ||
help='Per-base coverage of alignment') | ||
parser.add_argument('--mincov', metavar='INT', type=int, default=10, | ||
help='Minimum required coverage to not mask.') | ||
parser.add_argument('--version', action='version', | ||
version=f'{PROGRAM} {VERSION}') | ||
|
||
if len(sys.argv) == 1: | ||
parser.print_help() | ||
sys.exit(0) | ||
|
||
args = parser.parse_args() | ||
|
||
coverages = read_coverage(args.coverage) | ||
sub_positions = read_vcf(args.vcf) | ||
seqs = read_fasta(args.fasta) | ||
masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov) | ||
for accession, seq in masked_seqs.items(): | ||
header = format_header(args.sample, args.reference, accession, len(seq)) | ||
print(header) | ||
for chunk in chunks(seq, 60): | ||
print("".join(chunk)) |