Commit bde736f

add scripts

rpetit3 committed Feb 14, 2023
1 parent 657fff5
Showing 4 changed files with 314 additions and 0 deletions.
13 changes: 13 additions & 0 deletions CITATION.cff
@@ -0,0 +1,13 @@
cff-version: 1.2.0
message: "If you use Bactopia, please cite it as below."
authors:
- family-names: "Petit III"
given-names: "Robert A. "
orcid: "https://orcid.org/0000-0002-1350-9426"
- family-names: "Read"
given-names: "Timothy D."
orcid: "https://orcid.org/0000-0001-8966-9680"
title: "Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes. mSystems. 5 (2020)"
doi: 10.1128/mSystems.00190-20
url: "https://github.com/bactopia/bactopia"
version: 1.4.1
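
The citation metadata above follows the Citation File Format (CFF) 1.2.0, which GitHub uses for its "Cite this repository" feature. As a quick local check, the file can also be validated and converted with the third-party cffconvert tool; the commands below are a sketch and assume cffconvert has been installed from PyPI (it is not part of this commit).

pip install cffconvert
cffconvert --validate -i CITATION.cff
cffconvert -f bibtex -i CITATION.cff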
53 changes: 53 additions & 0 deletions bin/bactopia-variants
@@ -0,0 +1,53 @@
#! /bin/bash
# bactopia-variants - Snippy-based variant calling with coverage-aware consensus masking.
#
# NOTE: This script is adapted from a Nextflow process template. Variables such as
# $options.args, $task.cpus, $prefix, $reference, $final_reference, $is_compressed,
# $read_inputs, $reference_name, ${params.mincov} and ${params.skip_compression}
# are interpolated by the calling Nextflow process and are not defined in this file.
VERSION=1.0.0
PREFIX=$1

# If no user input, print usage
if [[ $# == 0 ]]; then
    echo "bactopia-variants - v${VERSION}"
    echo ""
    echo "bactopia-variants <OPT1> ... <OPTN>"
    echo ""
    exit
fi

# Decompress the reference if it was provided gzipped
if [ "$is_compressed" == "true" ]; then
    gzip -c -d $reference > $final_reference
fi

# Call variants against the reference with Snippy
snippy \
    $options.args \
    --cpus $task.cpus \
    --outdir $prefix \
    --reference $final_reference \
    --prefix $prefix \
    $read_inputs

# Add GenBank annotations
vcf-annotator ${prefix}/${prefix}.vcf ${final_reference} > ${prefix}/${prefix}.annotated.vcf

# Get per-base coverage
grep "^##contig" ${prefix}/${prefix}.vcf > ${prefix}/${prefix}.full-coverage.txt
genomeCoverageBed -ibam ${prefix}/${prefix}.bam -d >> ${prefix}/${prefix}.full-coverage.txt
cleanup-coverage.py ${prefix}/${prefix}.full-coverage.txt > ${prefix}/${prefix}.coverage.txt
rm ${prefix}/${prefix}.full-coverage.txt

# Mask low coverage regions
mask-consensus.py \
    ${prefix} \
    ${reference_name} \
    ${prefix}/${prefix}.consensus.subs.fa \
    ${prefix}/${prefix}.subs.vcf \
    ${prefix}/${prefix}.coverage.txt \
    --mincov ${params.mincov} > ${prefix}/${prefix}.consensus.subs.masked.fa

# Clean Up
rm -rf ${prefix}/reference ${prefix}/ref.fa* ${prefix}/${prefix}.vcf.gz*
if [[ ${params.skip_compression} == "false" ]]; then
    find ${prefix}/ -type f | \
        grep -v -E "\\.bam\$|\\.bai\$|\\.log\$|\\.txt\$|\\.html\$|\\.tab\$" | \
        xargs -I {} pigz -n --best -p ${task.cpus} {}
    pigz -n --best -p ${task.cpus} ${prefix}/${prefix}.coverage.txt
fi

mv ${prefix}/ results/
mv results/*.log ./
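
For context, a minimal standalone sketch of the same sequence of steps, run outside the Nextflow process that normally interpolates the variables above. The sample name (SRX0000000), reference name, reference file (ref.gbk), read files (r1.fastq.gz, r2.fastq.gz), thread count, and --mincov value are hypothetical placeholders, not values from this commit.

SAMPLE=SRX0000000
snippy --cpus 4 --outdir ${SAMPLE} --reference ref.gbk --prefix ${SAMPLE} \
    --R1 r1.fastq.gz --R2 r2.fastq.gz
vcf-annotator ${SAMPLE}/${SAMPLE}.vcf ref.gbk > ${SAMPLE}/${SAMPLE}.annotated.vcf
grep "^##contig" ${SAMPLE}/${SAMPLE}.vcf > ${SAMPLE}/${SAMPLE}.full-coverage.txt
genomeCoverageBed -ibam ${SAMPLE}/${SAMPLE}.bam -d >> ${SAMPLE}/${SAMPLE}.full-coverage.txt
cleanup-coverage.py ${SAMPLE}/${SAMPLE}.full-coverage.txt > ${SAMPLE}/${SAMPLE}.coverage.txt
mask-consensus.py ${SAMPLE} ref ${SAMPLE}/${SAMPLE}.consensus.subs.fa \
    ${SAMPLE}/${SAMPLE}.subs.vcf ${SAMPLE}/${SAMPLE}.coverage.txt \
    --mincov 10 > ${SAMPLE}/${SAMPLE}.consensus.subs.masked.fa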
75 changes: 75 additions & 0 deletions bin/cleanup-coverage.py
@@ -0,0 +1,75 @@
#! /usr/bin/env python3
"""
usage: cleanup-coverage [-h] [--version] COVERAGE

cleanup-coverage - Reduce redundancy in per-base coverage.

positional arguments:
  COVERAGE    Output from genomeCoverageBed

optional arguments:
  -h, --help  show this help message and exit
  --version   show program's version number and exit
"""
PROGRAM = "cleanup-coverage"
VERSION = "2.2.0"
import re
import sys


def read_coverage(coverage):
    """Read the per-base coverage input."""
    accession = None
    length = None
    coverages = {}
    with open(coverage, 'rt') as coverage_fh:
        for line in coverage_fh:
            line = line.rstrip()
            if line.startswith('##'):
                # Example: ##contig=<ID=NZ_CP020108,length=5407749>
                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
                if contig:
                    accession = contig.group(1)
                    length = contig.group(2)
                    coverages[accession] = {'length': int(length), 'positions': []}
                else:
                    print(f'{line} is an unexpected format.', file=sys.stderr)
                    sys.exit(1)
            else:
                # Per-base rows from genomeCoverageBed: accession<TAB>position<TAB>coverage
                accession, position, position_coverage = line.split('\t')
                coverages[accession]['positions'].append(int(position_coverage))

    for accession, vals in coverages.items():
        if len(vals['positions']) != vals['length']:
            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
            sys.exit(1)

    return coverages


if __name__ == '__main__':
    import argparse as ap

    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Reduce redundancy in per-base coverage.'
        )
    )
    parser.add_argument('coverage', metavar="COVERAGE", type=str,
                        help='Per-base coverage output from genomeCoverageBed')
    parser.add_argument('--version', action='version',
                        version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    coverages = read_coverage(args.coverage)
    for accession, vals in coverages.items():
        print(f'##contig=<ID={accession},length={vals["length"]}>')
        for cov in vals['positions']:
            print(cov)
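
A short usage sketch for cleanup-coverage.py, mirroring how bin/bactopia-variants builds its input (the ##contig lines from the Snippy VCF followed by per-base genomeCoverageBed rows). The file names and the coverage values shown in the comment are illustrative only.

grep "^##contig" sample.vcf > full-coverage.txt
genomeCoverageBed -ibam sample.bam -d >> full-coverage.txt
cleanup-coverage.py full-coverage.txt > coverage.txt
# coverage.txt keeps the ##contig header but drops the redundant accession and
# position columns, leaving one coverage value per line, e.g.:
#   ##contig=<ID=NZ_CP020108,length=5407749>
#   18
#   17
#   19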
173 changes: 173 additions & 0 deletions bin/mask-consensus.py
@@ -0,0 +1,173 @@
#! /usr/bin/env python3
"""
usage: mask-consensus [-h] [--mincov INT] [--version]
                      SAMPLE REFERENCE SUBS_FASTA SUBS_VCF COVERAGE

mask-consensus - Snippy consensus (subs) with coverage masking.

positional arguments:
  SAMPLE        Sample name
  REFERENCE     Reference name
  SUBS_FASTA    Input "consensus.subs.fa" FASTA file
  SUBS_VCF      Input ".subs.vcf" VCF file
  COVERAGE      Per-base coverage of alignment

optional arguments:
  -h, --help    show this help message and exit
  --mincov INT  Minimum required coverage to not mask.
  --version     show program's version number and exit
"""
PROGRAM = "mask-consensus"
VERSION = "2.2.0"
import re
import sys

from Bio import SeqIO


def read_coverage(coverage):
    """Read the per-base coverage input."""
    accession = None
    length = None
    coverages = {}
    with open(coverage, 'rt') as coverage_fh:
        for line in coverage_fh:
            line = line.rstrip()
            if line.startswith('##'):
                # Example: ##contig=<ID=NZ_CP020108,length=5407749>
                contig = re.search(r'contig=<ID=(.*),length=([0-9]+)>', line)
                if contig:
                    accession = contig.group(1)
                    length = contig.group(2)
                    coverages[accession] = {'length': int(length), 'positions': []}
                else:
                    print(f'{line} is an unexpected format.', file=sys.stderr)
                    sys.exit(1)
            else:
                if line:
                    coverages[accession]['positions'].append(int(line))

    for accession, vals in coverages.items():
        if len(vals['positions']) != vals['length']:
            print(f'Observed bases ({len(vals["positions"])}) in {accession} not expected length ({vals["length"]}).', file=sys.stderr)
            sys.exit(1)

    return coverages


def read_vcf(vcf):
    """Get positions with a substitution."""
    subs = {}
    with open(vcf, 'rt') as vcf_fh:
        for line in vcf_fh:
            if not line.startswith("#"):
                line = line.split('\t')
                # 0 = accession, 1 = position
                if line[0] not in subs:
                    subs[line[0]] = {}
                subs[line[0]][line[1]] = True
    return subs


def read_fasta(fasta):
    """Parse the input FASTA file."""
    seqs = {}
    with open(fasta, 'r') as fasta_fh:
        for record in SeqIO.parse(fasta_fh, 'fasta'):
            seqs[record.name] = str(record.seq)
    return seqs


def mask_sequence(sequence, coverages, subs, mincov):
    """Mask positions with low or no coverage in the input FASTA."""
    masked_seqs = {}

    for accession, vals in coverages.items():
        bases = []
        coverage = vals['positions']
        for i, cov in enumerate(coverage):
            if cov >= mincov:
                # Passes the coverage threshold
                if accession in subs:
                    if str(i + 1) in subs[accession]:
                        # Substitution
                        bases.append(sequence[accession][i].lower())
                    else:
                        # Same as reference
                        bases.append(sequence[accession][i])
                else:
                    # No SNPs, same as reference
                    bases.append(sequence[accession][i])
            elif cov:
                # Low coverage
                bases.append("N")
            else:
                # 0 coverage
                bases.append('n')

        if len(bases) != len(sequence[accession]):
            print(f'Masked sequence ({len(bases)}) for {accession} not expected length ({len(sequence[accession])}).',
                  file=sys.stderr)
            sys.exit(1)
        else:
            masked_seqs[accession] = bases

    return masked_seqs


def format_header(sample, reference, accession, length):
    """Return a newly formatted header."""
    title = 'Pseudo-seq with called substitutions and low coverage masked'
    return f'>gnl|{accession}|{sample} {title} [assembly_accession={reference}] [length={length}]'


def chunks(s, n):
    """
    Produce `n`-character chunks from `s`.
    https://stackoverflow.com/questions/7111068/split-string-by-count-of-characters
    """
    for start in range(0, len(s), n):
        yield s[start:start + n]


if __name__ == '__main__':
    import argparse as ap

    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Snippy consensus (subs) with coverage masking.'
        )
    )
    parser.add_argument('sample', metavar="SAMPLE", type=str,
                        help='Sample name')
    parser.add_argument('reference', metavar="REFERENCE", type=str,
                        help='Reference name')
    parser.add_argument('fasta', metavar="SUBS_FASTA", type=str,
                        help='Input "consensus.subs.fa" FASTA file')
    parser.add_argument('vcf', metavar="SUBS_VCF", type=str,
                        help='Input ".subs.vcf" VCF file')
    parser.add_argument('coverage', metavar="COVERAGE", type=str,
                        help='Per-base coverage of alignment')
    parser.add_argument('--mincov', metavar='INT', type=int, default=10,
                        help='Minimum required coverage to not mask.')
    parser.add_argument('--version', action='version',
                        version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    coverages = read_coverage(args.coverage)
    sub_positions = read_vcf(args.vcf)
    seqs = read_fasta(args.fasta)
    masked_seqs = mask_sequence(seqs, coverages, sub_positions, args.mincov)
    for accession, seq in masked_seqs.items():
        header = format_header(args.sample, args.reference, accession, len(seq))
        print(header)
        for chunk in chunks(seq, 60):
            print("".join(chunk))
