Skip to content

Commit

Permalink
add priorities, punch up naibr move/copy logic (#178)
Browse files Browse the repository at this point in the history
* add priorities, punch up naibr move/copy logic

* fix this MISSION CRITICAL bug

* update the tests

* restore the missing file

* restore text condition

* fix call
  • Loading branch information
pdimens authored Dec 20, 2024
1 parent f3f891f commit f3ab27c
Show file tree
Hide file tree
Showing 14 changed files with 66 additions and 32 deletions.
33 changes: 33 additions & 0 deletions harpy/bin/haplotag_acbd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#! /usr/bin/env python
"""Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes"""
import os
import sys
import argparse

# Command-line interface: one positional argument naming the directory that
# will receive the barcode files.
parser = argparse.ArgumentParser(
    prog="haplotag_acbd.py",
    description="Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes",
    usage="haplotag_acbd.py output_directory",
    # NOTE(review): with exit_on_error=False argparse can raise ArgumentError
    # instead of printing usage and exiting; nothing here catches it.
    exit_on_error=False,
)
parser.add_argument("output_directory", type=str, help="Directory to create barcode files")

# Called with no arguments at all: print the help text to stderr and exit non-zero.
if len(sys.argv) == 1:
    parser.print_help(sys.stderr)
    sys.exit(1)

args = parser.parse_args()
# Trailing slashes are dropped from the requested directory. A path made up
# only of slashes (e.g. "/") would strip down to "", which os.makedirs rejects
# with FileNotFoundError, so keep a single slash in that case. An empty
# argument still yields "" and fails exactly as before.
outdir = args.output_directory.rstrip("/") or args.output_directory[:1]
os.makedirs(outdir, exist_ok=True)

# Barcode nucleotide sequences, keyed by the letter that appears in the
# corresponding BC_<letter>.txt output file name.
# The order inside each list is significant: the writer loop further down
# labels every sequence by its 1-based position (the first entry under "A"
# is written as "A01", the second as "A02", and so on).
BX = {
"A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
"C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
"B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
"D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
}

# Write one barcode table per key of BX, in the order A, C, B, D.
# Each output line has the form "<letter><NN> <sequence>", where NN is the
# sequence's 1-based position in its BX list, zero-padded to two digits.
for BC in ["A", "C", "B", "D"]:
    with open(os.path.join(outdir, f"BC_{BC}.txt"), "w", encoding="utf-8") as f:
        # Numbering comes from the data itself rather than a fixed count, and
        # the lines are streamed to the file instead of being collected into
        # throwaway lists first.
        f.writelines(
            f"{BC}{number:02d} {barcode}\n"
            for number, barcode in enumerate(BX[BC], start=1)
        )
1 change: 1 addition & 0 deletions harpy/snakefiles/align_bwa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ rule index_duplicates:
"samtools index {input}"

rule assign_molecules:
priority: 100
input:
bam = outdir + "/samples/{sample}/{sample}.markdup.bam",
bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai"
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/align_ema.smk
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ rule index_duplicates:
"samtools index {input}"

rule concat_alignments:
priority: 100
input:
aln_bc = outdir + "/ema_align/{sample}.bc.bam",
idx_bc = outdir + "/ema_align/{sample}.bc.bam.bai",
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/align_strobealign.smk
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ rule index_duplicates:
"samtools index {input}"

rule assign_molecules:
priority: 100
input:
bam = outdir + "/samples/{sample}/{sample}.markdup.bam",
bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai"
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/demultiplex_gen1.smk
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ rule demultiplex_barcodes:
"""

rule demultiplex_samples:
priority: 100
input:
outdir + "/demux_R{FR}_001.fastq.gz"
output:
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/impute.smk
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ rule concat_list:
_ = fout.write("\n".join(input.bcf))

rule merge_vcf:
priority: 100
input:
files = outdir + "/{paramset}/bcf.files",
idx = collect(outdir + "/{{paramset}}/contigs/{contig}.vcf.gz.tbi", contig = contigs)
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/phase.smk
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ use rule compress_phaseblock as compress_vcf with:
outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz"

rule merge_het_hom:
priority: 100
input:
phase = outdir + "/phase_blocks/{sample}.phased.vcf.gz",
orig = outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz"
Expand Down
2 changes: 2 additions & 0 deletions harpy/snakefiles/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def get_fq2(wildcards):

if not deconvolve:
rule fastp:
priority: 100
input:
fw = get_fq1,
rv = get_fq2
Expand All @@ -73,6 +74,7 @@ if not deconvolve:
"""
else:
rule fastp:
priority: 100
input:
fw = get_fq1,
rv = get_fq2
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/sv_leviathan.smk
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ rule call_variants:
"LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output.vcf} -t {threads} --candidates {output.candidates} 2> {log.runlog}"

rule sort_variants:
priority: 100
input:
outdir + "/vcf/{sample}.vcf"
output:
Expand Down
1 change: 1 addition & 0 deletions harpy/snakefiles/sv_leviathan_pop.smk
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ rule call_variants:
"LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output.vcf} -t {threads} --candidates {output.candidates} 2> {log.runlog}"

rule sort_variants:
priority: 100
input:
outdir + "/vcf/{population}.vcf"
output:
Expand Down
13 changes: 5 additions & 8 deletions harpy/snakefiles/sv_naibr.smk
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ rule call_variants:
bai = get_align_index,
conf = outdir + "/workflow/input/{sample}.naibr"
output:
bedpe = outdir + "/{sample}/{sample}.bedpe",
refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
vcf = outdir + "/{sample}/{sample}.vcf"
bedpe = temp(outdir + "/{sample}/{sample}.bedpe"),
refmt = temp(outdir + "/{sample}/{sample}.reformat.bedpe"),
vcf = temp(outdir + "/{sample}/{sample}.vcf")
log:
outdir + "/logs/naibr/{sample}.naibr.log"
threads:
Expand All @@ -114,16 +114,13 @@ rule infer_variants:
refmt = outdir + "/IGV/{sample}.reformat.bedpe",
fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
vcf = outdir + "/vcf/{sample}.vcf"
params:
outdir = lambda wc: outdir + "/" + wc.get("sample")
container:
None
shell:
"""
infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe}
mv {input.refmt} {output.refmt} &&
mv {input.vcf} {output.vcf} &&
rm -rf {params.outdir}
cp {input.refmt} {output.refmt}
cp {input.vcf} {output.vcf}
"""

rule aggregate_variants:
Expand Down
14 changes: 6 additions & 8 deletions harpy/snakefiles/sv_naibr_phase.smk
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,9 @@ rule call_variants:
bai = outdir + "/phasedbam/{sample}.bam.bai",
conf = outdir + "/workflow/input/{sample}.naibr"
output:
bedpe = outdir + "/{sample}/{sample}.bedpe",
refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
vcf = outdir + "/{sample}/{sample}.vcf"
bedpe = temp(outdir + "/{sample}/{sample}.bedpe"),
refmt = temp(outdir + "/{sample}/{sample}.reformat.bedpe"),
vcf = temp(outdir + "/{sample}/{sample}.vcf")
log:
outdir + "/logs/naibr/{sample}.naibr.log"
threads:
Expand All @@ -201,6 +201,7 @@ rule call_variants:
"naibr {input.conf} > {log} 2>&1"

rule infer_variants:
priority: 100
input:
bedpe = outdir + "/{sample}/{sample}.bedpe",
refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
Expand All @@ -210,16 +211,13 @@ rule infer_variants:
refmt = outdir + "/IGV/{sample}.reformat.bedpe",
fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
vcf = outdir + "/vcf/{sample}.vcf"
params:
outdir = lambda wc: outdir + "/" + wc.get("sample")
container:
None
shell:
"""
infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe}
mv {input.refmt} {output.refmt} &&
mv {input.vcf} {output.vcf} &&
rm -rf {params.outdir}
cp {input.refmt} {output.refmt}
cp {input.vcf} {output.vcf}
"""

rule aggregate_variants:
Expand Down
14 changes: 6 additions & 8 deletions harpy/snakefiles/sv_naibr_pop.smk
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ rule call_variants:
bai = outdir + "/workflow/input/{population}.bam.bai",
conf = outdir + "/workflow/config/{population}.naibr"
output:
bedpe = outdir + "/{population}/{population}.bedpe",
refmt = outdir + "/{population}/{population}.reformat.bedpe",
vcf = outdir + "/{population}/{population}.vcf"
bedpe = temp(outdir + "/{population}/{population}.bedpe"),
refmt = temp(outdir + "/{population}/{population}.reformat.bedpe"),
vcf = temp(outdir + "/{population}/{population}.vcf")
log:
outdir + "/logs/naibr/{population}.naibr.log"
threads:
Expand All @@ -154,6 +154,7 @@ rule call_variants:
"naibr {input.conf} > {log} 2>&1"

rule infer_variants:
priority: 100
input:
bedpe = outdir + "/{population}/{population}.bedpe",
refmt = outdir + "/{population}/{population}.reformat.bedpe",
Expand All @@ -163,16 +164,13 @@ rule infer_variants:
refmt = outdir + "/IGV/{population}.reformat.bedpe",
fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe",
vcf = outdir + "/vcf/{population}.vcf"
params:
outdir = lambda wc: outdir + "/" + wc.get("population")
container:
None
shell:
"""
infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe}
mv {input.refmt} {output.refmt} &&
mv {input.vcf} {output.vcf} &&
rm -rf {params.outdir}
cp {input.refmt} {output.refmt}
cp {input.vcf} {output.vcf}
"""

rule aggregate_variants_variants:
Expand Down
14 changes: 6 additions & 8 deletions harpy/snakefiles/sv_naibr_pop_phase.smk
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,9 @@ rule call_variants:
bai = outdir + "/workflow/input/{population}.bam.bai",
conf = outdir + "/workflow/config/{population}.naibr"
output:
bedpe = outdir + "/{population}/{population}.bedpe",
refmt = outdir + "/{population}/{population}.reformat.bedpe",
vcf = outdir + "/{population}/{population}.vcf"
bedpe = temp(outdir + "/{population}/{population}.bedpe"),
refmt = temp(outdir + "/{population}/{population}.reformat.bedpe"),
vcf = temp(outdir + "/{population}/{population}.vcf")
log:
outdir + "/logs/naibr/{population}.naibr.log"
threads:
Expand All @@ -259,6 +259,7 @@ rule call_variants:
"naibr {input.conf} > {log} 2>&1"

rule infer_variants:
priority: 100
input:
bedpe = outdir + "/{population}/{population}.bedpe",
refmt = outdir + "/{population}/{population}.reformat.bedpe",
Expand All @@ -268,16 +269,13 @@ rule infer_variants:
refmt = outdir + "/IGV/{population}.reformat.bedpe",
fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe",
vcf = outdir + "/vcf/{population}.vcf"
params:
outdir = lambda wc: outdir + "/" + wc.get("population")
container:
None
shell:
"""
infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe}
mv {input.refmt} {output.refmt} &&
mv {input.vcf} {output.vcf} &&
rm -rf {params.outdir}
cp {input.refmt} {output.refmt}
cp {input.vcf} {output.vcf}
"""

rule aggregate_variants:
Expand Down

0 comments on commit f3ab27c

Please sign in to comment.