diff --git a/harpy/bin/haplotag_acbd.py b/harpy/bin/haplotag_acbd.py
new file mode 100755
index 00000000..d3a26e48
--- /dev/null
+++ b/harpy/bin/haplotag_acbd.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python
+"""Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes"""
+import os
+import sys
+import argparse
+
+parser = argparse.ArgumentParser(
+    prog = 'haplotag_acbd.py',
+    description ="Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes",
+    usage = "haplotag_acbd.py output_directory",
+    exit_on_error = False
+    )
+parser.add_argument("output_directory", type = str, help = "Directory to create barcode files")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
+# Keep the path as given and build file names with os.path.join: stripping "/"
+# by hand turns the root directory "/" into "" and makes os.makedirs fail.
+outdir = args.output_directory
+os.makedirs(outdir, exist_ok = True)
+
+# Barcode sequences per segment; the 1-based list position becomes the numeric part of the ID.
+BX = {
+    "A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
+    "C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
+    "B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
+    "D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
+}
+
+# Write one "<segment><NN> <sequence>" line per barcode to BC_<segment>.txt.
+for BC in ["A","C","B","D"]:
+    with open(os.path.join(outdir, f"BC_{BC}.txt"), "w", encoding="utf-8") as f:
+        # Number the barcodes from the list itself rather than a fixed count, so none is silently dropped.
+        f.writelines(f"{BC}{number:02d} {barcode}\n" for number, barcode in enumerate(BX[BC], 1))
diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index 63e3f386..06396681 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -147,6 +147,7 @@ rule index_duplicates: "samtools index {input}" rule assign_molecules: + priority: 100 input: bam = outdir + "/samples/{sample}/{sample}.markdup.bam", bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai" diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index d04e1632..36199fc3 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -227,6 +227,7 @@ rule index_duplicates: "samtools index {input}" rule concat_alignments: + priority: 100 input: aln_bc = outdir + "/ema_align/{sample}.bc.bam", idx_bc = outdir + "/ema_align/{sample}.bc.bam.bai", diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index 717da140..e1a63f4f 100644 --- 
a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -138,6 +138,7 @@ rule index_duplicates: "samtools index {input}" rule assign_molecules: + priority: 100 input: bam = outdir + "/samples/{sample}/{sample}.markdup.bam", bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai" diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk index 2f5daa91..67f1e687 100644 --- a/harpy/snakefiles/demultiplex_gen1.smk +++ b/harpy/snakefiles/demultiplex_gen1.smk @@ -87,6 +87,7 @@ rule demultiplex_barcodes: """ rule demultiplex_samples: + priority: 100 input: outdir + "/demux_R{FR}_001.fastq.gz" output: diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk index a9f66547..a4684f6c 100644 --- a/harpy/snakefiles/impute.smk +++ b/harpy/snakefiles/impute.smk @@ -150,6 +150,7 @@ rule concat_list: _ = fout.write("\n".join(input.bcf)) rule merge_vcf: + priority: 100 input: files = outdir + "/{paramset}/bcf.files", idx = collect(outdir + "/{{paramset}}/contigs/{contig}.vcf.gz.tbi", contig = contigs) diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 78d9cebf..e4353b16 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -196,6 +196,7 @@ use rule compress_phaseblock as compress_vcf with: outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz" rule merge_het_hom: + priority: 100 input: phase = outdir + "/phase_blocks/{sample}.phased.vcf.gz", orig = outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz" diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index f450656b..43a24506 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -47,6 +47,7 @@ def get_fq2(wildcards): if not deconvolve: rule fastp: + priority: 100 input: fw = get_fq1, rv = get_fq2 @@ -73,6 +74,7 @@ if not deconvolve: """ else: rule fastp: + priority: 100 input: fw = get_fq1, rv = get_fq2 diff --git a/harpy/snakefiles/sv_leviathan.smk 
b/harpy/snakefiles/sv_leviathan.smk index 929b0e5a..f5ff4af3 100644 --- a/harpy/snakefiles/sv_leviathan.smk +++ b/harpy/snakefiles/sv_leviathan.smk @@ -125,6 +125,7 @@ rule call_variants: "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output.vcf} -t {threads} --candidates {output.candidates} 2> {log.runlog}" rule sort_variants: + priority: 100 input: outdir + "/vcf/{sample}.vcf" output: diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index 60adf24a..b48e0a94 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -172,6 +172,7 @@ rule call_variants: "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output.vcf} -t {threads} --candidates {output.candidates} 2> {log.runlog}" rule sort_variants: + priority: 100 input: outdir + "/vcf/{population}.vcf" output: diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk index e7177a6a..eae3604c 100644 --- a/harpy/snakefiles/sv_naibr.smk +++ b/harpy/snakefiles/sv_naibr.smk @@ -92,9 +92,9 @@ rule call_variants: bai = get_align_index, conf = outdir + "/workflow/input/{sample}.naibr" output: - bedpe = outdir + "/{sample}/{sample}.bedpe", - refmt = outdir + "/{sample}/{sample}.reformat.bedpe", - vcf = outdir + "/{sample}/{sample}.vcf" + bedpe = temp(outdir + "/{sample}/{sample}.bedpe"), + refmt = temp(outdir + "/{sample}/{sample}.reformat.bedpe"), + vcf = temp(outdir + "/{sample}/{sample}.vcf") log: outdir + "/logs/naibr/{sample}.naibr.log" threads: @@ -114,16 +114,13 @@ rule infer_variants: refmt = outdir + "/IGV/{sample}.reformat.bedpe", fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" - params: - outdir = lambda wc: outdir + "/" + wc.get("sample") container: None shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} - mv {input.refmt} {output.refmt} && - mv {input.vcf} {output.vcf} && - rm -rf {params.outdir} + cp 
{input.refmt} {output.refmt} + cp {input.vcf} {output.vcf} """ rule aggregate_variants: diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk index 4a983a6f..a263d862 100644 --- a/harpy/snakefiles/sv_naibr_phase.smk +++ b/harpy/snakefiles/sv_naibr_phase.smk @@ -188,9 +188,9 @@ rule call_variants: bai = outdir + "/phasedbam/{sample}.bam.bai", conf = outdir + "/workflow/input/{sample}.naibr" output: - bedpe = outdir + "/{sample}/{sample}.bedpe", - refmt = outdir + "/{sample}/{sample}.reformat.bedpe", - vcf = outdir + "/{sample}/{sample}.vcf" + bedpe = temp(outdir + "/{sample}/{sample}.bedpe"), + refmt = temp(outdir + "/{sample}/{sample}.reformat.bedpe"), + vcf = temp(outdir + "/{sample}/{sample}.vcf") log: outdir + "/logs/naibr/{sample}.naibr.log" threads: @@ -201,6 +201,7 @@ rule call_variants: "naibr {input.conf} > {log} 2>&1" rule infer_variants: + priority: 100 input: bedpe = outdir + "/{sample}/{sample}.bedpe", refmt = outdir + "/{sample}/{sample}.reformat.bedpe", @@ -210,16 +211,13 @@ rule infer_variants: refmt = outdir + "/IGV/{sample}.reformat.bedpe", fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" - params: - outdir = lambda wc: outdir + "/" + wc.get("sample") container: None shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} - mv {input.refmt} {output.refmt} && - mv {input.vcf} {output.vcf} && - rm -rf {params.outdir} + cp {input.refmt} {output.refmt} + cp {input.vcf} {output.vcf} """ rule aggregate_variants: diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk index 1750ed69..889db897 100644 --- a/harpy/snakefiles/sv_naibr_pop.smk +++ b/harpy/snakefiles/sv_naibr_pop.smk @@ -141,9 +141,9 @@ rule call_variants: bai = outdir + "/workflow/input/{population}.bam.bai", conf = outdir + "/workflow/config/{population}.naibr" output: - bedpe = outdir + "/{population}/{population}.bedpe", - refmt = outdir + 
"/{population}/{population}.reformat.bedpe", - vcf = outdir + "/{population}/{population}.vcf" + bedpe = temp(outdir + "/{population}/{population}.bedpe"), + refmt = temp(outdir + "/{population}/{population}.reformat.bedpe"), + vcf = temp(outdir + "/{population}/{population}.vcf") log: outdir + "/logs/naibr/{population}.naibr.log" threads: @@ -154,6 +154,7 @@ rule call_variants: "naibr {input.conf} > {log} 2>&1" rule infer_variants: + priority: 100 input: bedpe = outdir + "/{population}/{population}.bedpe", refmt = outdir + "/{population}/{population}.reformat.bedpe", @@ -163,16 +164,13 @@ rule infer_variants: refmt = outdir + "/IGV/{population}.reformat.bedpe", fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe", vcf = outdir + "/vcf/{population}.vcf" - params: - outdir = lambda wc: outdir + "/" + wc.get("population") container: None shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} - mv {input.refmt} {output.refmt} && - mv {input.vcf} {output.vcf} && - rm -rf {params.outdir} + cp {input.refmt} {output.refmt} + cp {input.vcf} {output.vcf} """ rule aggregate_variants_variants: diff --git a/harpy/snakefiles/sv_naibr_pop_phase.smk b/harpy/snakefiles/sv_naibr_pop_phase.smk index 70226ba4..18702076 100644 --- a/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -246,9 +246,9 @@ rule call_variants: bai = outdir + "/workflow/input/{population}.bam.bai", conf = outdir + "/workflow/config/{population}.naibr" output: - bedpe = outdir + "/{population}/{population}.bedpe", - refmt = outdir + "/{population}/{population}.reformat.bedpe", - vcf = outdir + "/{population}/{population}.vcf" + bedpe = temp(outdir + "/{population}/{population}.bedpe"), + refmt = temp(outdir + "/{population}/{population}.reformat.bedpe"), + vcf = temp(outdir + "/{population}/{population}.vcf") log: outdir + "/logs/naibr/{population}.naibr.log" threads: @@ -259,6 +259,7 @@ rule call_variants: "naibr {input.conf} > {log} 2>&1" rule 
infer_variants: + priority: 100 input: bedpe = outdir + "/{population}/{population}.bedpe", refmt = outdir + "/{population}/{population}.reformat.bedpe", @@ -268,16 +269,13 @@ rule infer_variants: refmt = outdir + "/IGV/{population}.reformat.bedpe", fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe", vcf = outdir + "/vcf/{population}.vcf" - params: - outdir = lambda wc: outdir + "/" + wc.get("population") container: None shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} - mv {input.refmt} {output.refmt} && - mv {input.vcf} {output.vcf} && - rm -rf {params.outdir} + cp {input.refmt} {output.refmt} + cp {input.vcf} {output.vcf} """ rule aggregate_variants: