-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
129 lines (112 loc) · 3.79 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Authorship, contact and licensing metadata for this workflow file.
__author__ = "Taavi Päll"
__copyright__ = "Copyright 2019, Avilab"
__email__ = "taavi.pall@ut.ee"
__license__ = "MIT"
# Load libraries
import os
import json
import glob
import pandas as pd
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
from snakemake.utils import validate, makedirs
# Run every shell command of the workflow through bash.
shell.executable("bash")

# Load configuration file with sample and path info
configfile: "config.yaml"
validate(config, "schemas/config.schema.yaml")

# Load runs and groups.
# The samples table is whitespace-delimited; the separator is a regex, so it is
# written as a raw string to avoid an invalid "\s" escape in a normal literal.
RUNS = pd.read_csv(config["samples"], sep=r"\s+").set_index("run", drop=False)
validate(RUNS, "schemas/samples.schema.yaml")
RUN_IDS = RUNS.index.tolist()

# 1-based indices of the chunks each FASTA file is split into.
N_FILES = config["split_fasta"]["n_files"]
N = list(range(1, N_FILES + 1))

# Create slurm logs dir
makedirs("logs/slurm")
wildcard_constraints:
run = "[a-zA-Z0-9]+",
n = "\d+",
blastresult = "[-a-z]+"
# Main output files and target rules
RESULTS = ["viruses.csv", "non-viral.csv", "unassigned.fa"]

# The blastn/megablast searches are always listed; the blastx searches are
# appended only when `run_blastx` is enabled in the configuration.
BLASTV = ["blastn-virus"]
BLASTNR = ["megablast-nt", "blastn-nt"]
if config["run_blastx"]:
    BLASTV = BLASTV + ["blastx-virus"]
    BLASTNR = BLASTNR + ["blastx-nr"]
BLAST = BLASTV + BLASTNR

# Per-run statistics files.
STATS = expand(
    [
        "output/stats/{run}_host-bam-stats.txt",
        "output/stats/{run}_preprocess-stats.tsv",
        "output/stats/{run}_blast.tsv",
        "output/stats/{run}_coverage.txt",
        "output/stats/{run}_basecov.txt",
    ],
    run=RUN_IDS,
)

# Everything the workflow is asked to produce: per-run result tables and final
# contigs first, followed by the statistics files.
OUTPUTS = expand(
    [
        "output/results/{run}_{result}",
        "output/contigs/{run}_final-contigs.fa",
    ],
    run=RUN_IDS,
    result=RESULTS,
)
OUTPUTS.extend(STATS)
# Remote outputs
if config["zenodo"]["deposition_id"]:
from snakemake.remote.zenodo import RemoteProvider as ZENRemoteProvider
# Setup Zenodo RemoteProvider
ZEN = ZENRemoteProvider(deposition = config["zenodo"]["deposition_id"], access_token = os.environ["ZENODO_PAT"])
ZENOUTPUTS = ZEN.remote(expand(["output/results/{run}_results.tgz", "output/stats/{run}_assembly-stats.tgz", "output/stats/{run}_run-stats.tgz"], run = RUN_IDS))
OUTPUTS = OUTPUTS + ZENOUTPUTS
localrules: upload_results, upload_assembly, upload_stats
rule upload_results:
input:
expand("output/results/{{run}}_{result}", result = RESULTS)
output:
ZEN.remote("output/results/{run}_results.tgz")
shell:
"tar czvf {output} {input}"
rule upload_stats:
input:
rules.refgenome_bam_stats.output,
rules.preprocess_stats.output,
rules.blast_stats.output
output:
ZEN.remote("output/stats/{run}_run-stats.tgz")
shell:
"tar czvf {output} {input}"
rule upload_assembly:
input:
rules.assemble_cleanup.output.contigs,
rules.coverage.output.covstats,
rules.coverage.output.basecov
output:
ZEN.remote("output/stats/{run}_assembly-stats.tgz")
shell:
"tar czvf {output} {input}"
localrules: all
rule all:
input:
OUTPUTS
# Paths taken from the environment; each is None when its variable is unset.
HOST_GENOME = os.environ.get("REF_GENOME_HUMAN")  # path to reference genome
TAXON_DB = os.environ.get("TAXON_DB")

# Presumably the NCBI taxonomy id matching the human reference above — confirm.
HOST_TAXID = 9606

# Wrappers: every location below is the shared prefix plus a path under master/.
wrapper_prefix = "https://raw.githubusercontent.com/avilab/virome-wrappers/"

# Filtering and mapping
LN_FILTER = wrapper_prefix + "master/filter/masked"
BWA_UNMAPPED = wrapper_prefix + "master/unmapped"

# BLAST search, parsing and taxonomy
BLAST_QUERY = wrapper_prefix + "master/blast/query"
PARSE_BLAST = wrapper_prefix + "master/blast/parse"
BLAST_TAXONOMY = wrapper_prefix + "master/blast/taxonomy"

# Sequence utilities
SUBSET_FASTA = wrapper_prefix + "master/subset_fasta"
SEQ_STATS = wrapper_prefix + "master/seqkit/stats"

# Path to Repeatmasker script (points at the wrapper.py file itself)
RM = wrapper_prefix + "master/repeatmasker/wrapper.py"
# Modules
# Pull in the parts of the workflow that are kept in separate files under rules/.
include: "rules/preprocess.smk"
include: "rules/blast.smk"
onsuccess:
email = config["email"]
shell("mail -s 'Forkflow finished successfully' {email} < {log}")