@@ -6,7 +6,6 @@ PROJECT = config["project"] + "/"
6
6
7
7
rule final :
8
8
input : expand ("{project}/fastqc_raw/{data}_R1_fastqc.zip \
9
- {project}/fastqc_pandaseq/{data}_fastqc.zip \
10
9
{project}/{prog}/clst/{ds}.minsize{minsize}.usearch_smallmem.fasta \
11
10
{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.taxonomy \
12
11
{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.biom \
@@ -37,6 +36,7 @@ rule fastqc:
37
36
adapters = config ["adapters_fasta" ]
38
37
log : "fastqc_raw.log"
39
38
threads : 2
39
+ conda : "envs/fastqc.yaml"
40
40
run :
41
41
shell ("fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.forward} > {params.dir}/{log}" )
42
42
shell ("fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.reverse} > {params.dir}/{log}" )
@@ -46,7 +46,7 @@ rule pandaseq:
46
46
forward = "{project}/gunzip/{data}_R1.fastq" ,
47
47
reverse = "{project}/gunzip/{data}_R2.fastq"
48
48
output :
49
- fastq = "{project}/pandaseq/{data}.fastq "
49
+ fasta = "{project}/pandaseq/{data}.fasta "
50
50
params :
51
51
overlap = config ['pandaseq_overlap' ],
52
52
quality = config ['pandaseq_quality' ],
@@ -56,8 +56,8 @@ rule pandaseq:
56
56
reverse_primer = config ['reverse_primer' ]
57
57
log : "{project}/pandaseq/{data}_pandaseq.stdout"
58
58
threads : 1
59
- #shell : "source /data/tools/RDP_Assembler/1.0.3/env.sh; pandaseq -N -o {params.overlap} -e {params.quality} -F -d rbfkms -l {params.minlength} -L {params.maxlength} -T {threads} -f {input.forward} -r {input.reverse} 1> {output.fastq} 2> {log} "
60
- shell : "/data/tools/ pandaseq/2.9/bin/pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -A rdp_mle -T {threads} -w {output.fastq } -g {log}"
59
+ conda : "envs/ pandaseq.yaml "
60
+ shell : "pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -A rdp_mle -T {threads} -w {output.fasta } -g {log}"
61
61
62
62
rule fastqc_pandaseq :
63
63
input :
@@ -69,9 +69,11 @@ rule fastqc_pandaseq:
69
69
adapters = config ["adapters_fasta" ]
70
70
log : "fastqc.log"
71
71
threads : 8
72
+ conda : "envs/fastqc.yaml"
72
73
shell : "fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.fastq} > {params.dir}/{log}"
73
74
74
75
76
+ #Obsolete
75
77
rule primer_matching :
76
78
input :
77
79
"{project}/pandaseq/{data}.fastq"
@@ -86,7 +88,7 @@ rule primer_matching:
86
88
shell ("cat {params.prefix_forward}* | fastx_reverse_complement | /data/tools/flexbar/2.5/flexbar -t {params.prefix_reverse} -b primers.fasta --reads - --barcode-trim-end LEFT --barcode-min-overlap 10 --barcode-threshold 3 --min-read-length 50 --barcode-unassigned >> {log}" )
87
89
shell ("cat {params.prefix_reverse}* > {output}" )
88
90
89
-
91
+ #Obsolete
90
92
rule fastq2fasta :
91
93
input :
92
94
fastq = "{project}/pandaseq/{data}.fastq"
@@ -104,28 +106,20 @@ rule mergefiles:
104
106
samples = config ["data" ]
105
107
shell : """cat {input} > {output}"""
106
108
107
- ########
108
- # VSEARCH PIPELINE
109
- ########
110
- USEARCH = "/data/tools/usearch/7.0.1090/usearch"
111
- VSEARCH = "/data/tools/vsearch/1.0.10/vsearch-1.0.10-linux-x86_64"
112
- # VSEARCH commands are compatible with USEARCH commands
113
- # depending on the output file that is requested the required program is chosen:
114
- # Example: snakemake -j 8 vsearch/B2R.results.txt -p
115
-
116
109
# Dereplication
117
110
# Dereplicate the merged reads with either vsearch or usearch.
#
# The program is selected by the {prog} path component of the requested
# output, e.g. <project>/vsearch/<ds>.derep.fasta runs `vsearch`.
#
# This rule uses `shell:` instead of `run:` because Snakemake only applies a
# `conda:` environment to shell/script/wrapper rules; with `run:` the
# environment is not used (and newer Snakemake versions reject the rule).
# Both supported program names equal the wildcard value, so the command is
# taken directly from {wildcards.prog}; the constraint below keeps the rule
# from matching any other value.
rule derep:
    input:
        "{project}/mergefiles/{ds}.fasta",
    output:
        # Marked temp(): Snakemake removes the file once no consumer needs it.
        temp("{project}/{prog}/{ds}.derep.fasta")
    wildcard_constraints:
        prog = "vsearch|usearch"
    threads: 8
    conda: "envs/vsearch.yaml"
    shell:
        "{wildcards.prog} -derep_fulllength {input} -output {output} -sizeout -threads {threads}"
130
124
131
125
# Abundance sort and discard singletons
@@ -137,13 +131,14 @@ rule sortbysize:
137
131
params :
138
132
minsize = "{minsize}"
139
133
threads : 8
134
+ conda : "envs/vsearch.yaml"
140
135
run :
141
136
cmd = ""
142
137
if wildcards .prog == "vsearch" :
143
- cmd = VSEARCH
138
+ cmd = "vsearch"
144
139
shell ("{cmd} -sortbysize {input} -fasta_width 0 -output {output} -threads {threads} -minsize {params.minsize}" )
145
140
elif wildcards .prog == "usearch" :
146
- cmd = USEARCH
141
+ cmd = "usearch"
147
142
shell ("{cmd} -sortbysize {input} -output {output} -minsize {params.minsize}" )
148
143
149
144
# Uclust clustering
@@ -153,12 +148,13 @@ rule smallmem:
153
148
output :
154
149
otus = protected ("{project}/{prog}/clst/{ds}.minsize{minsize}.usearch_smallmem.fasta" )
155
150
threads : 8
151
+ conda : "envs/vsearch.yaml"
156
152
run :
157
153
cmd = ""
158
154
if wildcards .prog == "vsearch" :
159
- cmd = VSEARCH
155
+ cmd = "vsearch"
160
156
elif wildcards .prog == "usearch" :
161
- cmd = USEARCH
157
+ cmd = "usearch"
162
158
shell ("{cmd} --cluster_smallmem {input} --usersort -centroids {output.otus} --id 0.97 -sizeout" )
163
159
164
160
rule cluster_fast :
@@ -167,14 +163,16 @@ rule cluster_fast:
167
163
output :
168
164
otus = protected ("{project}/{prog}/{ds}.minsize{minsize}.usearch_cluster_fast.fasta" )
169
165
threads : 8
166
+ conda : "envs/vsearch.yaml"
170
167
run :
171
168
cmd = ""
172
169
if wildcards .prog == "vsearch" :
173
- cmd = VSEARCH
170
+ cmd = "vsearch"
174
171
elif wildcards .prog == "usearch" :
175
- cmd = USEARCH
172
+ cmd = "usearch"
176
173
shell ("{cmd} --cluster_fast {input} --usersort -centroids {output.otus} --id 0.97 -sizeout" )
177
174
175
+ # No longer supported since it is not in bioconda
178
176
rule uparse :
179
177
input :
180
178
"{project}/{prog}/{ds}.sorted.minsize{minsize}.fasta"
@@ -194,53 +192,56 @@ rule uchime:
194
192
chimeras = "{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.chimeras" ,
195
193
nonchimeras = "{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.fasta"
196
194
log : "{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.uchime.log"
195
+ conda : "envs/vsearch.yaml"
197
196
run :
198
197
cmd = ""
199
198
if wildcards .prog == "vsearch" :
200
- cmd = VSEARCH
199
+ cmd = "vsearch"
201
200
elif wildcards .prog == "usearch" :
202
- cmd = USEARCH
201
+ cmd = "usearch"
203
202
shell ("{cmd} --uchime_denovo {input} --nonchimeras {output.nonchimeras} --chimeras {output.chimeras} > {log}" )
204
203
205
204
#
206
205
# Mapping
207
206
#
208
- #TODO Check mapping accuracy!!!
207
+
209
208
# Runs uparse_scripts/fasta_number.py on the uchime/ FASTA for this
# {prog}/{ds}/{minsize}/{clmethod} combination, passing "OTU_" as the second
# argument, and redirects the script's stdout to the otus/ FASTA.
# NOTE(review): presumably the script relabels the records as OTU_<n> - confirm.
# NOTE(review): the script path is now relative, so it is resolved against the
# directory the shell command runs in - confirm uparse_scripts/ is shipped there.
rule make_otu_names :
210
209
input :
211
210
"{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.fasta"
212
211
output :
213
212
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta"
214
- shell : "python2.7 /data/tools/usearch/ uparse_scripts/fasta_number.py {input} OTU_ > {output}"
213
+ shell : "python2.7 uparse_scripts/fasta_number.py {input} OTU_ > {output}"
215
214
216
215
# Map the merged reads back onto the named OTU sequences and write the hits
# as a .uc file (global search, plus strand only, 97% identity).
#
# The program is selected by the {prog} path component of the requested
# output (vsearch or usearch).
#
# This rule uses `shell:` instead of `run:` because Snakemake only applies a
# `conda:` environment to shell/script/wrapper rules; with `run:` the
# environment is not used (and newer Snakemake versions reject the rule).
# Both supported program names equal the wildcard value, so the command is
# taken directly from {wildcards.prog}; the constraint below keeps the rule
# from matching any other value.
rule mapping:
    input:
        otus = "{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta",
        reads = "{project}/mergefiles/{ds}.fasta"
    output:
        "{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.uc"
    wildcard_constraints:
        prog = "vsearch|usearch"
    conda: "envs/vsearch.yaml"
    shell:
        "{wildcards.prog} -usearch_global {input.reads} -db {input.otus} -strand plus -id 0.97 -uc {output}"
229
229
230
230
# Runs uparse_scripts/uc2otutab.py on the .uc mapping file in otus/ and
# redirects the script's stdout to the matching .otutable.txt file.
# NOTE(review): the script path is now relative, so it is resolved against the
# directory the shell command runs in - confirm uparse_scripts/ is shipped there.
rule create_otutable :
231
231
input :
232
232
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.uc"
233
233
output :
234
234
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.otutable.txt"
235
- shell : "python2.7 /data/tools/usearch/ uparse_scripts/uc2otutab.py {input} > {output}"
235
+ shell : "python2.7 uparse_scripts/uc2otutab.py {input} > {output}"
236
236
237
237
# convert to biom file
238
238
# Converts the tab-separated OTU table into a JSON-formatted BIOM file with
# `biom convert --to-json`, declaring the table type as 'OTU table'.
# The `biom` executable is now looked up on PATH, with the environment
# described by envs/biom-format.yaml attached to the rule.
rule biom_otu :
239
239
input :
240
240
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.otutable.txt"
241
241
output :
242
242
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.biom"
243
- shell : "/data/tools/qiime/1.9/qiime1.9/bin/biom convert -i {input} --to-json -o {output} --table-type='OTU table'"
243
+ conda : "envs/biom-format.yaml"
244
+ shell : "biom convert -i {input} --to-json -o {output} --table-type='OTU table'"
244
245
245
246
#
246
247
# Taxonomy
@@ -264,7 +265,8 @@ rule sina_parallel_edgar:
264
265
priority : - 1
265
266
threads : 8
266
267
# TODO: --turn is set to "all" to get a classification. Reverse the reads at an earlier stage instead!
267
- shell : "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe /data/tools/sina/{SINA_VERSION}/sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {SILVA_ARB} --overhang remove --turn all --search --search-db {SILVA_ARB} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"
268
+ conda : "envs/sina.yaml"
269
+ shell : "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {SILVA_ARB} --overhang remove --turn all --search --search-db {SILVA_ARB} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"
268
270
269
271
rule sina_get_taxonomy_from_logfile_edgar :
270
272
input :
@@ -285,14 +287,16 @@ rule filter_alignment:
285
287
filtered = "{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina_pfiltered.fasta"
286
288
params :
287
289
outdir = "{project}/{prog}/sina/"
288
- shell : "set +u; source /data/tools/qiime/1.9/env.sh; set -u; filter_alignment.py -i {input.align} -o {params.outdir} --suppress_lane_mask_filter --entropy_threshold 0.10"
290
+ conda : "envs/qiime.yaml"
291
+ shell : "filter_alignment.py -i {input.align} -o {params.outdir} --suppress_lane_mask_filter --entropy_threshold 0.10"
289
292
290
293
# Builds a tree file (.tre) from the filtered SINA alignment by calling
# make_phylogeny.py with the fasttree method.
# The command no longer sources the site-specific QIIME and ARB env.sh files;
# the environment described by envs/qiime.yaml is attached to the rule instead.
# NOTE(review): the ARB environment was sourced before - confirm nothing from
# it is still needed by make_phylogeny.py.
rule make_tree :
291
294
input :
292
295
align = "{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina_pfiltered.fasta"
293
296
output :
294
297
tree = "{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.tre"
295
- shell : "set +u; source /data/tools/qiime/1.9/env.sh; source /data/tools/arb/6.0.1/env.sh; set -u; make_phylogeny.py -i {input.align} -t fasttree -o {output.tree}"
298
+ conda : "envs/qiime.yaml"
299
+ shell : "make_phylogeny.py -i {input.align} -t fasttree -o {output.tree}"
296
300
297
301
298
302
@@ -305,8 +309,9 @@ rule biom_tax_sina:
305
309
taxonomy = "{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.qiimeformat.taxonomy" ,
306
310
biom = protected ("{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.biom" ),
307
311
otutable = protected ("{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.otutable.txt" )
312
+ conda : "envs/biom-format.yaml"
308
313
run :
309
314
shell ("""cat {input.taxonomy} | awk -F"[;\t ]" 'BEGIN{{print "OTUs,Domain,Phylum,Class,Order,Family,Genus"}}{{print $1"\\ tk__"$2"; p__"$3"; c__"$4"; o__"$5"; f__"$6"; g__"$7"; s__"$8}}' > {output.taxonomy}""" )
310
- shell ("/data/tools/qiime/1.9/qiime1.9/bin/ biom add-metadata -i {input.biom} -o {output.biom} --output-as-json --observation-metadata-fp {output.taxonomy} --observation-header OTUID,taxonomy --sc-separated taxonomy --float-fields confidence --sample-metadata-fp {input.meta}" )
311
- shell ("/data/tools/qiime/1.9/qiime1.9/bin/ biom convert --to-tsv --header-key=taxonomy -i {output.biom} -o {output.otutable}" )
315
+ shell ("biom add-metadata -i {input.biom} -o {output.biom} --output-as-json --observation-metadata-fp {output.taxonomy} --observation-header OTUID,taxonomy --sc-separated taxonomy --float-fields confidence --sample-metadata-fp {input.meta}" )
316
+ shell ("biom convert --to-tsv --header-key=taxonomy -i {output.biom} -o {output.otutable}" )
312
317
0 commit comments