Skip to content
This repository was archived by the owner on Dec 18, 2023. It is now read-only.

Commit 125b6fa

Browse files
committed
Merge branch 'conda' into 'master'
Get all the software dependencies from conda. Works, except for sina: bioconda/bioconda-recipes#4099. See merge request !1
2 parents 86cc300 + 1b1d3f1 commit 125b6fa

26 files changed

+1366
-35
lines changed

Snakefile

Lines changed: 40 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ PROJECT = config["project"] + "/"
66

77
rule final:
88
input: expand("{project}/fastqc_raw/{data}_R1_fastqc.zip \
9-
{project}/fastqc_pandaseq/{data}_fastqc.zip \
109
{project}/{prog}/clst/{ds}.minsize{minsize}.usearch_smallmem.fasta \
1110
{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.taxonomy \
1211
{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.biom \
@@ -37,6 +36,7 @@ rule fastqc:
3736
adapters = config["adapters_fasta"]
3837
log: "fastqc_raw.log"
3938
threads: 2
39+
conda: "envs/fastqc.yaml"
4040
run:
4141
shell("fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.forward} > {params.dir}/{log}")
4242
shell("fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.reverse} > {params.dir}/{log}")
@@ -46,7 +46,7 @@ rule pandaseq:
4646
forward="{project}/gunzip/{data}_R1.fastq",
4747
reverse="{project}/gunzip/{data}_R2.fastq"
4848
output:
49-
fastq = "{project}/pandaseq/{data}.fastq"
49+
fasta = "{project}/pandaseq/{data}.fasta"
5050
params:
5151
overlap = config['pandaseq_overlap'],
5252
quality = config['pandaseq_quality'],
@@ -56,8 +56,8 @@ rule pandaseq:
5656
reverse_primer = config['reverse_primer']
5757
log: "{project}/pandaseq/{data}_pandaseq.stdout"
5858
threads: 1
59-
#shell: "source /data/tools/RDP_Assembler/1.0.3/env.sh; pandaseq -N -o {params.overlap} -e {params.quality} -F -d rbfkms -l {params.minlength} -L {params.maxlength} -T {threads} -f {input.forward} -r {input.reverse} 1> {output.fastq} 2> {log}"
60-
shell: "/data/tools/pandaseq/2.9/bin/pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -A rdp_mle -T {threads} -w {output.fastq} -g {log}"
59+
conda: "envs/pandaseq.yaml"
60+
shell: "pandaseq -N -f {input.forward} -r {input.reverse} -p {params.forward_primer} -q {params.reverse_primer} -A rdp_mle -T {threads} -w {output.fasta} -g {log}"
6161

6262
rule fastqc_pandaseq:
6363
input:
@@ -69,9 +69,11 @@ rule fastqc_pandaseq:
6969
adapters = config["adapters_fasta"]
7070
log: "fastqc.log"
7171
threads: 8
72+
conda: "envs/fastqc.yaml"
7273
shell: "fastqc -q -t {threads} --contaminants {params.adapters} --outdir {params.dir} {input.fastq} > {params.dir}/{log}"
7374

7475

76+
#Obsolete
7577
rule primer_matching:
7678
input:
7779
"{project}/pandaseq/{data}.fastq"
@@ -86,7 +88,7 @@ rule primer_matching:
8688
shell("cat {params.prefix_forward}* | fastx_reverse_complement | /data/tools/flexbar/2.5/flexbar -t {params.prefix_reverse} -b primers.fasta --reads - --barcode-trim-end LEFT --barcode-min-overlap 10 --barcode-threshold 3 --min-read-length 50 --barcode-unassigned >> {log}")
8789
shell("cat {params.prefix_reverse}* > {output}")
8890

89-
91+
#Obsolete
9092
rule fastq2fasta:
9193
input:
9294
fastq = "{project}/pandaseq/{data}.fastq"
@@ -104,28 +106,20 @@ rule mergefiles:
104106
samples=config["data"]
105107
shell: """cat {input} > {output}"""
106108

107-
########
108-
# VSEARCH PIPELINE
109-
########
110-
USEARCH = "/data/tools/usearch/7.0.1090/usearch"
111-
VSEARCH = "/data/tools/vsearch/1.0.10/vsearch-1.0.10-linux-x86_64"
112-
# VSEARCH commands are compatible with USEARCH commands
113-
# depending on the output file that is requested the required program is chosen:
114-
# Example: snakemake -j 8 vsearch/B2R.results.txt -p
115-
116109
# Dereplication
117110
rule derep:
118111
input:
119112
"{project}/mergefiles/{ds}.fasta",
120113
output:
121114
temp("{project}/{prog}/{ds}.derep.fasta")
122115
threads: 8
116+
conda: "envs/vsearch.yaml"
123117
run:
124118
cmd = ""
125119
if wildcards.prog == "vsearch":
126-
cmd = VSEARCH
120+
cmd = "vsearch"
127121
elif wildcards.prog == "usearch":
128-
cmd = USEARCH
122+
cmd = "usearch"
129123
shell("{cmd} -derep_fulllength {input} -output {output} -sizeout -threads {threads}")
130124

131125
# Abundance sort and discard singletons
@@ -137,13 +131,14 @@ rule sortbysize:
137131
params:
138132
minsize="{minsize}"
139133
threads: 8
134+
conda: "envs/vsearch.yaml"
140135
run:
141136
cmd = ""
142137
if wildcards.prog == "vsearch":
143-
cmd = VSEARCH
138+
cmd = "vsearch"
144139
shell("{cmd} -sortbysize {input} -fasta_width 0 -output {output} -threads {threads} -minsize {params.minsize}")
145140
elif wildcards.prog == "usearch":
146-
cmd = USEARCH
141+
cmd = "usearch"
147142
shell("{cmd} -sortbysize {input} -output {output} -minsize {params.minsize}")
148143

149144
# Uclust clustering
@@ -153,12 +148,13 @@ rule smallmem:
153148
output:
154149
otus=protected("{project}/{prog}/clst/{ds}.minsize{minsize}.usearch_smallmem.fasta")
155150
threads: 8
151+
conda: "envs/vsearch.yaml"
156152
run:
157153
cmd = ""
158154
if wildcards.prog == "vsearch":
159-
cmd = VSEARCH
155+
cmd = "vsearch"
160156
elif wildcards.prog == "usearch":
161-
cmd = USEARCH
157+
cmd = "usearch"
162158
shell("{cmd} --cluster_smallmem {input} --usersort -centroids {output.otus} --id 0.97 -sizeout")
163159

164160
rule cluster_fast:
@@ -167,14 +163,16 @@ rule cluster_fast:
167163
output:
168164
otus=protected("{project}/{prog}/{ds}.minsize{minsize}.usearch_cluster_fast.fasta")
169165
threads: 8
166+
conda: "envs/vsearch.yaml"
170167
run:
171168
cmd = ""
172169
if wildcards.prog == "vsearch":
173-
cmd = VSEARCH
170+
cmd = "vsearch"
174171
elif wildcards.prog == "usearch":
175-
cmd = USEARCH
172+
cmd = "usearch"
176173
shell("{cmd} --cluster_fast {input} --usersort -centroids {output.otus} --id 0.97 -sizeout")
177174

175+
# No longer supported since it is not in bioconda
178176
rule uparse:
179177
input:
180178
"{project}/{prog}/{ds}.sorted.minsize{minsize}.fasta"
@@ -194,53 +192,56 @@ rule uchime:
194192
chimeras="{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.chimeras",
195193
nonchimeras="{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.fasta"
196194
log: "{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.uchime.log"
195+
conda: "envs/vsearch.yaml"
197196
run:
198197
cmd = ""
199198
if wildcards.prog == "vsearch":
200-
cmd = VSEARCH
199+
cmd = "vsearch"
201200
elif wildcards.prog == "usearch":
202-
cmd = USEARCH
201+
cmd = "usearch"
203202
shell("{cmd} --uchime_denovo {input} --nonchimeras {output.nonchimeras} --chimeras {output.chimeras} > {log}")
204203

205204
#
206205
# Mapping
207206
#
208-
#TODO Check mapping accuracy!!!
207+
209208
rule make_otu_names:
210209
input:
211210
"{project}/{prog}/uchime/{ds}.minsize{minsize}.{clmethod}.fasta"
212211
output:
213212
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta"
214-
shell: "python2.7 /data/tools/usearch/uparse_scripts/fasta_number.py {input} OTU_ > {output}"
213+
shell: "python2.7 uparse_scripts/fasta_number.py {input} OTU_ > {output}"
215214

216215
rule mapping:
217216
input:
218217
otus="{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta",
219218
reads="{project}/mergefiles/{ds}.fasta"
220219
output:
221220
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.uc"
221+
conda: "envs/vsearch.yaml"
222222
run:
223223
cmd = ""
224224
if wildcards.prog == "vsearch":
225-
cmd = VSEARCH
225+
cmd = "vsearch"
226226
elif wildcards.prog == "usearch":
227-
cmd = USEARCH
227+
cmd = "usearch"
228228
shell("{cmd} -usearch_global {input.reads} -db {input.otus} -strand plus -id 0.97 -uc {output}")
229229

230230
rule create_otutable:
231231
input:
232232
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.uc"
233233
output:
234234
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.otutable.txt"
235-
shell: "python2.7 /data/tools/usearch/uparse_scripts/uc2otutab.py {input} > {output}"
235+
shell: "python2.7 uparse_scripts/uc2otutab.py {input} > {output}"
236236

237237
# convert to biom file
238238
rule biom_otu:
239239
input:
240240
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.otutable.txt"
241241
output:
242242
"{project}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.biom"
243-
shell: "/data/tools/qiime/1.9/qiime1.9/bin/biom convert -i {input} --to-json -o {output} --table-type='OTU table'"
243+
conda: "envs/biom-format.yaml"
244+
shell: "biom convert -i {input} --to-json -o {output} --table-type='OTU table'"
244245

245246
#
246247
# Taxonomy
@@ -264,7 +265,8 @@ rule sina_parallel_edgar:
264265
priority: -1
265266
threads: 8
266267
# TODO: turn is set to all to get classification. Reverse the reads in earlier stage!
267-
shell: "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe /data/tools/sina/{SINA_VERSION}/sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {SILVA_ARB} --overhang remove --turn all --search --search-db {SILVA_ARB} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"
268+
conda: "envs/sina.yaml"
269+
shell: "cat {input} | parallel --block 1000K -j{threads} --recstart '>' --pipe sina --log-file {log} -i /dev/stdin -o {output.align} --outtype fasta --meta-fmt csv --ptdb {SILVA_ARB} --overhang remove --turn all --search --search-db {SILVA_ARB} --search-min-sim 0.95 --search-no-fast --search-kmer-len 10 --lca-fields tax_slv"
268270

269271
rule sina_get_taxonomy_from_logfile_edgar:
270272
input:
@@ -285,14 +287,16 @@ rule filter_alignment:
285287
filtered="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina_pfiltered.fasta"
286288
params:
287289
outdir="{project}/{prog}/sina/"
288-
shell: "set +u; source /data/tools/qiime/1.9/env.sh; set -u; filter_alignment.py -i {input.align} -o {params.outdir} --suppress_lane_mask_filter --entropy_threshold 0.10"
290+
conda: "envs/qiime.yaml"
291+
shell: "filter_alignment.py -i {input.align} -o {params.outdir} --suppress_lane_mask_filter --entropy_threshold 0.10"
289292

290293
rule make_tree:
291294
input:
292295
align="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina_pfiltered.fasta"
293296
output:
294297
tree="{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.tre"
295-
shell: "set +u; source /data/tools/qiime/1.9/env.sh; source /data/tools/arb/6.0.1/env.sh; set -u; make_phylogeny.py -i {input.align} -t fasttree -o {output.tree}"
298+
conda: "envs/qiime.yaml"
299+
shell: "make_phylogeny.py -i {input.align} -t fasttree -o {output.tree}"
296300

297301

298302

@@ -305,8 +309,9 @@ rule biom_tax_sina:
305309
taxonomy="{project}/{prog}/sina/{ds}.minsize{minsize}.{clmethod}.sina.qiimeformat.taxonomy",
306310
biom=protected("{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.biom"),
307311
otutable=protected("{project}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.sina.otutable.txt")
312+
conda: "envs/biom-format.yaml"
308313
run:
309314
shell("""cat {input.taxonomy} | awk -F"[;\t]" 'BEGIN{{print "OTUs,Domain,Phylum,Class,Order,Family,Genus"}}{{print $1"\\tk__"$2"; p__"$3"; c__"$4"; o__"$5"; f__"$6"; g__"$7"; s__"$8}}' > {output.taxonomy}""")
310-
shell("/data/tools/qiime/1.9/qiime1.9/bin/biom add-metadata -i {input.biom} -o {output.biom} --output-as-json --observation-metadata-fp {output.taxonomy} --observation-header OTUID,taxonomy --sc-separated taxonomy --float-fields confidence --sample-metadata-fp {input.meta}")
311-
shell("/data/tools/qiime/1.9/qiime1.9/bin/biom convert --to-tsv --header-key=taxonomy -i {output.biom} -o {output.otutable}")
315+
shell("biom add-metadata -i {input.biom} -o {output.biom} --output-as-json --observation-metadata-fp {output.taxonomy} --observation-header OTUID,taxonomy --sc-separated taxonomy --float-fields confidence --sample-metadata-fp {input.meta}")
316+
shell("biom convert --to-tsv --header-key=taxonomy -i {output.biom} -o {output.otutable}")
312317

envs/biom-format.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
- biom-format ==2.1.5
5+

envs/fastqc.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
- fastqc ==0.11.5
5+

envs/pandaseq.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
- pandaseq ==2.11
5+

envs/qiime.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
- python ==2.7.12
5+
- qiime ==1.9.1
6+
- pyqt ==4.11.4
7+
- xorg-libsm

envs/sina.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
# - sina ==1.3.0
5+
- parallel ==20160622

envs/vsearch.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- bioconda
3+
dependencies:
4+
- vsearch ==2.4.0
5+
9.07 KB
Binary file not shown.

uparse_scripts/die.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import sys
2+
import traceback
3+
4+
def Die(Msg):
    """Report a fatal error on stderr and terminate with exit status 1.

    Writes, in order: two blank lines, the current stack trace, the command
    line (``sys.argv`` joined by single spaces), an ``**ERROR**`` line
    carrying *Msg*, and two more blank lines.

    Args:
        Msg: Description of the failure; rendered with ``%s``.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # sys.stderr.write is used instead of the Python-2-only
    # ``print >> sys.stderr`` statement so the module loads under both
    # Python 2.7 and Python 3 while emitting the same text.
    sys.stderr.write("\n\n")
    traceback.print_stack()
    sys.stderr.write(" ".join(sys.argv) + "\n")
    # Wrap Msg in a 1-tuple so a tuple-valued Msg is not unpacked by ``%``.
    sys.stderr.write("**ERROR** %s\n" % (Msg,))
    sys.stderr.write("\n\n")
    sys.exit(1)
20+
21+
def Warning(Msg):
    """Print a non-fatal warning to stderr and return normally.

    Writes a blank line, the ``sys.argv`` list as rendered by ``str()``,
    and a ``**WARNING**`` line carrying *Msg*.

    Args:
        Msg: Description of the condition; rendered with ``%s``.
    """
    # sys.stderr.write replaces the Python-2-only ``print >> sys.stderr``
    # statement; the emitted text is the same and works on Python 2.7 and 3.
    sys.stderr.write("\n")
    sys.stderr.write("%s\n" % (sys.argv,))
    # Wrap Msg in a 1-tuple so a tuple-valued Msg is not unpacked by ``%``.
    sys.stderr.write("**WARNING** %s\n" % (Msg,))

uparse_scripts/die.pyc

890 Bytes
Binary file not shown.

uparse_scripts/faqual2fastq.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import sys
2+
import fasta
3+
import fastq
4+
import die
5+
6+
# Merge a FASTA file and its matching QUAL file into FASTQ on stdout.
#
# Usage: faqual2fastq.py <fasta_file> <qual_file>
#
# Both inputs are read as two-line records (one label line followed by one
# sequence / quality line), so records wrapped over several lines are not
# handled. Any inconsistency between the two files ends the run with exit
# status 1 and a message on stderr.
FastaFileName = sys.argv[1]
QualFileName = sys.argv[2]

# The with-statement closes both inputs on every exit path, including the
# sys.exit() and die.Die() error paths below.
with open(FastaFileName) as ff, open(QualFileName) as fq:
    while True:
        Linef = ff.readline()
        if not Linef:
            # End of the FASTA file: every record has been converted.
            break
        Labelf = Linef.strip()
        Seqf = ff.readline().strip()
        L = len(Seqf)
        # Explicit checks instead of ``assert`` so the validation is not
        # stripped when Python runs with -O.
        if L == 0:
            die.Die("Empty sequence for %s" % Labelf)

        Labelq = fq.readline().strip()
        Seqq = fq.readline().strip()
        if len(Seqq) == 0:
            die.Die("Empty quality line for %s" % Labelf)

        # Only the first whitespace-delimited token of each label line has
        # to agree between the two files.
        Labf = Labelf.split()[0]
        Labq = Labelq.split()[0]

        if Labf != Labq:
            sys.stderr.write("\n")
            sys.stderr.write("LABEL MISMATCH\n")
            sys.stderr.write("Labelf: %s\n" % Labelf)
            sys.stderr.write("Labelq: %s\n" % Labelq)
            sys.exit(1)

        # The QUAL line holds one whitespace-separated integer per base.
        Quals = Seqq.split()
        LQ = len(Quals)
        if LQ != L:
            die.Die("LS %u, LQ %u >%s" % (L, LQ, Labelf))

        # Convert each integer score to its character via the project's
        # fastq module and assemble the quality string in one pass.
        q = "".join(fastq.IntQualToChar(int(Qual)) for Qual in Quals)

        # Internal invariant: one quality character per base.
        assert len(q) == L
        fastq.WriteRec(sys.stdout, Labelf, Seqf, q)

0 commit comments

Comments
 (0)