This repository has been archived by the owner on Dec 3, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
executable file
·175 lines (157 loc) · 5 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env nextflow
// todo: the directions say multiple files can be input, but that's not right
// todo: add a threads option to lorma and bbduk (their automatic setting isn't right)
/*
* Assemble KIR.
*
* Requires 'docker.enabled = true' in the Nextflow configuration (e.g., $HOME/.nextflow/config).
*
* @author Dave Roe
* @todo make the contig names unique
* @todo --resume isn't working
*/
// things that may change per run
// here are the FASTA/Q files
params.home = baseDir
home = params.home + "/"
params.raw = home + "/raw/"
raw = params.raw + "/"
params.output = home + "/output"
output = params.output + "/"
params.off = 0
fqNameSuffix = "fastq.gz" // extension on the file name (todo: expand this)
params.container = "droeatumn/kass:latest"
params.nocontainer = "null"
params.canuPB = "-pacbio-hifi"
params.threads = "8"
// things that probably won"t change per run
fqPath = raw + "/*" + fqNameSuffix
capFile = file("${home}/input/cap.fasta") // capture probes
markerCapFile = file("${home}/input/markers_wCap.fasta") // gene markers + capture probes
featuresFile = file("${home}/input/features.txt") // markup features
haps = home + "${home}/input/HapSet23_v1.txt"
alignProbesFile = file("${home}/src/alignment2ProbePairs.groovy")
annotateFile = file("${home}/src/annotateMarkup.groovy")
fqs = Channel.fromPath(fqPath).ifEmpty { error "cannot find any files matching ${fqPath}" }.map { path -> tuple(sample(path), path) }
/*
 * extract
 *
 * Extract the KIR reads from a potentially larger set of fastq reads.
 * Both the input and the output are fastq.
 * Reads that match a kmer of the marker/probe reference go to
 * <name>_kir.fastq.gz.
 * Reads that don't match any kmer go to <name>_off-kir.fastq.gz, but only
 * when params.off is not 0; otherwise they are not written at all.
 *
 * Input:  (sample name, read file) tuples from fqs, plus markerCapFile.
 * Output: (sample name, KIR reads) into kirFastqs and, optionally,
 *         (sample name, off-KIR reads) into offkirFastqs.
 */
process extract {
    // use the pipeline container unless params.nocontainer was changed from "null"
    if(params.nocontainer == "null") {
        container = params.container
    }
    // publishDir output, mode: 'copy', overwrite: true
    //doesn't work publishDir "*_off-kir.fastq.gz", mode: 'copy', overwrite: true
    //doesn't work publishDir "*_kir.fastq", mode: 'copy', overwrite: true

  input:
    tuple s, path(fa) from fqs
    path(markerCapFile)

  output:
    tuple s, file{"*_kir.fastq.gz"} into kirFastqs
    tuple s, file{"*_off-kir.fastq.gz"} into offkirFastqs optional true

  script:
    // both stay empty unless the off-KIR reads were requested
    def offFile = ""
    def offStr = ""
    if(params.off != 0) {
        offFile = s + "_off-kir.fastq"
        offStr = "out=" + offFile
    }
    """
    bbduk.sh in=${fa} ${offStr} outm=${s}_kir.fastq.gz ref=${markerCapFile} k=25 maskmiddle=f overwrite=t rename=t nzo=t rcomp=t ignorebadquality=t
    # remove empty files
    find . -type f -size 0 -print0 |xargs -0 rm -f
    # the name is quoted so that an empty value (off-KIR output disabled) makes
    # the test false; unquoted, the one-argument test is true and gzip would
    # run without a file operand
    if [ -f "${offFile}" ];
    then
        gzip "${offFile}"
    fi
    #todo gzip ${s}_off-kir.fastq
    """
} // extract
/*
 * correct
 *
 * Run error correction over the KIR fastq reads of one sample.
 *
 * Input:  (sample name, fastq) tuples from kirFastqs.
 * Output: (sample name, every fasta whose name starts with the sample name)
 *         into correctedReads.
 */
process correct {
    // the pipeline container is used as long as params.nocontainer is still "null"
    if( params.nocontainer == "null" ) {
        container = params.container
    }
    //publishDir output, mode: 'copy', overwrite: true

  input:
    tuple s, path(fq) from kirFastqs

  output:
    tuple s, path{"${s}*.fasta"} into correctedReads
    // tuple s, path{"*.fasta.gz"} into correctedReads mode flatten

  script:
    // lorma.sh leaves its result in final.fasta; it is renamed after the
    // sample and then handed to binBBFasta.groovy, which writes into '.'
"""
lorma.sh ${fq}
mv final.fasta ${s}-corrected.fasta
binBBFasta.groovy -i ${s}-corrected.fasta -o .
# gzip ${s}-corrected*.fasta
"""
} // correct
/*
 * assemble
 *
 * Assemble every corrected-read fasta of a sample with canu, then merge the
 * resulting contigs, pass them through orient.groovy together with the
 * capture probes, reformat them, and gzip the combined file.
 *
 * Input:  (sample name, corrected fasta file(s)) tuples from correctedReads.
 * Output: (sample name, *.contigs.fasta.gz) into assembly; the file is also
 *         copied to the output directory by publishDir.
 *
 * @todo change genome size for genes
 * @todo rename the contig names to eliminate duplications
 */
process assemble {
// use the pipeline container unless params.nocontainer was changed from "null"
if(params.nocontainer == "null") {
container = params.container
}
// copy the declared output into the output directory, replacing older copies
publishDir output, mode: 'copy', overwrite: true //todo
// a failed task is ignored instead of stopping the whole run
errorStrategy 'ignore'
input:
tuple s, path(cr) from correctedReads
output:
tuple s, path{"*.contigs.fasta.gz"} into assembly
script:
//def s2 = cr.name.replaceFirst("-corrected", "").replaceFirst(".fasta.gz", "")
// Outline of the shell script below. A '\$' is escaped so the variable is
// expanded by bash; an unescaped '${...}' is filled in by Nextflow.
//  1. Loop over every file matching *-corrected*.fasta in the task directory.
//  2. Derive an id from the file name by dropping "-corrected" and ".fasta";
//     the id of the first file is remembered in firstID.
//  3. Run canu with prefix and directory set to the id, genomeSize=200k and
//     the params.canuPB option, then copy <id>/<id>.contigs.fasta into the
//     task directory. Both commands end in '|| true', so a failure of either
//     one does not end the script at that point.
//  4. deep.pl replaces '>tig' with '><id>_tig' in the copied contigs file
//     (presumably to prefix the contig names with the id; confirm against deep.pl).
//  5. After the loop all *.contigs.fasta files are concatenated, run through
//     orient.groovy with capFile, and reformat.sh writes the result to
//     <firstID>.contigs.fasta, overwriting the first per-file copy.
//  6. Only <firstID>.contigs.fasta is gzipped, so it is the single file that
//     matches the declared *.contigs.fasta.gz output.
"""
id=""
firstID=""
FILES="*-corrected*.fasta"
for bFile in \$FILES; do
echo \$bFile
id=\$(basename \$bFile)
# '%' Means start to remove after the next character;
id=\${id/-corrected/}
id=\${id%.fasta}
echo \$id
if [ "\$firstID" == "" ]; then
firstID=\$id
fi
canu -p \$id -d \$id genomeSize=200k ${params.canuPB} \$bFile || true
cp \$id/\$id.contigs.fasta . || true
deep.pl replace '>tig' ">\${id}_tig" "\${id}.contigs.fasta"
done
cat *.contigs.fasta > tmp.fasta
orient.groovy -i tmp.fasta -p ${capFile} -o tmp2.fasta
reformat.sh in=tmp2.fasta out=\$firstID.contigs.fasta fastawrap=1000000 overwrite=true
gzip \$firstID.contigs.fasta
"""
} // assemble
/*
 * Get the per-sample name of a read file: its file name without the
 * fqNameSuffix extension and without the '.' that precedes the extension.
 *
 * @param path the read file
 * @return the sample name, e.g. "sample1" for "sample1.fastq.gz"
 * @throws Exception if the file name does not end in fqNameSuffix or if
 *                   nothing is left once the extension is removed
 */
def sample(Path path) {
    // getFileName() is the last path element only, so there is no directory
    // part left to strip
    def name = path.getFileName().toString()
    if ( !name.endsWith(fqNameSuffix) ) {
        throw new Exception( "Expected file " + name + " to end in '" + fqNameSuffix + "'" );
    }
    def stem = name.substring(0, name.length() - fqNameSuffix.length())
    // Drop the separator only when it really is a '.'; removing the last
    // character unconditionally would map e.g. "a1fastq.gz" and "a2fastq.gz"
    // to the same sample name.
    if ( stem.endsWith('.') ) {
        stem = stem.substring(0, stem.length() - 1)
    }
    if ( stem.isEmpty() ) {
        throw new Exception( "Cannot derive a sample name from file " + name );
    }
    return stem
} // sample
// Report the overall outcome once the run has finished.
workflow.onComplete {
    def status = workflow.success ? 'OK' : 'FAILED'
    println "DONE: ${status}"
}

// Print the error report when the run stops because of an error.
workflow.onError {
    println "ERROR: " + workflow.errorReport.toString()
}