
Commit 32cf05e

initial commit #8

1 parent ed71119 commit 32cf05e

4 files changed: +326, -18 lines

CoVpipe2.nf

Lines changed: 30 additions & 17 deletions
@@ -6,7 +6,7 @@ nextflow.enable.dsl=2
 if (params.help) { exit 0, helpMSG() }
 
 // parameter sanity check
-Set valid_params = ['cores', 'max_cores', 'memory', 'help', 'profile', 'workdir', 'fastq', 'list', 'mode', 'run_id', 'reference', 'ref_genome', 'ref_annotation', 'adapter', 'fastp_additional_parameters', 'kraken', 'taxid', 'primer_bed', 'primer_bedpe', 'primer_version', 'vcount', 'frac', 'cov', 'vois', 'var_mqm', 'var_sap', 'var_qual', 'cns_min_cov', 'cns_gt_adjust', 'update', 'pangolin_docker_default', 'nextclade_docker_default', 'output', 'reference_dir', 'read_dir', 'mapping_dir', 'variant_calling_dir', 'consensus_dir', 'linage_dir', 'report_dir', 'rki_dir', 'runinfo_dir', 'singularity_cache_dir', 'conda_cache_dir', 'databases', 'publish_dir_mode', 'cloudProcess', 'cloud-process']
+Set valid_params = ['cores', 'max_cores', 'memory', 'help', 'profile', 'workdir', 'fastq', 'list', 'dir', 'mode', 'run_id', 'reference', 'ref_genome', 'ref_annotation', 'adapter', 'fastp_additional_parameters', 'kraken', 'taxid', 'primer_bed', 'primer_bedpe', 'primer_version', 'vcount', 'frac', 'cov', 'vois', 'var_mqm', 'var_sap', 'var_qual', 'cns_min_cov', 'cns_gt_adjust', 'update', 'pangolin_docker_default', 'nextclade_docker_default', 'output', 'reference_dir', 'read_dir', 'mapping_dir', 'variant_calling_dir', 'consensus_dir', 'linage_dir', 'report_dir', 'rki_dir', 'runinfo_dir', 'singularity_cache_dir', 'conda_cache_dir', 'databases', 'publish_dir_mode', 'cloudProcess', 'cloud-process']
 def parameter_diff = params.keySet() - valid_params
 if (parameter_diff.size() != 0){
     exit 1, "ERROR: Parameter(s) $parameter_diff is/are not valid in the pipeline!\n"

@@ -80,21 +80,23 @@ if ( params.reference ) {
 }
 
 // illumina reads input & --list support
-if (params.mode == 'paired') {
-    if (params.fastq && params.list) { fastqInputChannel = Channel
-        .fromPath( params.fastq, checkIfExists: true )
-        .splitCsv(header: true, sep: ',')
-        .map { row -> [row.sample, [file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true)]] }
-    } else if (params.fastq) { fastqInputChannel = Channel
-        .fromFilePairs( params.fastq, checkIfExists: true )}
-} else {
-    if (params.fastq && params.list) { fastqInputChannel = Channel
-        .fromPath( params.fastq, checkIfExists: true )
-        .splitCsv(header: true, sep: ',')
-        .map { row -> [row.sample, [file(row.fastq, checkIfExists: true)]] }}
-    else if (params.fastq) { fastqInputChannel = Channel
-        .fromPath( params.fastq, checkIfExists: true )
-        .map { file -> [file.simpleName, [file]]}}
+if (! params.dir) {
+    if (params.mode == 'paired') {
+        if (params.fastq && params.list) { fastqInputChannel = Channel
+            .fromPath( params.fastq, checkIfExists: true )
+            .splitCsv(header: true, sep: ',')
+            .map { row -> [row.sample, [file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true)]] }
+        } else if (params.fastq) { fastqInputChannel = Channel
+            .fromFilePairs( params.fastq, checkIfExists: true )}
+    } else {
+        if (params.fastq && params.list) { fastqInputChannel = Channel
+            .fromPath( params.fastq, checkIfExists: true )
+            .splitCsv(header: true, sep: ',')
+            .map { row -> [row.sample, [file(row.fastq, checkIfExists: true)]] }}
+        else if (params.fastq) { fastqInputChannel = Channel
+            .fromPath( params.fastq, checkIfExists: true )
+            .map { file -> [file.simpleName, [file]]}}
+    }
 }
 
 // load adapters [optional]

@@ -220,12 +222,23 @@ include { genome_quality } from './workflows/genome_quality_wf'
 include { summary_report } from './workflows/report_wf'
 include { rki_report_wf } from './workflows/rki_wf'
 
-include { bed2bedpe } from './modules/utils'
+include { bed2bedpe; make_sample_sheet } from './modules/utils'
 
 /**************************
 * MAIN WORKFLOW
 **************************/
 workflow {
+    if (params.dir) {
+        make_sample_sheet(params.fastq)
+
+        if (params.mode == 'paired') { fastqInputChannel = make_sample_sheet.out
+            .splitCsv(header: true, sep: ',')
+            .map { row -> [row.sample, [file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true)]] }
+        } else { fastqInputChannel = make_sample_sheet.out
+            .splitCsv(header: true, sep: ',')
+            .map { row -> [row.sample, [file(row.fastq, checkIfExists: true)]] }
+        }
+    }
 
     // 1: reference preprocessing
     reference_preprocessing(ref_genome_file)
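
With this change, passing the new `--dir` flag makes the workflow treat `--fastq` as a directory of FASTQ files: the `make_sample_sheet` process builds a CSV sample sheet from it, which is then split into the per-sample input channel just like a user-supplied `--list` CSV. The sheet uses the `sample,fastq_1,fastq_2` layout emitted by the script; a minimal sketch of its content (sample ids and paths below are hypothetical):

    sample,fastq_1,fastq_2
    sampleA_S1_L001_001,/data/reads/sampleA_S1_L001_R1_001.fastq.gz,/data/reads/sampleA_S1_L001_R2_001.fastq.gz
    sampleB_S2_L001_001,/data/reads/sampleB_S2_L001_R1_001.fastq.gz,/data/reads/sampleB_S2_L001_R2_001.fastq.gz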

bin/make_sample_sheet.py

Lines changed: 277 additions & 0 deletions
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import os
+import re
+import sys
+
+DEFAULT_FILENAME_PATTERNS = [
+    ("illumina_some_institute_1", {
+        "regex": r"(?P<date>\d+)_(?P<lab_id>\d{2}-\d{4,5}(-[^_]+)?)_(?P<sample_id>.+)"
+                 r"_(?P<snum>S\d+)_(?P<lane>L\d{3})_(?P<read>R[12])"
+                 r"_(?P<running>\d{3})",
+        "ambig": ["lane", "running"]
+    }),
+    ("illumina1", {
+        "regex": r"(?P<sample_id>.+)_(?P<snum>S[\d]+)_(?P<lane>L[\d]{3})_"
+                 r"(?P<read>R[12])_(?P<running>\d{3})",
+        "ambig": ["lane", "running"]}),
+    ("illumina2", {
+        "regex": r"(?P<sample_id>.+)_(?P<snum>S[\d]+)_"
+                 r"(?P<read>R[12])_(?P<running>\d{3})",
+        "ambig": ["running"]}),
+    ("illumina3", {
+        "regex": r"(?P<sample_id>.+)_(?P<snum>S[0-9]+)_(?P<lane>L[0-9]{3})_"
+                 r"(?P<read>R[12])",
+        "ambig": ["lane"]}),
+    ("illumina4", {
+        "regex": r"(?P<sample_id>.+)_S(?P<snum>[0-9]+)_"
+                 r"(?P<read>R[12])",
+        "ambig": []}),
+    ("illumina_fallback", {
+        "regex": r"(?P<sample_id>.+)_"
+                 r"(?P<read>R[12])(?P<residual>_.+)?",
+        "ambig": []}),
+    ("SRA", {
+        "regex": r"(?P<sample_id>SR.+)_(?P<read>[12])",
+        "ambig": []}),
+
+    ("fallback1", {
+        "regex": r"(?P<sample_id>.+)_(?P<read>[A-Za-z0-9]+)",
+        "ambig": []}),
+    ("fallback2", {
+        "regex": r"(?P<sample_id>.+)\.(?P<read>[A-Za-z0-9]+)",
+        "ambig": [],
+        "sep": "."})
+]
+
+PAIRED_READ_REF = {
+    "1": 1,
+    "2": 2,
+    "R1": 1,
+    "R2": 2,
+    "FORWARD": 1,
+    "REVERSE": 2,
+    "FWD": 1,
+    "REV": 2,
+    "F": 1,
+    "R": 2,
+    "P": 1,
+    "M": 2,
+    "PLUS": 1,
+    "MINUS": 2,
+    "SENSE": 1,
+    "ANTI": 2
+}
+
+
+def check_and_add_fastq(_files, res, pdir=None, sample_names=None, alt_ids=None):
+    single_mode = len(_files) == 1 and sample_names is not None
+    patterns = dict(DEFAULT_FILENAME_PATTERNS)
+    patterns_in_order = [key for key, _ in DEFAULT_FILENAME_PATTERNS]
+    end = r'\.(fq|fnq|fastq)(\.gz)?'
+    for _file in (fl for fl in _files if re.match(f".*{end}", fl)):
+        if pdir is None:
+            path = os.path.dirname(os.path.realpath(_file))
+        else:
+            path = os.path.realpath(pdir)
+        _file = os.path.basename(_file)
+        for pnum, pattern in enumerate(patterns_in_order):
+            regex_string = ('{patternregex}{end}'.format(
+                patternregex=patterns[pattern]["regex"],
+                end=end))
+            match = re.match(regex_string, _file)
+            if match is None:
+                if pnum + 1 == len(patterns):  # no pattern left to try
+                    print(f"FastQ does not meet any known spec: file: {_file}")
+                continue
+            sep = default_if_not("sep", patterns[pattern], "_")
+            ambig_keys = default_if_not("ambig", patterns[pattern], [])
+            match_dict = match.groupdict()
+            ambig = sep.join(match_dict[a]
+                             for a in sorted(ambig_keys))
+            nonambig = sep.join(match_dict[na]
+                                for na in sorted(match_dict.keys())
+                                if na not in ambig_keys + ["read"]
+                                and match_dict[na] is not None)
+            sub_sample_id = sep.join(
+                [val if val is not None else ""
+                 for (_id, val) in list(match.groupdict().items())
+                 if _id not in ["read"]])
+            if pattern.startswith("illumina_some_institute"):
+                matchdict = match.groupdict()
+                sub_sample_id = "{sid}_{lid}".format(
+                    sid=matchdict["sample_id"],
+                    lid=matchdict["lab_id"])
+
+            read = "R1"
+            read_id = 1
+            try:
+                read = match_dict["read"]
+            except KeyError:
+                pass
+            try:
+                read_id = PAIRED_READ_REF[read.upper()]
+            except KeyError:
+                print(f'Warning: Read name "{read}" not known')
+                print(f"         Using Regex-pattern: {pattern} {regex_string} File: {_file}")
+
+            key = (path, nonambig, ambig, read)
+            if key in res["sample_data"].index:
+                break
+            new_entry = pd.DataFrame(dict(zip(res["sample_data"].columns,
+                                              ([a]
+                                               for a
+                                               in [path, nonambig, ambig, read, read_id,
+                                                   "PLACE_HOLDER",
+                                                   sample_names[0] if single_mode
+                                                   and sample_names is not None
+                                                   else sub_sample_id,
+                                                   "PLACE_HOLDER",
+                                                   alt_ids[0] if single_mode
+                                                   and alt_ids is not None
+                                                   else "PLACE_HOLDER",
+                                                   _file, pattern,
+                                                   '{regexpattern}{end}'.format(
+                                                       regexpattern=patterns[pattern]["regex"], end=end),
+                                                   "new"]))))
+            new_entry.set_index(["path", "unambig_id", "ambig_id", "read"], inplace=True, drop=False)
+            # DataFrame.append() was removed in pandas 2.0; concat is the portable equivalent
+            res["sample_data"] = pd.concat([res["sample_data"], new_entry])
+            # print(read, nonambig, ambig, pattern, _file, path, sep='\t')
+            break
+    # eprint(res["sample_data"][["sub_sample_id", "alt_sub_sample_id"]])
+
+
+def default_if_not(key, _dict, default):
+    try:
+        return _dict[key]
+    except KeyError:
+        return default
+
+
+def resolve_sample_id_conflicts(sample_data):
+    generate_alternative_ids(sample_data)
+    prelim_groups = sample_data.groupby(["sub_sample_id"]).groups
+    for sub_id in prelim_groups:
+        paths_subgroup = sample_data[
+            sample_data.sub_sample_id == sub_id].groupby(level="path").groups
+        num_subids = len(paths_subgroup)
+        if num_subids == 1:
+            continue
+        print(f"Warning: Found multiple samples ({num_subids}) with sample id {sub_id}! Duplicate_[Number]_ will be prepended to duplicates")
+        for (_id, path) in enumerate(paths_subgroup):
+            if _id == 0:  # skip first occurrence
+                continue
+            sample_data.loc[((sample_data.sub_sample_id == sub_id) &
+                             (sample_data.path == path)),
+                            "sub_sample_id"] = "Duplicate_{_id}_{samp}".format(
+                                _id=_id, samp=sub_id)
+
+
+def generate_alternative_ids(sample_data):
+    if not ((sample_data["alt_sub_sample_id"] == "PLACE_HOLDER").any()):
+        # all samples known and will just be passed on
+        return
+    old_samples = sample_data.loc[
+        sample_data.alt_sub_sample_id != "PLACE_HOLDER"].copy()
+    old_sample_groups = old_samples.groupby(
+        level=["path", "unambig_id"]).groups
+    num_old_samples = len(old_sample_groups)
+    new_samples = sample_data.loc[
+        sample_data.alt_sub_sample_id == "PLACE_HOLDER"].copy()
+    new_sample_groups = new_samples.groupby(
+        level=["path", "unambig_id"]).groups
+    for key in (key
+                for key in new_sample_groups if key in old_sample_groups):
+        new_sample_data = sample_data.loc[new_sample_groups[key]].copy()
+        old_sample_data = sample_data.loc[old_sample_groups[key]].copy()
+        old_sample_sub_groups = old_sample_data.groupby(
+            level=["ambig_id"]).groups
+        new_sample_sub_groups = new_sample_data.groupby(
+            level=["ambig_id"]).groups
+        for sub_key in (s_key
+                        for s_key in new_sample_sub_groups
+                        if s_key in old_sample_sub_groups):
+            old_alt_id = old_sample_data.loc[
+                old_sample_sub_groups[sub_key]].alt_sub_sample_id.iloc[0]
+            sample_data.loc[
+                new_sample_sub_groups[sub_key],
+                "alt_sub_sample_id"] = old_alt_id
+
+        for sub_key in (skey
+                        for skey in new_sample_sub_groups
+                        if skey not in old_sample_sub_groups):
+            raise RuntimeError(
+                "Use of given sample names and conflict resolution through"
+                " longest common prefix is not implemented yet!")
+
+    new_samples = sample_data.loc[
+        sample_data.alt_sub_sample_id == "PLACE_HOLDER"].copy()
+    new_sample_groups = new_samples.groupby(
+        level=["path", "unambig_id"]).groups
+    for (_id, (path, unambig)) in enumerate(new_sample_groups):
+        main_name = "Sample_{nid}".format(nid=_id + 1 + num_old_samples)
+        sample_data.loc[
+            new_sample_groups[(path, unambig)],
+            "alt_sample_id"] = main_name
+        sub_groups = sample_data[
+            sample_data.alt_sample_id == main_name].groupby(
+                level=["ambig_id"]).groups
+        if len(sub_groups) == 1:
+            # easy case: sample not split between lanes or across
+            # multiple files with differing running number
+            sample_data.loc[sample_data.alt_sample_id == main_name,
+                            "alt_sub_sample_id"] = main_name
+        else:
+            for (_id, ambig) in enumerate(sub_groups):
+                sample_data.loc[((sample_data.alt_sample_id == main_name) &
+                                 (sample_data.ambig_id == ambig)),
+                                "alt_sub_sample_id"] = "{sid}.{nid}".format(
+                                    sid=main_name, nid=_id + 1)
+    if (sample_data["alt_sub_sample_id"] == "PLACE_HOLDER").any():
+        raise RuntimeError("Some unique ids could not be assigned")
+
+
+def generate_sample_config(sample_data):
+    samples = dict(
+        (key, dict(("read{read}".format(read=read_id), os.path.join(
+            sample_data[(
+                (sample_data.sub_sample_id == key) &
+                (sample_data.read_id == read_id))].path.iloc[0],
+            sample_data[(
+                (sample_data.sub_sample_id == key) &
+                (sample_data.read_id == read_id))].file.iloc[0]
+        ))
+            for read_id in sample_data[
+                sample_data.sub_sample_id == key].read_id))
+        for key in sample_data.sub_sample_id)
+    for key in samples:
+        samples[key]["alt_id"] = sample_data[
+            sample_data.sub_sample_id == key].alt_sub_sample_id.iloc[0]
+    return samples
+
+
+def main(fastq_dir):
+    # fastq_dir = '/scratch/Projekte/MF1_genome-reconstruction/covpipe_testdata/20211126_FS10000749_53_BPG61617-2127/sedaghatjoo/'
+    assert os.path.isdir(fastq_dir), f"{fastq_dir} does not exist"
+    fastq_all = [f for f in os.listdir(fastq_dir) if re.search(r'\.(fq|fnq|fastq)(\.gz)?', f)]
+
+    res = {"sample_data": pd.DataFrame(
+        columns=["path", "unambig_id", "ambig_id", "read", "read_id",
+                 "sample_id", "sub_sample_id",
+                 "alt_sample_id", "alt_sub_sample_id",
+                 "file", "match", "regex", "state"],
+    ),
+        "delta_files": []}
+    res['sample_data'].set_index(["path", "unambig_id", "ambig_id", "read"], inplace=True, drop=False)
+
+    check_and_add_fastq(fastq_all, res, fastq_dir)
+    samp_data = res["sample_data"]
+    resolve_sample_id_conflicts(samp_data)
+    samples = generate_sample_config(samp_data)
+
+    print("sample,fastq_1,fastq_2")
+    for sample in samples:
+        print(f"{sample},{samples[sample]['read1']},{samples[sample]['read2']}")
+
+
+if __name__ == "__main__":
+    fastq_dir = sys.argv[1]
+    main(fastq_dir)
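
For reference, a quick check (not part of the commit) of how the "illumina1" entry in DEFAULT_FILENAME_PATTERNS decomposes a typical Illumina file name; the file name is made up:

    import re

    # "illumina1" regex from DEFAULT_FILENAME_PATTERNS plus the FASTQ suffix,
    # as assembled in check_and_add_fastq()
    pattern = (r"(?P<sample_id>.+)_(?P<snum>S[\d]+)_(?P<lane>L[\d]{3})_"
               r"(?P<read>R[12])_(?P<running>\d{3})"
               r"\.(fq|fnq|fastq)(\.gz)?")

    match = re.match(pattern, "sampleA_S3_L001_R1_001.fastq.gz")
    print(match.groupdict())
    # {'sample_id': 'sampleA', 'snum': 'S3', 'lane': 'L001', 'read': 'R1', 'running': '001'}

Per the "ambig" entry for this pattern, lane and running number are treated as ambiguous parts, so files differing only in those fields are grouped under the same sample.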

modules/utils.nf

Lines changed: 18 additions & 1 deletion
@@ -1,3 +1,20 @@
+process make_sample_sheet {
+    label 'president'
+    cache false
+    publishDir "${params.output}/${params.runinfo_dir}", mode: params.publish_dir_mode
+
+    input:
+    val(read_dir)
+
+    output:
+    path('sample_sheet.csv')
+
+    script:
+    """
+    make_sample_sheet.py ${read_dir} > sample_sheet.csv
+    """
+}
+
 process compress_reads {
     label 'pigz'
 

@@ -20,7 +37,7 @@ process compress_reads {
 process bgzip_compress {
     label 'samtools'
 
-    publishDir "${params.publish_dir}/${name}", mode: params.publish_dir_mode
+    publishDir "${params.output}/${name}", mode: params.publish_dir_mode
 
    input:
    tuple val(name), path(file)
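
The new `make_sample_sheet` process wraps the script above: it runs it on the read directory and publishes the resulting `sample_sheet.csv` into the run-info directory, with `cache false` forcing the sheet to be regenerated on every run rather than restored from the work cache. The `bgzip_compress` hunk also switches `publishDir` from `params.publish_dir`, which does not appear in the pipeline's `valid_params` set, to the `params.output` convention used elsewhere.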

nextflow.config

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ params {
     // reads
     fastq = ''
     list = false
+    dir = false
     mode = 'paired'
     run_id = ''
