- #!/usr/bin/env python3
+ #!/usr/bin/env python

+ import os
+ import sys
+ import errno
+ import argparse

- """Provide a command line tool to validate and transform tabular samplesheets."""

+ def parse_args(args=None):
+     Description = "Reformat avantonder/bovisanalyzer samplesheet file and check its contents."
+     Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

- import argparse
- import csv
- import logging
- import sys
- from collections import Counter
- from pathlib import Path
+     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+     parser.add_argument("FILE_IN", help="Input samplesheet file.")
+     parser.add_argument("FILE_OUT", help="Output file.")
+     return parser.parse_args(args)


- logger = logging.getLogger()
+ def make_dir(path):
+     if len(path) > 0:
+         try:
+             os.makedirs(path)
+         except OSError as exception:
+             if exception.errno != errno.EEXIST:
+                 raise exception


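The `argparse`/`pathlib`/`logging` setup of the previous version is replaced by the two small helpers above: `parse_args` exposes just the two positional arguments `FILE_IN` and `FILE_OUT`, and `make_dir` is an `errno`-based equivalent of `mkdir -p`. A minimal sketch of how they are meant to be called (the paths are placeholders, not part of the commit):

    # Illustrative only: placeholder paths.
    args = parse_args(["samplesheet.csv", "results/samplesheet.valid.csv"])
    make_dir("results")  # silently succeeds if the directory already exists
    print(args.FILE_IN, args.FILE_OUT)
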
- class RowChecker:
-     """
-     Define a service that can validate and transform each given row.
-     Attributes:
-         modified (list): A list of dicts, where each dict corresponds to a previously
-             validated and transformed row. The order of rows is maintained.
-     """
-
-     VALID_FORMATS = (
-         ".fq.gz",
-         ".fastq.gz",
-     )
-
-     def __init__(
-         self,
-         sample_col="sample",
-         first_col="fastq_1",
-         second_col="fastq_2",
-         single_col="single_end",
-         **kwargs,
-     ):
-         """
-         Initialize the row checker with the expected column names.
-         Args:
-             sample_col (str): The name of the column that contains the sample name
-                 (default "sample").
-             first_col (str): The name of the column that contains the first (or only)
-                 FASTQ file path (default "fastq_1").
-             second_col (str): The name of the column that contains the second (if any)
-                 FASTQ file path (default "fastq_2").
-             single_col (str): The name of the new column that will be inserted and
-                 records whether the sample contains single- or paired-end sequencing
-                 reads (default "single_end").
-         """
-         super().__init__(**kwargs)
-         self._sample_col = sample_col
-         self._first_col = first_col
-         self._second_col = second_col
-         self._single_col = single_col
-         self._seen = set()
-         self.modified = []
-
-     def validate_and_transform(self, row):
-         """
-         Perform all validations on the given row and insert the read pairing status.
-         Args:
-             row (dict): A mapping from column headers (keys) to elements of that row
-                 (values).
-         """
-         self._validate_sample(row)
-         self._validate_first(row)
-         self._validate_second(row)
-         self._validate_pair(row)
-         self._seen.add((row[self._sample_col], row[self._first_col]))
-         self.modified.append(row)
-
-     def _validate_sample(self, row):
-         """Assert that the sample name exists and convert spaces to underscores."""
-         assert len(row[self._sample_col]) > 0, "Sample input is required."
-         # Sanitize samples slightly.
-         row[self._sample_col] = row[self._sample_col].replace(" ", "_")
-
-     def _validate_first(self, row):
-         """Assert that the first FASTQ entry is non-empty and has the right format."""
-         assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required."
-         self._validate_fastq_format(row[self._first_col])
-
-     def _validate_second(self, row):
-         """Assert that the second FASTQ entry has the right format if it exists."""
-         if len(row[self._second_col]) > 0:
-             self._validate_fastq_format(row[self._second_col])
-
-     def _validate_pair(self, row):
-         """Assert that read pairs have the same file extension. Report pair status."""
-         if row[self._first_col] and row[self._second_col]:
-             row[self._single_col] = False
-             assert (
-                 Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
-             ), "FASTQ pairs must have the same file extensions."
-         else:
-             row[self._single_col] = True
-
-     def _validate_fastq_format(self, filename):
-         """Assert that a given filename has one of the expected FASTQ extensions."""
-         assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), (
-             f"The FASTQ file has an unrecognized extension: {filename}\n"
-             f"It should be one of: {', '.join(self.VALID_FORMATS)}"
+ def print_error(error, context="Line", context_str=""):
+     error_str = "ERROR: Please check samplesheet -> {}".format(error)
+     if context != "" and context_str != "":
+         error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+             error, context.strip(), context_str.strip()
        )
+     print(error_str)
+     sys.exit(1)

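`print_error` replaces the assertion- and `logging`-based reporting of the removed `RowChecker`: a validation problem is now printed to stdout and the script exits with status 1. A hedged illustration of the resulting message format (the malformed row shown is made up):

    # Illustrative only: a made-up malformed row; print_error() also terminates the script.
    print_error("Invalid number of columns (minimum = 3)!", "Line", "SAMPLE_X,reads_1.fastq.gz")
    # prints:
    #   ERROR: Please check samplesheet -> Invalid number of columns (minimum = 3)!
    #   Line: 'SAMPLE_X,reads_1.fastq.gz'
    # and then calls sys.exit(1)
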
-     def validate_unique_samples(self):
-         """
-         Assert that the combination of sample name and FASTQ filename is unique.
-         In addition to the validation, also rename the sample if more than one sample,
-         FASTQ file combination exists.
-         """
-         assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
-         if len({pair[0] for pair in self._seen}) < len(self._seen):
-             counts = Counter(pair[0] for pair in self._seen)
-             seen = Counter()
-             for row in self.modified:
-                 sample = row[self._sample_col]
-                 seen[sample] += 1
-                 if counts[sample] > 1:
-                     row[self._sample_col] = f"{sample}_T{seen[sample]}"
-
-
- def read_head(handle, num_lines=10):
-     """Read the specified number of lines from the current position in the file."""
-     lines = []
-     for idx, line in enumerate(handle):
-         if idx == num_lines:
-             break
-         lines.append(line)
-     return "".join(lines)
-
-
- def sniff_format(handle):
-     """
-     Detect the tabular format.
-     Args:
-         handle (text file): A handle to a `text file`_ object. The read position is
-             expected to be at the beginning (index 0).
-     Returns:
-         csv.Dialect: The detected tabular format.
-     .. _text file:
-         https://docs.python.org/3/glossary.html#term-text-file
-     """
-     peek = read_head(handle)
-     handle.seek(0)
-     sniffer = csv.Sniffer()
-     if not sniffer.has_header(peek):
-         logger.critical(f"The given sample sheet does not appear to contain a header.")
-         sys.exit(1)
-     dialect = sniffer.sniff(peek)
-     return dialect

-
- def check_samplesheet(file_in, file_out):
+ def check_illumina_samplesheet(file_in, file_out):
    """
-     Check that the tabular samplesheet has the structure expected by nf-core pipelines.
-     Validate the general shape of the table, expected columns, and each row. Also add
-     an additional column which records whether one or two FASTQ reads were found.
-     Args:
-         file_in (pathlib.Path): The given tabular samplesheet. The format can be either
-             CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
-         file_out (pathlib.Path): Where the validated and transformed samplesheet should
-             be created; always in CSV format.
-     Example:
-         This function checks that the samplesheet follows the following structure,
-         see also the `viral recon samplesheet`_::
-             sample,fastq_1,fastq_2
-             SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
-             SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
-             SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
-     .. _viral recon samplesheet:
-         https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
+     This function checks that the samplesheet follows the following structure:
+     sample,fastq_1,fastq_2
+     SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
+     SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
+     SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+     For an example see:
+     https://github.com/nf-core/test-datasets/blob/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
    """
-     required_columns = {"sample", "fastq_1", "fastq_2"}
-     # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
-     with file_in.open(newline="") as in_handle:
-         reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
-         # Validate the existence of the expected header columns.
-         if not required_columns.issubset(reader.fieldnames):
-             logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.")
+
+     sample_mapping_dict = {}
+     with open(file_in, "r") as fin:
+
+         ## Check header
+         MIN_COLS = 2
+         HEADER = ["sample", "fastq_1", "fastq_2"]
+         header = [x.strip('"') for x in fin.readline().strip().split(",")]
+         if header[: len(HEADER)] != HEADER:
+             print(
+                 "ERROR: Please check samplesheet header -> {} != {}".format(
+                     ",".join(header), ",".join(HEADER)
+                 )
+             )
            sys.exit(1)
-         # Validate each row.
-         checker = RowChecker()
-         for i, row in enumerate(reader):
-             try:
-                 checker.validate_and_transform(row)
-             except AssertionError as error:
-                 logger.critical(f"{str(error)} On line {i + 2}.")
-                 sys.exit(1)
-         checker.validate_unique_samples()
-         header = list(reader.fieldnames)
-         header.insert(1, "single_end")
-     # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
-     with file_out.open(mode="w", newline="") as out_handle:
-         writer = csv.DictWriter(out_handle, header, delimiter=",")
-         writer.writeheader()
-         for row in checker.modified:
-             writer.writerow(row)
-
-
- def parse_args(argv=None):
-     """Define and immediately parse command line arguments."""
-     parser = argparse.ArgumentParser(
-         description="Validate and transform a tabular samplesheet.",
-         epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
-     )
-     parser.add_argument(
-         "file_in",
-         metavar="FILE_IN",
-         type=Path,
-         help="Tabular input samplesheet in CSV or TSV format.",
-     )
-     parser.add_argument(
-         "file_out",
-         metavar="FILE_OUT",
-         type=Path,
-         help="Transformed output samplesheet in CSV format.",
-     )
-     parser.add_argument(
-         "-l",
-         "--log-level",
-         help="The desired log level (default WARNING).",
-         choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
-         default="WARNING",
-     )
-     return parser.parse_args(argv)
-
-
- def main(argv=None):
-     """Coordinate argument parsing and program execution."""
-     args = parse_args(argv)
-     logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
-     if not args.file_in.is_file():
-         logger.error(f"The given input file {args.file_in} was not found!")
-         sys.exit(2)
-     args.file_out.parent.mkdir(parents=True, exist_ok=True)
-     check_samplesheet(args.file_in, args.file_out)
+
+         ## Check sample entries
+         for line in fin:
+             lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+             # Check valid number of columns per row
+             if len(lspl) < len(HEADER):
+                 print_error(
+                     "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                     "Line",
+                     line,
+                 )
+             num_cols = len([x for x in lspl if x])
+             if num_cols < MIN_COLS:
+                 print_error(
+                     "Invalid number of populated columns (minimum = {})!".format(
+                         MIN_COLS
+                     ),
+                     "Line",
+                     line,
+                 )
+
+             ## Check sample name entries
+             sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+             if sample.find(" ") != -1:
+                 print(
+                     f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                 )
+                 sample = sample.replace(" ", "_")
+             if not sample:
+                 print_error("Sample entry has not been specified!", "Line", line)
+
+             ## Check FastQ file extension
+             for fastq in [fastq_1, fastq_2]:
+                 if fastq:
+                     if fastq.find(" ") != -1:
+                         print_error("FastQ file contains spaces!", "Line", line)
+                     if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                         print_error(
+                             "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                             "Line",
+                             line,
+                         )
+
+             ## Auto-detect paired-end/single-end
+             sample_info = []  ## [single_end, fastq_1, fastq_2]
+             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                 sample_info = ["0", fastq_1, fastq_2]
+             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                 sample_info = ["1", fastq_1, fastq_2]
+             else:
+                 print_error("Invalid combination of columns provided!", "Line", line)
+
+             ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
+             if sample not in sample_mapping_dict:
+                 sample_mapping_dict[sample] = [sample_info]
+             else:
+                 if sample_info in sample_mapping_dict[sample]:
+                     print_error("Samplesheet contains duplicate rows!", "Line", line)
+                 else:
+                     sample_mapping_dict[sample].append(sample_info)
+
+     ## Write validated samplesheet with appropriate columns
+     if len(sample_mapping_dict) > 0:
+         out_dir = os.path.dirname(file_out)
+         make_dir(out_dir)
+         with open(file_out, "w") as fout:
+             fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+             for sample in sorted(sample_mapping_dict.keys()):
+
+                 ## Check that multiple runs of the same sample are of the same datatype
+                 if not all(
+                     x[0] == sample_mapping_dict[sample][0][0]
+                     for x in sample_mapping_dict[sample]
+                 ):
+                     print_error(
+                         "Multiple runs of a sample must be of the same datatype!",
+                         "Sample: {}".format(sample),
+                     )
+
+                 for idx, val in enumerate(sample_mapping_dict[sample]):
+                     fout.write(
+                         ",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n"
+                     )
+     else:
+         print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+
+ def main(args=None):
+     args = parse_args(args)
+
+     check_illumina_samplesheet(args.FILE_IN, args.FILE_OUT)

if __name__ == "__main__":
    sys.exit(main())
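End to end, the rewritten script reads the raw samplesheet, validates each row, and writes a new CSV with a `single_end` column and a `_T<run>` suffix appended to every sample name. Unlike the previous `RowChecker` version, the suffix is added even when a sample has only one run, and `single_end` is written as `0`/`1` rather than `False`/`True`. A worked sketch with placeholder file names, equivalent to running `python check_samplesheet.py samplesheet.csv samplesheet.valid.csv`:

    # Illustrative only: placeholder file names and contents.
    # samplesheet.csv:
    #   sample,fastq_1,fastq_2
    #   SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
    #   SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
    #   SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
    check_illumina_samplesheet("samplesheet.csv", "samplesheet.valid.csv")
    # samplesheet.valid.csv is then written as:
    #   sample,single_end,fastq_1,fastq_2
    #   SAMPLE_PE_T1,0,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
    #   SAMPLE_PE_T2,0,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
    #   SAMPLE_SE_T1,1,SAMPLE_SE_RUN1_1.fastq.gz,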