Skip to content

Commit 4e2de08

Browse files
authored
Merge pull request #41 from avantonder/apha
fix check_samplesheet.py
2 parents b27324e + ee69793 commit 4e2de08

File tree

2 files changed

+161
-224
lines changed

2 files changed

+161
-224
lines changed

bin/check_samplesheet.py

Lines changed: 138 additions & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -1,242 +1,156 @@
1-
#!/usr/bin/env python3
1+
#!/usr/bin/env python
22

3+
import os
4+
import sys
5+
import errno
6+
import argparse
37

4-
"""Provide a command line tool to validate and transform tabular samplesheets."""
58

9+
def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            reads the arguments from ``sys.argv``.

    Returns:
        argparse.Namespace: Holds the two positional values as the
        attributes ``FILE_IN`` and ``FILE_OUT``.
    """
    parser = argparse.ArgumentParser(
        description="Reformat avantonder/bovisanalyzer samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    # Both arguments are required positionals, declared in call order.
    positionals = (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    )
    for name, help_text in positionals:
        parser.add_argument(name, help=help_text)
    return parser.parse_args(args)
1317

1418

15-
logger = logging.getLogger()
19+
def make_dir(path):
    """Create directory ``path``, including missing parents, if it is non-empty.

    An empty ``path`` is a no-op, and an already existing directory is
    accepted silently.

    Args:
        path: Directory to create.

    Raises:
        OSError: If the directory cannot be created. This includes
            ``FileExistsError`` when ``path`` exists but is not a directory,
            which the previous errno-based check silently ignored.
    """
    if path:
        # exist_ok=True tolerates an existing directory only; a regular file
        # at ``path`` still raises, so the problem surfaces here and not later.
        os.makedirs(path, exist_ok=True)
1626

1727

18-
class RowChecker:
19-
"""
20-
Define a service that can validate and transform each given row.
21-
Attributes:
22-
modified (list): A list of dicts, where each dict corresponds to a previously
23-
validated and transformed row. The order of rows is maintained.
24-
"""
25-
26-
VALID_FORMATS = (
27-
".fq.gz",
28-
".fastq.gz",
29-
)
30-
31-
def __init__(
32-
self,
33-
sample_col="sample",
34-
first_col="fastq_1",
35-
second_col="fastq_2",
36-
single_col="single_end",
37-
**kwargs,
38-
):
39-
"""
40-
Initialize the row checker with the expected column names.
41-
Args:
42-
sample_col (str): The name of the column that contains the sample name
43-
(default "sample").
44-
first_col (str): The name of the column that contains the first (or only)
45-
FASTQ file path (default "fastq_1").
46-
second_col (str): The name of the column that contains the second (if any)
47-
FASTQ file path (default "fastq_2").
48-
single_col (str): The name of the new column that will be inserted and
49-
records whether the sample contains single- or paired-end sequencing
50-
reads (default "single_end").
51-
"""
52-
super().__init__(**kwargs)
53-
self._sample_col = sample_col
54-
self._first_col = first_col
55-
self._second_col = second_col
56-
self._single_col = single_col
57-
self._seen = set()
58-
self.modified = []
59-
60-
def validate_and_transform(self, row):
61-
"""
62-
Perform all validations on the given row and insert the read pairing status.
63-
Args:
64-
row (dict): A mapping from column headers (keys) to elements of that row
65-
(values).
66-
"""
67-
self._validate_sample(row)
68-
self._validate_first(row)
69-
self._validate_second(row)
70-
self._validate_pair(row)
71-
self._seen.add((row[self._sample_col], row[self._first_col]))
72-
self.modified.append(row)
73-
74-
def _validate_sample(self, row):
75-
"""Assert that the sample name exists and convert spaces to underscores."""
76-
assert len(row[self._sample_col]) > 0, "Sample input is required."
77-
# Sanitize samples slightly.
78-
row[self._sample_col] = row[self._sample_col].replace(" ", "_")
79-
80-
def _validate_first(self, row):
81-
"""Assert that the first FASTQ entry is non-empty and has the right format."""
82-
assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required."
83-
self._validate_fastq_format(row[self._first_col])
84-
85-
def _validate_second(self, row):
86-
"""Assert that the second FASTQ entry has the right format if it exists."""
87-
if len(row[self._second_col]) > 0:
88-
self._validate_fastq_format(row[self._second_col])
89-
90-
def _validate_pair(self, row):
91-
"""Assert that read pairs have the same file extension. Report pair status."""
92-
if row[self._first_col] and row[self._second_col]:
93-
row[self._single_col] = False
94-
assert (
95-
Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
96-
), "FASTQ pairs must have the same file extensions."
97-
else:
98-
row[self._single_col] = True
99-
100-
def _validate_fastq_format(self, filename):
101-
"""Assert that a given filename has one of the expected FASTQ extensions."""
102-
assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), (
103-
f"The FASTQ file has an unrecognized extension: {filename}\n"
104-
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
28+
def print_error(error, context="Line", context_str=""):
    """Report a samplesheet validation error on stderr and exit with status 1.

    Args:
        error: Description of the problem; appended to the fixed
            ``ERROR: Please check samplesheet -> `` prefix.
        context: Label for the offending item (default ``"Line"``).
        context_str: The offending item itself. The ``context: 'context_str'``
            line is only emitted when both ``context`` and ``context_str``
            are non-empty.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    error_str = "ERROR: Please check samplesheet -> {}".format(error)
    if context != "" and context_str != "":
        error_str += "\n{}: '{}'".format(context.strip(), context_str.strip())
    # Diagnostics belong on stderr so they are not mixed into regular output.
    print(error_str, file=sys.stderr)
    sys.exit(1)
10636

107-
def validate_unique_samples(self):
108-
"""
109-
Assert that the combination of sample name and FASTQ filename is unique.
110-
In addition to the validation, also rename the sample if more than one sample,
111-
FASTQ file combination exists.
112-
"""
113-
assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
114-
if len({pair[0] for pair in self._seen}) < len(self._seen):
115-
counts = Counter(pair[0] for pair in self._seen)
116-
seen = Counter()
117-
for row in self.modified:
118-
sample = row[self._sample_col]
119-
seen[sample] += 1
120-
if counts[sample] > 1:
121-
row[self._sample_col] = f"{sample}_T{seen[sample]}"
122-
123-
124-
def read_head(handle, num_lines=10):
125-
"""Read the specified number of lines from the current position in the file."""
126-
lines = []
127-
for idx, line in enumerate(handle):
128-
if idx == num_lines:
129-
break
130-
lines.append(line)
131-
return "".join(lines)
132-
133-
134-
def sniff_format(handle):
135-
"""
136-
Detect the tabular format.
137-
Args:
138-
handle (text file): A handle to a `text file`_ object. The read position is
139-
expected to be at the beginning (index 0).
140-
Returns:
141-
csv.Dialect: The detected tabular format.
142-
.. _text file:
143-
https://docs.python.org/3/glossary.html#term-text-file
144-
"""
145-
peek = read_head(handle)
146-
handle.seek(0)
147-
sniffer = csv.Sniffer()
148-
if not sniffer.has_header(peek):
149-
logger.critical(f"The given sample sheet does not appear to contain a header.")
150-
sys.exit(1)
151-
dialect = sniffer.sniff(peek)
152-
return dialect
15337

154-
155-
def check_samplesheet(file_in, file_out):
38+
def check_illumina_samplesheet(file_in, file_out):
    """
    Validate an Illumina samplesheet and write a normalised copy of it.

    This function checks that the samplesheet follows the following structure:

    sample,fastq_1,fastq_2
    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,

    For an example see:
    https://github.com/nf-core/test-datasets/blob/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

    The output file has the columns ``sample,single_end,fastq_1,fastq_2``.
    ``single_end`` is "1" for rows with only ``fastq_1`` and "0" for rows
    with both FASTQ files, and every sample name gets a ``_T<n>`` suffix
    numbering its runs in input order. Samples are written in sorted order.

    Args:
        file_in: Path of the comma-separated input samplesheet.
        file_out: Path of the validated samplesheet to create; its parent
            directory is created when missing.

    Raises:
        SystemExit: With exit code 1 on the first validation failure. All
            validation happens before ``file_out`` is opened, so a failed
            run never leaves a partially written output file behind.
    """
    MIN_COLS = 2
    HEADER = ["sample", "fastq_1", "fastq_2"]
    FASTQ_EXTENSIONS = (".fastq.gz", ".fq.gz")

    ## { sample: [ [ single_end, fastq_1, fastq_2 ], ... ] }
    sample_mapping_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                ),
                file=sys.stderr,
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            # A blank line (typically a trailing newline at the end of the
            # file) carries no sample and must not be reported as malformed.
            if not line.strip():
                continue

            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
            if " " in sample:
                print(
                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
                )
                sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file extension
            for fastq in (fastq_1, fastq_2):
                if not fastq:
                    continue
                if " " in fastq:
                    print_error("FastQ file contains spaces!", "Line", line)
                if not fastq.endswith(FASTQ_EXTENSIONS):
                    print_error(
                        "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
                        "Line",
                        line,
                    )

            ## Auto-detect paired-end/single-end
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                sample_info = ["1", fastq_1, fastq_2]
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Record the run, rejecting exact duplicates of an earlier row
            runs = sample_mapping_dict.setdefault(sample, [])
            if sample_info in runs:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            runs.append(sample_info)

    if not sample_mapping_dict:
        # Label and value are passed separately: print_error only prints the
        # context when both are non-empty.
        print_error("No entries to process!", "Samplesheet", str(file_in))

    ## Check that multiple runs of the same sample are of the same datatype.
    ## Done before the output file is opened so that a failure cannot leave
    ## a truncated samplesheet on disk.
    for sample in sorted(sample_mapping_dict):
        runs = sample_mapping_dict[sample]
        if any(run[0] != runs[0][0] for run in runs):
            print_error(
                "Multiple runs of a sample must be of the same datatype!",
                "Sample",
                sample,
            )

    ## Write validated samplesheet with appropriate columns
    make_dir(os.path.dirname(file_out))
    with open(file_out, "w") as fout:
        fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
        for sample in sorted(sample_mapping_dict):
            for idx, val in enumerate(sample_mapping_dict[sample], start=1):
                fout.write(",".join(["{}_T{}".format(sample, idx)] + val) + "\n")
149+
150+
def main(args=None):
    """Parse the command line and run the samplesheet check.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    cli = parse_args(args)
    check_illumina_samplesheet(cli.FILE_IN, cli.FILE_OUT)
240154

241155
# Script entry point. main() has no return statement, so sys.exit receives
# None, which Python treats as a successful (zero) exit status.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)