From 51b2f96f4c1734d5dc9b4bda0e83970c8ca1b7dc Mon Sep 17 00:00:00 2001 From: JoseEspinosa Date: Thu, 4 Jul 2024 13:04:13 +0200 Subject: [PATCH 1/3] Check for sample names allowed characters and report line number when erroring --- bin/check_samplesheet.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index f422a287..e9b5ab83 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -3,7 +3,7 @@ import os import sys import argparse - +import re def parse_args(args=None): Description = "Reformat nf-core/atacseq samplesheet file and check its contents." @@ -62,22 +62,22 @@ def check_samplesheet(file_in, file_out, with_control=False): sys.exit(1) ## Check sample entries - for line in fin: + for line_number, line in enumerate(fin, start=1): if line.strip(): lspl = [x.strip().strip('"') for x in line.strip().split(",")] # Check valid number of columns per row if len(lspl) < len(HEADER): print_error( - "Invalid number of columns (minimum = {})!".format(len(HEADER)), - "Line", + "Invalid number of columns (found = {}, minimum = {})!".format(len(lspl),len(HEADER)), + "Line {}".format(line_number), line, ) num_cols = len([x for x in lspl[: len(HEADER)] if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), - "Line", + "Invalid number of populated columns (found = {}, minimum = {})!".format(num_cols,MIN_COLS), + "Line {}".format(line_number), line, ) @@ -85,27 +85,34 @@ def check_samplesheet(file_in, file_out, with_control=False): sample, fastq_1, fastq_2, replicate = lspl[: len(HEADER) - 2 if with_control else len(HEADER)] control = lspl[len(HEADER) - 2] if with_control else "" control_replicate = lspl[len(HEADER) - 1] if with_control else "" + if sample.find(" ") != -1: print(f"WARNING: Spaces have been replaced by underscores for sample: {sample}") sample = sample.replace(" ", "_") if not 
sample: - print_error("Sample entry has not been specified!", "Line", line) + print_error("Sample entry has not been specified!", "Line {}".format(line_number), line) + if not re.match(r"^[a-zA-Z0-9_.-]+$", sample): + print_error( + "Sample name contains invalid characters! Only alphanumeric characters, underscores, dots and dashes are allowed.", + "Line {}".format(line_number), + line, + ) ## Check FastQ file extension for fastq in [fastq_1, fastq_2]: if fastq: if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) + print_error("FastQ file contains spaces!", "Line {}".format(line_number), line) if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): print_error( "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", - "Line", + "Line {}".format(line_number), line, ) ## Check replicate column is integer if not replicate.isdecimal(): - print_error("Replicate id not an integer!", "Line", line) + print_error("Replicate id not an integer!", "Line {}".format(line_number), line) sys.exit(1) if with_control and control: @@ -113,7 +120,7 @@ def check_samplesheet(file_in, file_out, with_control=False): print(f"WARNING: Spaces have been replaced by underscores for control: {control}") control = control.replace(" ", "_") if not control_replicate.isdecimal(): - print_error("Control replicate id not an integer!", "Line", line) + print_error("Replicate id not an integer!", "Line {}".format(line_number), line) sys.exit(1) control = "{}_REP{}".format(control, control_replicate) @@ -126,7 +133,7 @@ def check_samplesheet(file_in, file_out, with_control=False): elif sample and fastq_1 and not fastq_2: sample_info = [fastq_1, fastq_2, replicate, "1", control] else: - print_error("Invalid combination of columns provided!", "Line", line) + print_error("Invalid combination of columns provided!", "Line {}".format(line_number), line) ## Create sample mapping dictionary = {sample: {replicate: [[ fastq_1, fastq_2, replicate, control, 
single_end ]]}} replicate = int(replicate) @@ -137,7 +144,7 @@ def check_samplesheet(file_in, file_out, with_control=False): sample_mapping_dict[sample][replicate] = [sample_info] else: if sample_info in sample_mapping_dict[sample][replicate]: - print_error("Samplesheet contains duplicate rows!", "Line", line) + print_error("Samplesheet contains duplicate rows!", "Line {}".format(line_number), line) else: sample_mapping_dict[sample][replicate].append(sample_info) From 90a440a8bb25106232efce2d148ba002718c3c3e Mon Sep 17 00:00:00 2001 From: JoseEspinosa Date: Thu, 4 Jul 2024 13:09:30 +0200 Subject: [PATCH 2/3] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 878073b5..f88231b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated pipeline template to [nf-core/tools 2.14.1](https://github.com/nf-core/tools/releases/tag/2.14.1) - [[#359](https://github.com/nf-core/atacseq/issues/359)] - Fix `--save_unaligned` description in schema. - [[#344](https://github.com/nf-core/atacseq/issues/344)] - Fix memory issues when sorting merged replicates after `bedtools genomecov`. +- [[#338](https://github.com/nf-core/atacseq/issues/338)] - Check that samplesheet sample IDs contain only alphanumeric characters, dots, dashes or underscores. 
+- [[#370](https://github.com/nf-core/atacseq/issues/370)] - Add line numbers to error messages in `bin/check_samplesheet.py` ### Parameters From 50f1b0db5decb775dd906f925ca9e8314ee43ae9 Mon Sep 17 00:00:00 2001 From: JoseEspinosa Date: Thu, 4 Jul 2024 13:14:40 +0200 Subject: [PATCH 3/3] Make linting happy --- bin/check_samplesheet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index e9b5ab83..e85fefe9 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -85,7 +85,7 @@ def check_samplesheet(file_in, file_out, with_control=False): sample, fastq_1, fastq_2, replicate = lspl[: len(HEADER) - 2 if with_control else len(HEADER)] control = lspl[len(HEADER) - 2] if with_control else "" control_replicate = lspl[len(HEADER) - 1] if with_control else "" - + if sample.find(" ") != -1: print(f"WARNING: Spaces have been replaced by underscores for sample: {sample}") sample = sample.replace(" ", "_")