From e7fba4870cde9b676a6f4446fdb5a411dade8185 Mon Sep 17 00:00:00 2001 From: JoseEspinosa Date: Thu, 4 Jul 2024 14:12:20 +0200 Subject: [PATCH] Check for sample IDs allowed characters --- CHANGELOG.md | 3 ++- bin/check_samplesheet.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 777d00b5..be4590e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#370](https://github.com/nf-core/chipseq/issues/370)] - Fix stack overflow exceptions in phantompeakqualtools ([see here](https://github.com/kundajelab/phantompeakqualtools/issues/3)). - [[#387](https://github.com/nf-core/chipseq/issues/387)] - Get rid of the `lib` folder and rearrange the pipeline accordingly. - [[#385](https://github.com/nf-core/chipseq/issues/385)] - Fix `--save_unaligned` description in schema. -- [[PR #392](https://github.com/nf-core/chipseq/pull/392)] - Adding line numbers to warnings/errors messages in `bin/check_samplesheet.py` +- [[PR #392](https://github.com/nf-core/chipseq/pull/392)] - Adding line numbers to warnings/errors messages in `bin/check_samplesheet.py`. +- [[#396](https://github.com/nf-core/chipseq/issues/396)] - Check that samplesheet sample IDs contain only alphanumeric characters, dots, dashes or underscores. ### Software dependencies diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 36351a71..c7d8add7 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -4,6 +4,7 @@ import sys import errno import argparse +import re def parse_args(args=None): @@ -84,22 +85,28 @@ def check_samplesheet(file_in, file_out): sample = sample.replace(" ", "_") if not sample: print_error("Sample entry has not been specified!", "Line {}".format(line_number), line) + if not re.match(r"^[a-zA-Z0-9_.-]+$", sample): + print_error( + "Sample name contains invalid characters! 
Only alphanumeric characters, underscores, dots and dashes are allowed.", + "Line {}".format(line_number), + line, + ) ## Check FastQ file extension for fastq in [fastq_1, fastq_2]: if fastq: if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line {}".format(lineNo), line) + print_error("FastQ file contains spaces!", "Line {}".format(line_number), line) if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): print_error( "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", - "Line {}".format(lineNo), + "Line {}".format(line_number), line, ) ## Check replicate column is integer if not replicate.isdecimal(): - print_error("Replicate id not an integer!", "Line {}".format(lineNo), line) + print_error("Replicate id not an integer!", "Line {}".format(line_number), line) sys.exit(1) ## Check antibody and control columns have valid values