Skip to content

Commit

Permalink
Merge pull request #371 from JoseEspinosa/fixes
Browse files Browse the repository at this point in the history
Check for sample IDs characters in samplesheet and report line numbers when erroring
  • Loading branch information
JoseEspinosa authored Jul 4, 2024
2 parents 0623e2e + 50f1b0d commit a238997
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated pipeline template to [nf-core/tools 2.14.1](https://github.com/nf-core/tools/releases/tag/2.14.1)
- [[#359](https://github.com/nf-core/atacseq/issues/359)] - Fix `--save_unaligned` description in schema.
- [[#344](https://github.com/nf-core/atacseq/issues/344)] - Fix memory issues when sorting merged replicates after `bedtools genomecov`.
- [[#338](https://github.com/nf-core/atacseq/issues/338)] - Check that samplesheet sample IDs contain only alphanumeric characters, dots, dashes or underscores.
- [[#370](https://github.com/nf-core/atacseq/issues/370)] - Add line numbers to error messages in `bin/check_samplesheet.py`.

### Parameters

Expand Down
33 changes: 20 additions & 13 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import sys
import argparse

import re

def parse_args(args=None):
Description = "Reformat nf-core/atacseq samplesheet file and check its contents."
Expand Down Expand Up @@ -62,58 +62,65 @@ def check_samplesheet(file_in, file_out, with_control=False):
sys.exit(1)

## Check sample entries
for line in fin:
for line_number, line in enumerate(fin, start=1):
if line.strip():
lspl = [x.strip().strip('"') for x in line.strip().split(",")]

# Check valid number of columns per row
if len(lspl) < len(HEADER):
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
"Line",
"Invalid number of columns (found = {}, minimum = {})!".format(len(lspl),len(HEADER)),
"Line {}".format(line_number),
line,
)
num_cols = len([x for x in lspl[: len(HEADER)] if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
"Line",
"Invalid number of populated columns (found = {}, minimum = {})!".format(num_cols,MIN_COLS),
"Line {}".format(line_number),
line,
)

## Check sample name entries
sample, fastq_1, fastq_2, replicate = lspl[: len(HEADER) - 2 if with_control else len(HEADER)]
control = lspl[len(HEADER) - 2] if with_control else ""
control_replicate = lspl[len(HEADER) - 1] if with_control else ""

if sample.find(" ") != -1:
print(f"WARNING: Spaces have been replaced by underscores for sample: {sample}")
sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)
print_error("Sample entry has not been specified!", "Line {}".format(line_number), line)
if not re.match(r"^[a-zA-Z0-9_.-]+$", sample):
print_error(
"Sample name contains invalid characters! Only alphanumeric characters, underscores, dots and dashes are allowed.",
"Line {}".format(line_number),
line,
)

## Check FastQ file extension
for fastq in [fastq_1, fastq_2]:
if fastq:
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
print_error("FastQ file contains spaces!", "Line {}".format(line_number), line)
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
print_error(
"FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
"Line",
"Line {}".format(line_number),
line,
)

## Check replicate column is integer
if not replicate.isdecimal():
print_error("Replicate id not an integer!", "Line", line)
print_error("Replicate id not an integer!", "Line {}".format(line_number), line)
sys.exit(1)

if with_control and control:
if control.find(" ") != -1:
print(f"WARNING: Spaces have been replaced by underscores for control: {control}")
control = control.replace(" ", "_")
if not control_replicate.isdecimal():
print_error("Control replicate id not an integer!", "Line", line)
print_error("Replicate id not an integer!", "Line {}".format(line_number), line)
sys.exit(1)
control = "{}_REP{}".format(control, control_replicate)

Expand All @@ -126,7 +133,7 @@ def check_samplesheet(file_in, file_out, with_control=False):
elif sample and fastq_1 and not fastq_2:
sample_info = [fastq_1, fastq_2, replicate, "1", control]
else:
print_error("Invalid combination of columns provided!", "Line", line)
print_error("Invalid combination of columns provided!", "Line {}".format(line_number), line)

## Create sample mapping dictionary = {sample: {replicate: [[ fastq_1, fastq_2, replicate, control, single_end ]]}}
replicate = int(replicate)
Expand All @@ -137,7 +144,7 @@ def check_samplesheet(file_in, file_out, with_control=False):
sample_mapping_dict[sample][replicate] = [sample_info]
else:
if sample_info in sample_mapping_dict[sample][replicate]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
print_error("Samplesheet contains duplicate rows!", "Line {}".format(line_number), line)
else:
sample_mapping_dict[sample][replicate].append(sample_info)

Expand Down

0 comments on commit a238997

Please sign in to comment.