Skip to content

Commit

Permalink
Merge pull request #562 from moka-guys/gzip_checks
Browse files Browse the repository at this point in the history
Modification of gzip validation logic (#562)

Co-Authored-By: rebeccahaines1 <84131466+rebeccahaines1@users.noreply.github.com>
Co-Authored-By: RachelDuffin <rachel.g.duffin@gmail.com>
  • Loading branch information
rebeccahaines1 and RachelDuffin authored Jan 28, 2025
2 parents 6bd87d8 + 4d0ecb7 commit 9375f93
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 11 deletions.
4 changes: 2 additions & 2 deletions config/log_msgs_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
"pipelines for the same run. Supported pipelines: %s"
),
"ss_missing": "SampleSheet is missing and is required for sample name parsing",
"fastq_valid": "Gzip --test determined that the fastq is valid: %s",
"fastq_invalid": "Gzip --test determined that the fastq is not valid: %s. Stdout: %s. Stderr: %s",
"fastq_valid": "Gzip testing determined that the fastq is valid: %s",
"fastq_invalid": "Gzip testing determined that the fastq is not valid: %s. Error: %s",
"demux_success": "Demultiplexing was successful for the run with all fastqs valid",
"wes_batch_nos_identified": "WES batch numbers %s identified",
"wes_batch_nos_missing": "WES batch numbers missing. Check for errors in the sample names. Script exited",
Expand Down
47 changes: 38 additions & 9 deletions toolbox/toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from typing import Union, Optional
from config.ad_config import ToolboxConfig
from ad_logger.ad_logger import RunfolderLoggers
import gzip
import zlib


def get_credential(file: str) -> None:
Expand Down Expand Up @@ -304,6 +306,34 @@ def get_samplename_dict(
logger.error(logger.log_msgs["ss_missing"])


def validate_fastq_gzip(file_path: str):
    """
    Fast (partial) gzip validation of a fastq file. Checks the gzip header
    magic bytes, the ISIZE field in the trailer, and that the first 1KB of
    decompressed data can be read. This is a quick sanity check only - the
    whole file is NOT decompressed, so corruption beyond the first block(s)
    is not detected here

        :param file_path (str):     Path to the gzipped fastq file
        :return (tuple):            (True, None) if all checks pass, otherwise
                                    (False, message describing the failure).
                                    Errors are never raised to the caller
    """
    gzip_magic = b"\x1f\x8b"
    # Smallest possible gzip member framing per RFC 1952: 10-byte header plus
    # 8-byte trailer (CRC32 + ISIZE). Anything smaller cannot be complete
    min_gzip_size = 18

    try:
        with open(file_path, "rb") as fh:
            # Check compressed file header (magic number check)
            magic = fh.read(2)
            if not magic:
                return False, "Invalid gzip magic bytes: file is empty"
            if magic != gzip_magic:
                return False, f"Invalid gzip magic bytes: {magic.hex()}"

            # Guard the trailer read: seeking 4 bytes back from the end of a
            # shorter file raises an uninformative "Invalid argument" OSError
            file_size = fh.seek(0, os.SEEK_END)
            if file_size < min_gzip_size:
                return (
                    False,
                    f"File too small to be a valid gzip file: {file_size} bytes",
                )

            # Check footer (last 4 bytes hold ISIZE, little-endian).
            # NOTE: ISIZE is the uncompressed size modulo 2^32 of the final
            # gzip member only, so zero is treated as invalid heuristically
            fh.seek(-4, os.SEEK_END)
            isize = int.from_bytes(fh.read(4), "little")
            if isize == 0:
                return False, "Invalid zero uncompressed size"

        # Quick decompression check - only read first 1KB of decompressed data
        with gzip.open(file_path, "rb") as gz:
            gz.read(1024)

        return True, None

    except (OSError, EOFError, zlib.error) as e:
        # Covers missing/unreadable files, bad gzip headers and corrupt or
        # truncated deflate data encountered in the portion that was read
        return False, f"Validation error: {e}"
    except Exception as e:
        # Deliberate catch-all so that one bad file is reported as invalid
        # rather than aborting validation of the remaining fastqs
        return False, f"Unexpected error: {e}"


def validate_fastqs(fastq_dir_path: str, logger: logging.Logger) -> Optional[bool]:
"""
Validate the created fastqs in the BaseCalls directory and log success
Expand All @@ -317,25 +347,24 @@ def validate_fastqs(fastq_dir_path: str, logger: logging.Logger) -> Optional[boo
returncodes = []

for fastq in fastqs:
out, err, returncode = execute_subprocess_command(
f"gzip --test {os.path.join(fastq_dir_path, fastq)}",
logger,
)
returncodes.append(returncode)
if returncode == 0:
full_path = os.path.join(fastq_dir_path, fastq)
is_valid, error_msg = validate_fastq_gzip(full_path)

if is_valid:
logger.info(
logger.log_msgs["fastq_valid"],
fastq,
)
returncodes.append(True)
else:
logger.error(
logger.log_msgs["fastq_invalid"],
fastq,
out,
err,
error_msg,
)
returncodes.append(False)

if all(code == 0 for code in returncodes):
if all(returncodes):
logger.info(logger.log_msgs["demux_success"])
return True

Expand Down
3 changes: 3 additions & 0 deletions wscleaner/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def get_arguments():
# no directories are deleted by the runfolder manager
if parsed_args.dry_run or BRANCH != "main":
dry_run = True # Protects against deleting the test folders (!!)
else:
dry_run = False


RFM = RunFolderManager(
dry_run=dry_run,
Expand Down

0 comments on commit 9375f93

Please sign in to comment.