Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/scripts/validate_schema_completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@
"state_dir",
# Exposed via negated CLI flag --no-slack
"slack_enabled",
# Deacon tuning (set via params-file or preset)
"deacon_kmer_size",
"deacon_window_size",
"deacon_abs_threshold",
"deacon_rel_threshold",
}


Expand Down
9 changes: 9 additions & 0 deletions conf/results.config
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ params {

// Assign the above paths to publish directories in processes throughout the pipeline
process {
withName: 'DEACON_DEPLETE' {
    // Copy the outputs of DEACON_DEPLETE into the "00_host_depletion"
    // sub-folder of params.preprocess_results. The path is a closure so it
    // is resolved lazily, when the process publishes its files.
    publishDir = [
        enabled: true,
        mode: 'copy',
        path: { "${params.preprocess_results}/00_host_depletion" },
        // Returning null from saveAs skips the file: versions.yml is not published.
        saveAs: { fname -> fname == 'versions.yml' ? null : fname }
    ]
}

withName: 'EXTRACT_HUMAN_VIRUS_READS' {
publishDir = [
path: { params.human_virus_reads },
Expand Down
2 changes: 1 addition & 1 deletion lib/py_nvd/_fingerprint.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"main.nf": "d3df999c77a6754811017c07fa446c551d8334e72822a6df1c5cdcacb4715ebb",
"nextflow.config": "0b83a6d10e66f13e1821df96ec76849df034bb97c7758993a0c54cdaa3a38166"
"nextflow.config": "0fd69247b50f0351ea0e9e9942425e5135d2bc0b1fbfccedfb02a84868c5992b"
}
21 changes: 21 additions & 0 deletions lib/py_nvd/cli/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,24 @@ def run(
help="Remove host reads with STAT (requires --sra-human-db; default: follows --preprocess)",
rich_help_panel=PANEL_PREPROCESSING,
),
deacon_index: Path | None = typer.Option(
None,
"--deacon-index",
help="Path to prebuilt deacon index (.idx file)",
rich_help_panel=PANEL_PREPROCESSING,
),
deacon_index_url: str | None = typer.Option(
None,
"--deacon-index-url",
help="URL to download prebuilt deacon index (default: panhuman-1)",
rich_help_panel=PANEL_PREPROCESSING,
),
deacon_contaminants_fasta: Path | None = typer.Option(
None,
"--deacon-contaminants-fasta",
help="Custom contaminant FASTA to union with base index",
rich_help_panel=PANEL_PREPROCESSING,
),
filter_reads: bool | None = typer.Option(
None,
"--filter-reads/--no-filter-reads",
Expand Down Expand Up @@ -641,6 +659,9 @@ def run(
"dedup_pos": dedup_pos,
"trim_adapters": trim_adapters,
"scrub_host_reads": scrub_host_reads,
"deacon_index": deacon_index,
"deacon_index_url": deacon_index_url,
"deacon_contaminants_fasta": deacon_contaminants_fasta,
"filter_reads": filter_reads,
"min_read_quality_illumina": min_read_quality_illumina,
"min_read_quality_nanopore": min_read_quality_nanopore,
Expand Down
37 changes: 37 additions & 0 deletions lib/py_nvd/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,43 @@ class NvdParams(BaseModel):
json_schema_extra={"category": "Preprocessing"},
)

# Host scrubbing with deacon
deacon_index: Path | None = Field(
None,
description="Path to prebuilt deacon index (.idx file)",
json_schema_extra={"category": "Preprocessing"},
)
deacon_index_url: str = Field(
"https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content",
description="URL to download prebuilt deacon index (default: panhuman-1)",
json_schema_extra={"category": "Preprocessing"},
)
deacon_contaminants_fasta: Path | None = Field(
None,
description="Custom contaminant FASTA to union with base index",
json_schema_extra={"category": "Preprocessing"},
)
deacon_kmer_size: int = Field(
31,
description="K-mer size for deacon index (must match index if prebuilt)",
json_schema_extra={"category": "Preprocessing"},
)
deacon_window_size: int = Field(
15,
description="Minimizer window size for deacon index",
json_schema_extra={"category": "Preprocessing"},
)
deacon_abs_threshold: int = Field(
2,
description="Minimum absolute minimizer hits to classify as contaminant",
json_schema_extra={"category": "Preprocessing"},
)
deacon_rel_threshold: float = Field(
0.01,
description="Minimum relative proportion of minimizers (0.0-1.0)",
json_schema_extra={"category": "Preprocessing"},
)

# =========================================================================
# Analysis Parameters
# =========================================================================
Expand Down
119 changes: 119 additions & 0 deletions modules/deacon.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Deacon: Fast alignment-free decontamination
* https://github.com/bede/deacon
*
* Key features:
* - Preserves FASTQ headers (critical for read pairing)
* - Composable indexes via set algebra (union, diff, intersect)
* - SIMD-accelerated, ~5GB RAM for panhuman index
*/

process DEACON_BUILD_INDEX {
    /*
     * Build a deacon minimizer index from a FASTA file.
     * Intended for user-supplied (custom) contaminant sequences.
     *
     * The k-mer and window sizes come from params.deacon_kmer_size and
     * params.deacon_window_size and are also encoded in the index file name
     * (<fasta simpleName>.k<K>w<W>.idx).
     */

    tag "${fasta.simpleName}"
    label "medium"

    input:
    path fasta

    output:
    path "*.idx", emit: index

    script:
    def k = params.deacon_kmer_size
    def w = params.deacon_window_size
    // Name the index after the FASTA and the parameters it was built with.
    def index_name = "${fasta.simpleName}.k${k}w${w}.idx"
    """
    deacon index build \\
        --threads ${task.cpus} \\
        -k ${k} \\
        -w ${w} \\
        ${fasta} > ${index_name}
    """
}

process DEACON_FETCH_INDEX {
    /*
     * Download a prebuilt deacon index from a URL.
     *
     * Takes the URL as a channel value so the process only runs when
     * the input channel is non-empty (no `when:` guard needed).
     * Caches in work directory; use storeDir for persistent caching.
     *
     * Input:  url   - location of the index to download.
     * Output: index - the downloaded file, always named with a `.idx` suffix
     *                 so that it matches the `*.idx` output glob.
     */

    label "low"

    input:
    val url

    output:
    path "*.idx", emit: index

    script:
    // Ignore any query string / fragment so it cannot leak into the file name.
    def segments = url.toString().split('[?#]')[0].tokenize('/')
    // The last path segment is not always the index name: the default Zenodo
    // URL ends in ".../panhuman-1.k31w15.idx/content". Prefer the right-most
    // segment that ends in ".idx"; otherwise append ".idx" to the last segment
    // so the declared "*.idx" output is always produced.
    def filename = segments.reverse().find { it.endsWith('.idx') } ?: "${segments.last()}.idx"
    """
    curl -fsSL "${url}" -o "${filename}"
    """
}

process DEACON_UNION_INDEXES {
    /*
     * Merge several deacon indexes into one via `deacon index union`.
     * Only invoked when a base index and a custom index both exist,
     * so the input always holds at least two .idx files.
     */

    label "low"

    input:
    path indexes // Collection of .idx files (always 2+)

    output:
    path "combined.idx", emit: index

    script:
    // Staged file names, passed to deacon as space-separated arguments.
    def idx_names = indexes*.name
    """
    deacon index union ${idx_names.join(' ')} > combined.idx
    """
}

process DEACON_DEPLETE {
    /*
     * Deplete contaminant reads with `deacon filter --deplete`.
     *
     * FASTQ headers are kept verbatim. That matters downstream: repair.sh
     * needs unmodified headers to re-pair reads after filtering, and SPAdes
     * paired-end assembly relies on correct pairing.
     *
     * Gzipped input and output are handled natively by deacon (since v0.13.0).
     * With a .gz target given via --output, deacon automatically splits
     * --threads 1:1 between filtering and compression.
     *
     * Emits:
     *   reads - the input metadata plus <sample_id>.depleted.fastq.gz
     *   stats - <sample_id>.deacon.json summary, keyed by sample_id
     */

    tag "${sample_id}"
    label "medium"

    // Retry on the first two attempts; from the third attempt on, ignore the failure.
    errorStrategy { task.attempt >= 3 ? 'ignore' : 'retry' }
    maxRetries 2

    input:
    tuple val(sample_id), val(platform), val(read_structure), path(reads), path(index)

    output:
    tuple val(sample_id), val(platform), val(read_structure), path("${sample_id}.depleted.fastq.gz"), emit: reads
    tuple val(sample_id), path("${sample_id}.deacon.json"), emit: stats

    script:
    def summary_json = "${sample_id}.deacon.json"
    def depleted_fastq = "${sample_id}.depleted.fastq.gz"
    """
    deacon filter \\
        --deplete \\
        --threads ${task.cpus} \\
        --abs-threshold ${params.deacon_abs_threshold} \\
        --rel-threshold ${params.deacon_rel_threshold} \\
        --summary ${summary_json} \\
        --output ${depleted_fastq} \\
        ${index} \\
        ${reads}
    """
}
9 changes: 9 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ params {
min_read_length = 50
max_read_length = null

// Host scrubbing with deacon (used when scrub_host_reads is enabled)
deacon_index = null
deacon_index_url = "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content"
deacon_contaminants_fasta = null
deacon_kmer_size = 31
deacon_window_size = 15
deacon_abs_threshold = 2
deacon_rel_threshold = 0.01

// NVD settings
cutoff_percent = 0.001
entropy = 0.9
Expand Down
Loading