From 8f123d7f2fe3345fea79e4681e4adfa26d4fd884 Mon Sep 17 00:00:00 2001 From: Nick Minor Date: Wed, 11 Feb 2026 17:16:46 -0600 Subject: [PATCH 1/2] Add deacon infrastructure for host read scrubbing Introduces deacon modules, host depletion subworkflow, config params, Pydantic model fields, CLI options, JSON schema, and publishDir entry. Not yet wired into the preprocessing workflow. --- .../scripts/validate_schema_completeness.py | 5 + conf/results.config | 9 ++ lib/py_nvd/cli/commands/run.py | 21 ++++ lib/py_nvd/models.py | 37 ++++++ modules/deacon.nf | 119 ++++++++++++++++++ nextflow.config | 9 ++ pixi.lock | 89 ++++++++++--- pyproject.toml | 1 + schemas/nvd-params.v2.5.0.schema.json | 35 ++++++ subworkflows/host_depletion.nf | 65 ++++++++++ 10 files changed, 374 insertions(+), 16 deletions(-) create mode 100644 modules/deacon.nf create mode 100644 subworkflows/host_depletion.nf diff --git a/.github/scripts/validate_schema_completeness.py b/.github/scripts/validate_schema_completeness.py index 62ee5d5..d55bccc 100644 --- a/.github/scripts/validate_schema_completeness.py +++ b/.github/scripts/validate_schema_completeness.py @@ -83,6 +83,11 @@ "state_dir", # Exposed via negated CLI flag --no-slack "slack_enabled", + # Deacon tuning (set via params-file or preset) + "deacon_kmer_size", + "deacon_window_size", + "deacon_abs_threshold", + "deacon_rel_threshold", } diff --git a/conf/results.config b/conf/results.config index a34dd28..924f408 100644 --- a/conf/results.config +++ b/conf/results.config @@ -47,6 +47,15 @@ params { // Assign the above paths to publish directories in processes throughout the pipeline process { + withName: 'DEACON_DEPLETE' { + publishDir = [ + path: { params.preprocess_results + "/00_host_depletion" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: true + ] + } + withName: 'EXTRACT_HUMAN_VIRUS_READS' { publishDir = [ path: { params.human_virus_reads }, diff --git a/lib/py_nvd/cli/commands/run.py b/lib/py_nvd/cli/commands/run.py index a7e97c5..a283856 100644 --- a/lib/py_nvd/cli/commands/run.py +++ b/lib/py_nvd/cli/commands/run.py @@ -364,6 +364,24 @@ def run( help="Remove host reads with STAT (requires --sra-human-db; default: follows --preprocess)", rich_help_panel=PANEL_PREPROCESSING, ), + deacon_index: Path | None = typer.Option( + None, + "--deacon-index", + help="Path to prebuilt deacon index (.idx file)", + rich_help_panel=PANEL_PREPROCESSING, + ), + deacon_index_url: str | None = typer.Option( + None, + "--deacon-index-url", + help="URL to download prebuilt deacon index (default: panhuman-1)", + rich_help_panel=PANEL_PREPROCESSING, + ), + deacon_contaminants_fasta: Path | None = typer.Option( + None, + "--deacon-contaminants-fasta", + help="Custom contaminant FASTA to union with base index", + rich_help_panel=PANEL_PREPROCESSING, + ), filter_reads: bool | None = typer.Option( None, "--filter-reads/--no-filter-reads", @@ -641,6 +659,9 @@ def run( "dedup_pos": dedup_pos, "trim_adapters": trim_adapters, "scrub_host_reads": scrub_host_reads, + "deacon_index": deacon_index, + "deacon_index_url": deacon_index_url, + "deacon_contaminants_fasta": deacon_contaminants_fasta, "filter_reads": filter_reads, "min_read_quality_illumina": min_read_quality_illumina, "min_read_quality_nanopore": min_read_quality_nanopore, diff --git a/lib/py_nvd/models.py b/lib/py_nvd/models.py index 447c89e..2d6ae53 100644 --- a/lib/py_nvd/models.py +++ b/lib/py_nvd/models.py @@ -988,6 +988,43 @@ class NvdParams(BaseModel): json_schema_extra={"category": "Preprocessing"}, ) + # Host scrubbing with deacon + deacon_index: Path | None = Field( + None, + description="Path to prebuilt deacon index (.idx file)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_index_url: str = Field( + 
"https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content", + description="URL to download prebuilt deacon index (default: panhuman-1)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_contaminants_fasta: Path | None = Field( + None, + description="Custom contaminant FASTA to union with base index", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_kmer_size: int = Field( + 31, + description="K-mer size for deacon index (must match index if prebuilt)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_window_size: int = Field( + 15, + description="Minimizer window size for deacon index", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_abs_threshold: int = Field( + 2, + description="Minimum absolute minimizer hits to classify as contaminant", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_rel_threshold: float = Field( + 0.01, + description="Minimum relative proportion of minimizers (0.0-1.0)", + json_schema_extra={"category": "Preprocessing"}, + ) + # ========================================================================= # Analysis Parameters # ========================================================================= diff --git a/modules/deacon.nf b/modules/deacon.nf new file mode 100644 index 0000000..32304b7 --- /dev/null +++ b/modules/deacon.nf @@ -0,0 +1,119 @@ +/* + * Deacon: Fast alignment-free decontamination + * https://github.com/bede/deacon + * + * Key features: + * - Preserves FASTQ headers (critical for read pairing) + * - Composable indexes via set algebra (union, diff, intersect) + * - SIMD-accelerated, ~5GB RAM for panhuman index + */ + +process DEACON_BUILD_INDEX { + /* + * Build a deacon index from FASTA file(s). + * Use this for custom contaminant sequences. 
+ */ + + tag "${fasta.simpleName}" + label "medium" + + input: + path fasta + + output: + path "*.idx", emit: index + + script: + def prefix = fasta.simpleName + """ + deacon index build \\ + --threads ${task.cpus} \\ + -k ${params.deacon_kmer_size} \\ + -w ${params.deacon_window_size} \\ + ${fasta} > ${prefix}.k${params.deacon_kmer_size}w${params.deacon_window_size}.idx + """ +} + +process DEACON_FETCH_INDEX { + /* + * Download a prebuilt deacon index from URL. + * Takes the URL as a channel value so the process only runs when + * the input channel is non-empty (no `when:` guard needed). + * Caches in work directory; use storeDir for persistent caching. The output name is the last `.idx` path segment of the URL (API-style URLs such as the Zenodo default end in `/content`), falling back to `deacon_index.idx` so the `*.idx` output glob always matches. + */ + + label "low" + + input: + val url + + output: + path "*.idx", emit: index + + script: + def filename = url.toString().tokenize('/').reverse().find { it.endsWith('.idx') } ?: 'deacon_index.idx' + """ + curl -fsSL "${url}" -o "${filename}" + """ +} + +process DEACON_UNION_INDEXES { + /* + * Combine multiple deacon indexes via set union. + * Only called when both a base index and custom index are present. + */ + + label "low" + + input: + path indexes // Collection of .idx files (always 2+) + + output: + path "combined.idx", emit: index + + script: + def idx_list = indexes.collect { it.name }.join(' ') + """ + deacon index union ${idx_list} > combined.idx + """ +} + +process DEACON_DEPLETE { + /* + * Remove contaminant reads using deacon filter in deplete mode. + * + * Critical: This preserves FASTQ headers verbatim, which is required + * for repair.sh to re-pair reads after filtering. SPAdes paired-end + * assembly depends on proper read pairing. + * + * Deacon natively handles gzipped input/output (since v0.13.0). + * When writing .gz output via --output, deacon splits --threads 1:1 + * between filtering and compression automatically. + */ + + tag "${sample_id}" + label "medium" + + errorStrategy { task.attempt < 3 ? 
'retry' : 'ignore' } + maxRetries 2 + + input: + tuple val(sample_id), val(platform), val(read_structure), path(reads), path(index) + + output: + tuple val(sample_id), val(platform), val(read_structure), path("${sample_id}.depleted.fastq.gz"), emit: reads + tuple val(sample_id), path("${sample_id}.deacon.json"), emit: stats + + script: + """ + deacon filter \\ + --deplete \\ + --threads ${task.cpus} \\ + --abs-threshold ${params.deacon_abs_threshold} \\ + --rel-threshold ${params.deacon_rel_threshold} \\ + --summary ${sample_id}.deacon.json \\ + --output ${sample_id}.depleted.fastq.gz \\ + ${index} \\ + ${reads} + """ +} diff --git a/nextflow.config b/nextflow.config index 5aa3891..f0c50bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -104,6 +104,15 @@ params { min_read_length = 50 max_read_length = null + // Host scrubbing with deacon (used when scrub_host_reads is enabled) + deacon_index = null + deacon_index_url = "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content" + deacon_contaminants_fasta = null + deacon_kmer_size = 31 + deacon_window_size = 15 + deacon_abs_threshold = 2 + deacon_rel_threshold = 0.01 + // NVD settings cutoff_percent = 0.001 entropy = 0.9 diff --git a/pixi.lock b/pixi.lock index 44cb84a..6dcbec5 100644 --- a/pixi.lock +++ b/pixi.lock @@ -41,6 +41,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.14.1-h332b0f4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/bioconda/linux-64/deacon-0.12.0-h4349ce8_0.conda - conda: https://conda.anaconda.org/bioconda/linux-64/entrez-direct-22.4-he881be0_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-64/fastp-1.0.1-heae3180_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 @@ -213,6 +214,7 @@ 
environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py313h536fd9c_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -258,7 +260,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -272,7 +274,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/1e/2f/60c51304fbdf47ce992d9eefa61fbd2c0e64feee60aaa439baf42ea6f40b/wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl @@ -308,6 +310,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/coreutils-9.5-h31becfc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/curl-8.14.1-h6702fde_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cxx-compiler-1.11.0-h7b35c40_0.conda + - conda: https://conda.anaconda.org/bioconda/linux-aarch64/deacon-0.12.0-hba26009_0.conda - conda: 
https://conda.anaconda.org/bioconda/linux-aarch64/entrez-direct-22.4-h8865c2f_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/fastp-1.0.1-h7dc49d2_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 @@ -480,6 +483,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-1.3.1-h86ecc28_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstandard-0.23.0-py313h31d5739_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-hbcf94c1_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -525,7 +529,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -539,7 +543,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/03/ce5256e66dd94e521ad5e753c78185c01b6eddbed3147be541f4d38c0cb7/wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl @@ -576,6 +580,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/osx-64/coreutils-9.5-h10d778d_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/curl-8.14.1-h5dec5d8_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cxx-compiler-1.11.0-h307afc9_0.conda + - conda: https://conda.anaconda.org/bioconda/osx-64/deacon-0.12.0-h00cbfe0_0.conda - conda: https://conda.anaconda.org/bioconda/osx-64/entrez-direct-22.4-h193322a_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/osx-64/fastp-1.0.1-h9ea9c2a_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-64/fortran-compiler-1.11.0-h9ab62e8_0.conda @@ -696,6 +701,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zlib-1.3.1-hd23fc13_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.23.0-py313h63b0ddb_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -741,7 +747,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl - pypi: 
https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -755,7 +761,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/f7/ca/3cf290212855b19af9fcc41b725b5620b32f470d6aad970c2593500817eb/wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl @@ -793,6 +799,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coreutils-9.5-h93a5062_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/curl-8.18.0-he38603e_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cxx-compiler-1.11.0-h88570a1_0.conda + - conda: https://conda.anaconda.org/bioconda/osx-arm64/deacon-0.13.2-hc12438c_1.conda - conda: https://conda.anaconda.org/bioconda/osx-arm64/entrez-direct-22.4-hd5f1084_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/osx-arm64/fastp-1.0.1-hee05c9d_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fortran-compiler-1.11.0-h81a4f41_0.conda @@ -918,6 +925,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zlib-1.3.1-h8359307_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -963,7 +971,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -977,7 +985,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b0/9a/d2faba7e61072a7507b5722db63562fdb22f5a24e237d460d18755627f15/wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl @@ -1049,6 +1057,11 @@ packages: purls: [] size: 585566 timestamp: 1718118473054 +- pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl + name: annotated-doc + version: 0.0.4 + sha256: 571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320 + requires_python: '>=3.8' - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl name: annotated-types version: 0.7.0 @@ -1383,6 +1396,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 148270198 timestamp: 1743181320604 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/blast-2.16.0-h6a93c2d_5.tar.bz2 @@ -1404,6 +1418,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 145178710 timestamp: 1743180765609 - conda: https://conda.anaconda.org/bioconda/osx-64/blast-2.16.0-h53185c9_5.tar.bz2 @@ -1424,6 +1439,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 168734931 timestamp: 1743189752491 - conda: https://conda.anaconda.org/bioconda/osx-arm64/blast-2.16.0-hb260f6e_5.tar.bz2 @@ -1444,6 +1460,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 157885023 timestamp: 1743182400505 - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py313h46c70d0_3.conda @@ -2426,6 +2443,44 @@ packages: purls: [] size: 6715 timestamp: 1753098739952 +- conda: https://conda.anaconda.org/bioconda/linux-64/deacon-0.12.0-h4349ce8_0.conda + sha256: bd3e60e3edd432285384936be007d6a6fcf6b9a486e3916e147d97cfcdb4295a + md5: 9ee4d381da4213ecff8260998df5dedf + 
constrains: + - __glibc >=2.17 + license: MIT + purls: [] + size: 1450249 + timestamp: 1760628460538 +- conda: https://conda.anaconda.org/bioconda/linux-aarch64/deacon-0.12.0-hba26009_0.conda + sha256: 0ae4549c93059a205b76436951b581785b5a3eb0c17adcb0896075d8268521fb + md5: d6ad20691a294922c987d59c1c77cc81 + constrains: + - __glibc >=2.17 + license: MIT + purls: [] + size: 1495925 + timestamp: 1760628026072 +- conda: https://conda.anaconda.org/bioconda/osx-64/deacon-0.12.0-h00cbfe0_0.conda + sha256: be450d2174d937ae8c0b619573938110153202d6b111dde13d4058112f48c830 + md5: 2a188468ca7b093692401a834c9719d9 + constrains: + - __osx >=10.13 + license: MIT + purls: [] + size: 941742 + timestamp: 1760629349144 +- conda: https://conda.anaconda.org/bioconda/osx-arm64/deacon-0.13.2-hc12438c_1.conda + sha256: e0a1e650be9816d132d8e8028b4b5a8c47fc9347de9a77389c25d25263d79d12 + md5: da5a53c437863c6dd705ff1c29b7ebce + depends: + - openssl >=3.6.0,<4.0a0 + constrains: + - __osx >=11.0 + license: MIT + purls: [] + size: 1325263 + timestamp: 1763749605148 - pypi: https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl name: docutils version: 0.22.4 @@ -6692,7 +6747,7 @@ packages: - pypi: ./ name: nvd version: 2.4.0 - sha256: e271dfa809ef9357fbb4bae15d6fbec954e2ed279c634c1e15b517edc148a1de + sha256: cff095653e9b8e1f6123c1bec891054c35eb9ef3f0ed91ae16756fa972cc6dff requires_dist: - biopython>=1.85 - blake3>=1.0.8 @@ -8587,6 +8642,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - ncurses >=6.5,<7.0a0 license: MIT + purls: [] size: 499281 timestamp: 1752528204243 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/samtools-1.22.1-h0b41a95_0.tar.bz2 @@ -8598,6 +8654,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - ncurses >=6.5,<7.0a0 license: MIT + purls: [] size: 572155 timestamp: 1752527954545 - conda: https://conda.anaconda.org/bioconda/osx-64/samtools-1.22.1-ha21ef43_0.tar.bz2 @@ -8788,10 +8845,10 @@ packages: - 
pkg:pypi/six?source=hash-mapping size: 18455 timestamp: 1753199211006 -- pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl +- pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl name: slack-sdk - version: 3.39.0 - sha256: b1556b2f5b8b12b94e5ea3f56c4f2c7f04462e4e1013d325c5764ff118044fa8 + version: 3.40.0 + sha256: f2bada5ed3adb10a01e154e90db01d6d8938d0461b5790c12bcb807b2d28bbe2 requires_dist: - aiodns>1.0 ; extra == 'optional' - aiohttp>=3.7.3,<4 ; extra == 'optional' @@ -9254,15 +9311,15 @@ packages: - pytest-mypy-testing ; extra == 'test' - pytest>=7.0,<8.2 ; extra == 'test' requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl +- pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl name: typer - version: 0.21.1 - sha256: 7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01 + version: 0.23.0 + sha256: 79f4bc262b6c37872091072a3cb7cb6d7d79ee98c0c658b4364bdcde3c42c913 requires_dist: - click>=8.0.0 - - typing-extensions>=3.7.4.3 - shellingham>=1.3.0 - rich>=10.11.0 + - annotated-doc>=0.0.2 requires_python: '>=3.9' - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl name: typing-extensions diff --git a/pyproject.toml b/pyproject.toml index 7fbd6fc..fbb02c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ unzip = ">=6.0,<7" yaml = ">=0.2.5,<0.3" zstd = ">=1.5.7,<2" nextflow = ">=25.10.2,<26" +deacon = ">=0.12.0,<0.14" # ----------------------------------------------------------------------------------- # diff --git 
a/schemas/nvd-params.v2.5.0.schema.json b/schemas/nvd-params.v2.5.0.schema.json index a164b22..69b3c8e 100644 --- a/schemas/nvd-params.v2.5.0.schema.json +++ b/schemas/nvd-params.v2.5.0.schema.json @@ -202,6 +202,41 @@ "minimum": 1, "description": "Maximum read length to retain (no limit if not specified)" }, + "deacon_index": { + "type": ["string", "null"], + "default": null, + "description": "Path to prebuilt deacon index (.idx file)" + }, + "deacon_index_url": { + "type": "string", + "default": "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content", + "description": "URL to download prebuilt deacon index (default: panhuman-1)" + }, + "deacon_contaminants_fasta": { + "type": ["string", "null"], + "default": null, + "description": "Custom contaminant FASTA to union with base index" + }, + "deacon_kmer_size": { + "type": "integer", + "default": 31, + "description": "K-mer size for deacon index (must match index if prebuilt)" + }, + "deacon_window_size": { + "type": "integer", + "default": 15, + "description": "Minimizer window size for deacon index" + }, + "deacon_abs_threshold": { + "type": "integer", + "default": 2, + "description": "Minimum absolute minimizer hits to classify as contaminant" + }, + "deacon_rel_threshold": { + "type": "number", + "default": 0.01, + "description": "Minimum relative proportion of minimizers (0.0-1.0)" + }, "cutoff_percent": { "type": "number", "default": 0.001, diff --git a/subworkflows/host_depletion.nf b/subworkflows/host_depletion.nf new file mode 100644 index 0000000..0ea408c --- /dev/null +++ b/subworkflows/host_depletion.nf @@ -0,0 +1,65 @@ +include { DEACON_BUILD_INDEX ; DEACON_FETCH_INDEX ; DEACON_UNION_INDEXES ; DEACON_DEPLETE } from "../modules/deacon" + +workflow HOST_DEPLETION { + /* + * Orchestrates deacon-based host/contaminant depletion. + * + * Index resolution priority: + * 1. params.deacon_index (explicit local path) + * 2. params.deacon_index_url (download prebuilt, e.g., panhuman-1) + * 3. 
params.deacon_contaminants_fasta (build from FASTA at runtime) + * + * If both a base index (local or URL) and deacon_contaminants_fasta are provided, + * the indexes are unioned to combine panhuman with custom contaminants. + * + * Uses declarative channel ternaries so the DAG is consistent across runs, + * matching the pattern in preprocess_reads.nf. + */ + + take: + ch_reads // tuple(sample_id, platform, read_structure, reads) + + main: + // --- Resolve base index --- + // Explicit local path takes priority over URL download. + // Both produce a channel of one .idx file; unused path emits nothing. + ch_local_index = params.deacon_index + ? Channel.fromPath(params.deacon_index) + : Channel.empty() + + ch_fetch_url = (!params.deacon_index && params.deacon_index_url) + ? Channel.of(params.deacon_index_url) + : Channel.empty() + + DEACON_FETCH_INDEX(ch_fetch_url) + + ch_base_index = ch_local_index.mix(DEACON_FETCH_INDEX.out.index) + + // --- Build custom index from FASTA if provided --- + ch_custom_fasta = params.deacon_contaminants_fasta + ? Channel.fromPath(params.deacon_contaminants_fasta) + : Channel.empty() + + DEACON_BUILD_INDEX(ch_custom_fasta) + ch_custom_index = DEACON_BUILD_INDEX.out.index + + // --- Combine indexes --- + // Determine at parse time whether we need to union multiple indexes. + // Both branches produce a value channel of one path (.idx file). + def needs_union = params.deacon_contaminants_fasta && (params.deacon_index || params.deacon_index_url) + + DEACON_UNION_INDEXES( + ch_base_index.mix(ch_custom_index).collect() + ) + + ch_index = needs_union + ? 
DEACON_UNION_INDEXES.out.index.first() + : ch_base_index.mix(ch_custom_index).first() + + // --- Run depletion --- + DEACON_DEPLETE(ch_reads.combine(ch_index)) + + emit: + reads = DEACON_DEPLETE.out.reads + stats = DEACON_DEPLETE.out.stats +} From 44d0885976084e94f09ab0e6f2a7598439e9f87c Mon Sep 17 00:00:00 2001 From: Nick Minor Date: Thu, 12 Feb 2026 08:02:52 -0600 Subject: [PATCH 2/2] Wire deacon host scrubbing into preprocessing workflow Replaces STAT-based SCRUB_HOST_READS with deacon-based HOST_DEPLETION in preprocess_reads.nf. The scrub_host_reads param continues to gate the step; deacon is used when any deacon index config is available. The host_depletion subworkflow uses declarative channel ternaries and empty-channel gating rather than procedural if/else blocks. This ensures Nextflow constructs the same DAG structure regardless of which params are set, which matters for -resume cache consistency and makes the dataflow easier to reason about. Processes that aren't needed receive Channel.empty() inputs and simply don't execute, rather than being conditionally excluded from the DAG. Index union routing is determined at parse time from params via a def, avoiding runtime .branch/.size()/.map gymnastics that are fragile in Groovy's type system. 
--- lib/py_nvd/_fingerprint.json | 2 +- workflows/preprocess_reads.nf | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/py_nvd/_fingerprint.json b/lib/py_nvd/_fingerprint.json index f1db982..15234d4 100644 --- a/lib/py_nvd/_fingerprint.json +++ b/lib/py_nvd/_fingerprint.json @@ -1,4 +1,4 @@ { "main.nf": "d3df999c77a6754811017c07fa446c551d8334e72822a6df1c5cdcacb4715ebb", - "nextflow.config": "0b83a6d10e66f13e1821df96ec76849df034bb97c7758993a0c54cdaa3a38166" + "nextflow.config": "0fd69247b50f0351ea0e9e9942425e5135d2bc0b1fbfccedfb02a84868c5992b" } diff --git a/workflows/preprocess_reads.nf b/workflows/preprocess_reads.nf index d63ab0b..b97b274 100644 --- a/workflows/preprocess_reads.nf +++ b/workflows/preprocess_reads.nf @@ -1,5 +1,5 @@ include { DEDUP_WITH_CLUMPIFY ; TRIM_ADAPTERS ; FILTER_READS ; REPAIR_PAIRS } from "../modules/bbmap" -include { SCRUB_HOST_READS } from "../modules/stat" +include { HOST_DEPLETION } from "../subworkflows/host_depletion" workflow PREPROCESS_READS { take: @@ -29,13 +29,12 @@ workflow PREPROCESS_READS { ch_after_trim = ch_trimmed_illumina.mix(ch_branched_for_trim.other) - // 3. Host scrub (requires sra_human_db to be set) - ch_human_db = params.sra_human_db - ? Channel.fromPath(params.sra_human_db) - : Channel.empty() + // 3. Host scrub with deacon + // Requires at least one of: deacon_index, deacon_index_url, deacon_contaminants_fasta + def has_deacon_config = params.deacon_index || params.deacon_index_url || params.deacon_contaminants_fasta - ch_after_scrub = (should_scrub && params.sra_human_db) - ? SCRUB_HOST_READS(ch_after_trim.combine(ch_human_db)) + ch_after_scrub = (should_scrub && has_deacon_config) + ? HOST_DEPLETION(ch_after_trim).reads : ch_after_trim // 4. Quality/length filter (with platform-specific quality threshold) @@ -51,6 +50,7 @@ workflow PREPROCESS_READS { : ch_after_scrub // 5. 
Repair pairs (interleaved reads only) - fixes orphans from upstream steps + // Note: This works because deacon preserves CASAVA FASTQ headers ch_branched_for_repair = ch_after_filter.branch { _id, _platform, read_structure, _reads -> interleaved: read_structure == "interleaved" other: true