diff --git a/.github/scripts/validate_schema_completeness.py b/.github/scripts/validate_schema_completeness.py index 62ee5d5..d55bccc 100644 --- a/.github/scripts/validate_schema_completeness.py +++ b/.github/scripts/validate_schema_completeness.py @@ -83,6 +83,11 @@ "state_dir", # Exposed via negated CLI flag --no-slack "slack_enabled", + # Deacon tuning (set via params-file or preset) + "deacon_kmer_size", + "deacon_window_size", + "deacon_abs_threshold", + "deacon_rel_threshold", } diff --git a/conf/results.config b/conf/results.config index a34dd28..924f408 100644 --- a/conf/results.config +++ b/conf/results.config @@ -47,6 +47,15 @@ params { // Assign the above paths to publish directories in processes throughout the pipeline process { + withName: 'DEACON_DEPLETE' { + publishDir = [ + path: { params.preprocess_results + "/00_host_depletion" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + withName: 'EXTRACT_HUMAN_VIRUS_READS' { publishDir = [ path: { params.human_virus_reads }, diff --git a/lib/py_nvd/_fingerprint.json b/lib/py_nvd/_fingerprint.json index f1db982..15234d4 100644 --- a/lib/py_nvd/_fingerprint.json +++ b/lib/py_nvd/_fingerprint.json @@ -1,4 +1,4 @@ { "main.nf": "d3df999c77a6754811017c07fa446c551d8334e72822a6df1c5cdcacb4715ebb", - "nextflow.config": "0b83a6d10e66f13e1821df96ec76849df034bb97c7758993a0c54cdaa3a38166" + "nextflow.config": "0fd69247b50f0351ea0e9e9942425e5135d2bc0b1fbfccedfb02a84868c5992b" } diff --git a/lib/py_nvd/cli/commands/run.py b/lib/py_nvd/cli/commands/run.py index a7e97c5..a283856 100644 --- a/lib/py_nvd/cli/commands/run.py +++ b/lib/py_nvd/cli/commands/run.py @@ -364,6 +364,24 @@ def run( help="Remove host reads with STAT (requires --sra-human-db; default: follows --preprocess)", rich_help_panel=PANEL_PREPROCESSING, ), + deacon_index: Path | None = typer.Option( + None, + "--deacon-index", + help="Path to prebuilt deacon index (.idx file)", + 
rich_help_panel=PANEL_PREPROCESSING, + ), + deacon_index_url: str | None = typer.Option( + None, + "--deacon-index-url", + help="URL to download prebuilt deacon index (default: panhuman-1)", + rich_help_panel=PANEL_PREPROCESSING, + ), + deacon_contaminants_fasta: Path | None = typer.Option( + None, + "--deacon-contaminants-fasta", + help="Custom contaminant FASTA to union with base index", + rich_help_panel=PANEL_PREPROCESSING, + ), filter_reads: bool | None = typer.Option( None, "--filter-reads/--no-filter-reads", @@ -641,6 +659,9 @@ def run( "dedup_pos": dedup_pos, "trim_adapters": trim_adapters, "scrub_host_reads": scrub_host_reads, + "deacon_index": deacon_index, + "deacon_index_url": deacon_index_url, + "deacon_contaminants_fasta": deacon_contaminants_fasta, "filter_reads": filter_reads, "min_read_quality_illumina": min_read_quality_illumina, "min_read_quality_nanopore": min_read_quality_nanopore, diff --git a/lib/py_nvd/models.py b/lib/py_nvd/models.py index 447c89e..2d6ae53 100644 --- a/lib/py_nvd/models.py +++ b/lib/py_nvd/models.py @@ -988,6 +988,43 @@ class NvdParams(BaseModel): json_schema_extra={"category": "Preprocessing"}, ) + # Host scrubbing with deacon + deacon_index: Path | None = Field( + None, + description="Path to prebuilt deacon index (.idx file)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_index_url: str = Field( + "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content", + description="URL to download prebuilt deacon index (default: panhuman-1)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_contaminants_fasta: Path | None = Field( + None, + description="Custom contaminant FASTA to union with base index", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_kmer_size: int = Field( + 31, + description="K-mer size for deacon index (must match index if prebuilt)", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_window_size: int = Field( + 15, + 
description="Minimizer window size for deacon index", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_abs_threshold: int = Field( + 2, + description="Minimum absolute minimizer hits to classify as contaminant", + json_schema_extra={"category": "Preprocessing"}, + ) + deacon_rel_threshold: float = Field( + 0.01, + description="Minimum relative proportion of minimizers (0.0-1.0)", + json_schema_extra={"category": "Preprocessing"}, + ) + # ========================================================================= # Analysis Parameters # ========================================================================= diff --git a/modules/deacon.nf b/modules/deacon.nf new file mode 100644 index 0000000..32304b7 --- /dev/null +++ b/modules/deacon.nf @@ -0,0 +1,119 @@ +/* + * Deacon: Fast alignment-free decontamination + * https://github.com/bede/deacon + * + * Key features: + * - Preserves FASTQ headers (critical for read pairing) + * - Composable indexes via set algebra (union, diff, intersect) + * - SIMD-accelerated, ~5GB RAM for panhuman index + */ + +process DEACON_BUILD_INDEX { + /* + * Build a deacon index from FASTA file(s). + * Use this for custom contaminant sequences. + */ + + tag "${fasta.simpleName}" + label "medium" + + input: + path fasta + + output: + path "*.idx", emit: index + + script: + def prefix = fasta.simpleName + """ + deacon index build \\ + --threads ${task.cpus} \\ + -k ${params.deacon_kmer_size} \\ + -w ${params.deacon_window_size} \\ + ${fasta} > ${prefix}.k${params.deacon_kmer_size}w${params.deacon_window_size}.idx + """ +} + +process DEACON_FETCH_INDEX { + /* + * Download a prebuilt deacon index from URL. + * Takes the URL as a channel value so the process only runs when + * the input channel is non-empty (no `when:` guard needed). + * Caches in work directory; use storeDir for persistent caching. 
+ */ + + label "low" + + input: + val url + + output: + path "*.idx", emit: index + + script: + def filename = url.tokenize('/?#').reverse().find { it.endsWith('.idx') } ?: 'deacon_index.idx' // default URL ends in /content, so take the last .idx path segment (fallback name otherwise) to satisfy the "*.idx" output glob + """ + curl -fsSL "${url}" -o "${filename}" + """ +} + +process DEACON_UNION_INDEXES { + /* + * Combine multiple deacon indexes via set union. + * Only called when both a base index and custom index are present. + */ + + label "low" + + input: + path indexes // Collection of .idx files (always 2+) + + output: + path "combined.idx", emit: index + + script: + def idx_list = indexes.collect { it.name }.join(' ') + """ + deacon index union ${idx_list} > combined.idx + """ +} + +process DEACON_DEPLETE { + /* + * Remove contaminant reads using deacon filter in deplete mode. + * + * Critical: This preserves FASTQ headers verbatim, which is required + * for repair.sh to re-pair reads after filtering. SPAdes paired-end + * assembly depends on proper read pairing. + * + * Deacon natively handles gzipped input/output (since v0.13.0). NOTE(review): pixi.lock in this change resolves deacon 0.12.0 on linux-64, linux-aarch64 and osx-64 - confirm .gz output works there or raise the lower bound. + * When writing .gz output via --output, deacon splits --threads 1:1 + * between filtering and compression automatically. + */ + + tag "${sample_id}" + label "medium" + + errorStrategy { task.attempt < 3 ?
'retry' : 'ignore' } + maxRetries 2 + + input: + tuple val(sample_id), val(platform), val(read_structure), path(reads), path(index) + + output: + tuple val(sample_id), val(platform), val(read_structure), path("${sample_id}.depleted.fastq.gz"), emit: reads + tuple val(sample_id), path("${sample_id}.deacon.json"), emit: stats + + script: + """ + deacon filter \\ + --deplete \\ + --threads ${task.cpus} \\ + --abs-threshold ${params.deacon_abs_threshold} \\ + --rel-threshold ${params.deacon_rel_threshold} \\ + --summary ${sample_id}.deacon.json \\ + --output ${sample_id}.depleted.fastq.gz \\ + ${index} \\ + ${reads} + """ +} diff --git a/nextflow.config b/nextflow.config index 5aa3891..f0c50bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -104,6 +104,15 @@ params { min_read_length = 50 max_read_length = null + // Host scrubbing with deacon (used when scrub_host_reads is enabled) + deacon_index = null + deacon_index_url = "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content" + deacon_contaminants_fasta = null + deacon_kmer_size = 31 + deacon_window_size = 15 + deacon_abs_threshold = 2 + deacon_rel_threshold = 0.01 + // NVD settings cutoff_percent = 0.001 entropy = 0.9 diff --git a/pixi.lock b/pixi.lock index 44cb84a..6dcbec5 100644 --- a/pixi.lock +++ b/pixi.lock @@ -41,6 +41,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.14.1-h332b0f4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda + - conda: https://conda.anaconda.org/bioconda/linux-64/deacon-0.12.0-h4349ce8_0.conda - conda: https://conda.anaconda.org/bioconda/linux-64/entrez-direct-22.4-he881be0_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-64/fastp-1.0.1-heae3180_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 @@ -213,6 +214,7 @@ 
environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py313h536fd9c_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -258,7 +260,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -272,7 +274,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/1e/2f/60c51304fbdf47ce992d9eefa61fbd2c0e64feee60aaa439baf42ea6f40b/wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl @@ -308,6 +310,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/coreutils-9.5-h31becfc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/curl-8.14.1-h6702fde_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cxx-compiler-1.11.0-h7b35c40_0.conda + - conda: https://conda.anaconda.org/bioconda/linux-aarch64/deacon-0.12.0-hba26009_0.conda - conda: 
https://conda.anaconda.org/bioconda/linux-aarch64/entrez-direct-22.4-h8865c2f_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/fastp-1.0.1-h7dc49d2_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 @@ -480,6 +483,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-1.3.1-h86ecc28_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstandard-0.23.0-py313h31d5739_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-hbcf94c1_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -525,7 +529,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -539,7 +543,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/03/ce5256e66dd94e521ad5e753c78185c01b6eddbed3147be541f4d38c0cb7/wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl @@ -576,6 +580,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/osx-64/coreutils-9.5-h10d778d_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/curl-8.14.1-h5dec5d8_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/cxx-compiler-1.11.0-h307afc9_0.conda + - conda: https://conda.anaconda.org/bioconda/osx-64/deacon-0.12.0-h00cbfe0_0.conda - conda: https://conda.anaconda.org/bioconda/osx-64/entrez-direct-22.4-h193322a_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/osx-64/fastp-1.0.1-h9ea9c2a_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-64/fortran-compiler-1.11.0-h9ab62e8_0.conda @@ -696,6 +701,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zlib-1.3.1-hd23fc13_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.23.0-py313h63b0ddb_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -741,7 +747,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl - pypi: 
https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -755,7 +761,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/f7/ca/3cf290212855b19af9fcc41b725b5620b32f470d6aad970c2593500817eb/wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl @@ -793,6 +799,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/coreutils-9.5-h93a5062_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/curl-8.18.0-he38603e_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/cxx-compiler-1.11.0-h88570a1_0.conda + - conda: https://conda.anaconda.org/bioconda/osx-arm64/deacon-0.13.2-hc12438c_1.conda - conda: https://conda.anaconda.org/bioconda/osx-arm64/entrez-direct-22.4-hd5f1084_0.tar.bz2 - conda: https://conda.anaconda.org/bioconda/osx-arm64/fastp-1.0.1-hee05c9d_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/osx-arm64/fortran-compiler-1.11.0-h81a4f41_0.conda @@ -918,6 +925,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zlib-1.3.1-h8359307_2.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda + - pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b3/66/e6c0a808950ba5a4042e2fcedd577fc7401536c7db063de4d7c36be06f84/argparse_dataclass-2.0.0-py3-none-any.whl @@ -963,7 +971,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ad/95/bc978be7ea0babf2fb48a414b6afaad414c6a9e8b1eafc5b8a53c030381a/smart_open-7.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/2b/0c/adf4310b15ebcbbbe36d316808d48142424f904422fef1fecca3d6e76c8a/snakemake-9.16.3-py3-none-any.whl @@ -977,7 +985,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/84/98/06ba4db905613fa24d227bc81219b7b2e6d48f803eac407d86f2ca54991d/taxopy-0.14.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/66/70/42d8796acc57c8bcd9ae395b1a6a0bbc833f738492a8ed192a44ccd58035/throttler-1.2.3-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/b0/9a/d2faba7e61072a7507b5722db63562fdb22f5a24e237d460d18755627f15/wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl @@ -1049,6 +1057,11 @@ packages: purls: [] size: 585566 timestamp: 1718118473054 +- pypi: https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl + name: annotated-doc + version: 0.0.4 + sha256: 571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320 + requires_python: '>=3.8' - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl name: annotated-types version: 0.7.0 @@ -1383,6 +1396,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 148270198 timestamp: 1743181320604 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/blast-2.16.0-h6a93c2d_5.tar.bz2 @@ -1404,6 +1418,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 145178710 timestamp: 1743180765609 - conda: https://conda.anaconda.org/bioconda/osx-64/blast-2.16.0-h53185c9_5.tar.bz2 @@ -1424,6 +1439,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 168734931 timestamp: 1743189752491 - conda: https://conda.anaconda.org/bioconda/osx-arm64/blast-2.16.0-hb260f6e_5.tar.bz2 @@ -1444,6 +1460,7 @@ packages: - rpsbproc - zlib license: NCBI-PD + purls: [] size: 157885023 timestamp: 1743182400505 - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py313h46c70d0_3.conda @@ -2426,6 +2443,44 @@ packages: purls: [] size: 6715 timestamp: 1753098739952 +- conda: https://conda.anaconda.org/bioconda/linux-64/deacon-0.12.0-h4349ce8_0.conda + sha256: bd3e60e3edd432285384936be007d6a6fcf6b9a486e3916e147d97cfcdb4295a + md5: 9ee4d381da4213ecff8260998df5dedf + 
constrains: + - __glibc >=2.17 + license: MIT + purls: [] + size: 1450249 + timestamp: 1760628460538 +- conda: https://conda.anaconda.org/bioconda/linux-aarch64/deacon-0.12.0-hba26009_0.conda + sha256: 0ae4549c93059a205b76436951b581785b5a3eb0c17adcb0896075d8268521fb + md5: d6ad20691a294922c987d59c1c77cc81 + constrains: + - __glibc >=2.17 + license: MIT + purls: [] + size: 1495925 + timestamp: 1760628026072 +- conda: https://conda.anaconda.org/bioconda/osx-64/deacon-0.12.0-h00cbfe0_0.conda + sha256: be450d2174d937ae8c0b619573938110153202d6b111dde13d4058112f48c830 + md5: 2a188468ca7b093692401a834c9719d9 + constrains: + - __osx >=10.13 + license: MIT + purls: [] + size: 941742 + timestamp: 1760629349144 +- conda: https://conda.anaconda.org/bioconda/osx-arm64/deacon-0.13.2-hc12438c_1.conda + sha256: e0a1e650be9816d132d8e8028b4b5a8c47fc9347de9a77389c25d25263d79d12 + md5: da5a53c437863c6dd705ff1c29b7ebce + depends: + - openssl >=3.6.0,<4.0a0 + constrains: + - __osx >=11.0 + license: MIT + purls: [] + size: 1325263 + timestamp: 1763749605148 - pypi: https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl name: docutils version: 0.22.4 @@ -6692,7 +6747,7 @@ packages: - pypi: ./ name: nvd version: 2.4.0 - sha256: e271dfa809ef9357fbb4bae15d6fbec954e2ed279c634c1e15b517edc148a1de + sha256: cff095653e9b8e1f6123c1bec891054c35eb9ef3f0ed91ae16756fa972cc6dff requires_dist: - biopython>=1.85 - blake3>=1.0.8 @@ -8587,6 +8642,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - ncurses >=6.5,<7.0a0 license: MIT + purls: [] size: 499281 timestamp: 1752528204243 - conda: https://conda.anaconda.org/bioconda/linux-aarch64/samtools-1.22.1-h0b41a95_0.tar.bz2 @@ -8598,6 +8654,7 @@ packages: - libzlib >=1.3.1,<2.0a0 - ncurses >=6.5,<7.0a0 license: MIT + purls: [] size: 572155 timestamp: 1752527954545 - conda: https://conda.anaconda.org/bioconda/osx-64/samtools-1.22.1-ha21ef43_0.tar.bz2 @@ -8788,10 +8845,10 @@ packages: - 
pkg:pypi/six?source=hash-mapping size: 18455 timestamp: 1753199211006 -- pypi: https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl +- pypi: https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl name: slack-sdk - version: 3.39.0 - sha256: b1556b2f5b8b12b94e5ea3f56c4f2c7f04462e4e1013d325c5764ff118044fa8 + version: 3.40.0 + sha256: f2bada5ed3adb10a01e154e90db01d6d8938d0461b5790c12bcb807b2d28bbe2 requires_dist: - aiodns>1.0 ; extra == 'optional' - aiohttp>=3.7.3,<4 ; extra == 'optional' @@ -9254,15 +9311,15 @@ packages: - pytest-mypy-testing ; extra == 'test' - pytest>=7.0,<8.2 ; extra == 'test' requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl +- pypi: https://files.pythonhosted.org/packages/7a/ed/d6fca788b51d0d4640c4bc82d0e85bad4b49809bca36bf4af01b4dcb66a7/typer-0.23.0-py3-none-any.whl name: typer - version: 0.21.1 - sha256: 7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01 + version: 0.23.0 + sha256: 79f4bc262b6c37872091072a3cb7cb6d7d79ee98c0c658b4364bdcde3c42c913 requires_dist: - click>=8.0.0 - - typing-extensions>=3.7.4.3 - shellingham>=1.3.0 - rich>=10.11.0 + - annotated-doc>=0.0.2 requires_python: '>=3.9' - pypi: https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl name: typing-extensions diff --git a/pyproject.toml b/pyproject.toml index 7fbd6fc..fbb02c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ unzip = ">=6.0,<7" yaml = ">=0.2.5,<0.3" zstd = ">=1.5.7,<2" nextflow = ">=25.10.2,<26" +deacon = ">=0.12.0,<0.14" # ----------------------------------------------------------------------------------- # diff --git 
a/schemas/nvd-params.v2.5.0.schema.json b/schemas/nvd-params.v2.5.0.schema.json index a164b22..69b3c8e 100644 --- a/schemas/nvd-params.v2.5.0.schema.json +++ b/schemas/nvd-params.v2.5.0.schema.json @@ -202,6 +202,41 @@ "minimum": 1, "description": "Maximum read length to retain (no limit if not specified)" }, + "deacon_index": { + "type": ["string", "null"], + "default": null, + "description": "Path to prebuilt deacon index (.idx file)" + }, + "deacon_index_url": { + "type": "string", + "default": "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content", + "description": "URL to download prebuilt deacon index (default: panhuman-1)" + }, + "deacon_contaminants_fasta": { + "type": ["string", "null"], + "default": null, + "description": "Custom contaminant FASTA to union with base index" + }, + "deacon_kmer_size": { + "type": "integer", + "default": 31, + "description": "K-mer size for deacon index (must match index if prebuilt)" + }, + "deacon_window_size": { + "type": "integer", + "default": 15, + "description": "Minimizer window size for deacon index" + }, + "deacon_abs_threshold": { + "type": "integer", + "default": 2, + "description": "Minimum absolute minimizer hits to classify as contaminant" + }, + "deacon_rel_threshold": { + "type": "number", + "default": 0.01, + "description": "Minimum relative proportion of minimizers (0.0-1.0)" + }, "cutoff_percent": { "type": "number", "default": 0.001, diff --git a/subworkflows/host_depletion.nf b/subworkflows/host_depletion.nf new file mode 100644 index 0000000..0ea408c --- /dev/null +++ b/subworkflows/host_depletion.nf @@ -0,0 +1,65 @@ +include { DEACON_BUILD_INDEX ; DEACON_FETCH_INDEX ; DEACON_UNION_INDEXES ; DEACON_DEPLETE } from "../modules/deacon" + +workflow HOST_DEPLETION { + /* + * Orchestrates deacon-based host/contaminant depletion. + * + * Index resolution priority: + * 1. params.deacon_index (explicit local path) + * 2. params.deacon_index_url (download prebuilt, e.g., panhuman-1) + * 3. 
params.deacon_contaminants_fasta (build from FASTA at runtime) + * + * If both a base index (local or URL) and deacon_contaminants_fasta are provided, + * the indexes are unioned to combine panhuman with custom contaminants. + * + * Uses declarative channel ternaries so the DAG is consistent across runs, + * matching the pattern in preprocess_reads.nf. + */ + + take: + ch_reads // tuple(sample_id, platform, read_structure, reads) + + main: + // --- Resolve base index --- + // Explicit local path takes priority over URL download. + // Both produce a channel of one .idx file; unused path emits nothing. + ch_local_index = params.deacon_index + ? Channel.fromPath(params.deacon_index, checkIfExists: true) + : Channel.empty() + + ch_fetch_url = (!params.deacon_index && params.deacon_index_url) + ? Channel.of(params.deacon_index_url) + : Channel.empty() + + DEACON_FETCH_INDEX(ch_fetch_url) + + ch_base_index = ch_local_index.mix(DEACON_FETCH_INDEX.out.index) + + // --- Build custom index from FASTA if provided --- + ch_custom_fasta = params.deacon_contaminants_fasta + ? Channel.fromPath(params.deacon_contaminants_fasta, checkIfExists: true) + : Channel.empty() + + DEACON_BUILD_INDEX(ch_custom_fasta) + ch_custom_index = DEACON_BUILD_INDEX.out.index + + // --- Combine indexes --- + // Determine at parse time whether we need to union multiple indexes. + // Both branches produce a value channel of one path (.idx file). + def needs_union = params.deacon_contaminants_fasta && (params.deacon_index || params.deacon_index_url) + + DEACON_UNION_INDEXES( + needs_union ? ch_base_index.mix(ch_custom_index).collect() : Channel.empty() // empty input => union task is skipped when only one index exists + ) + + ch_index = needs_union + ?
DEACON_UNION_INDEXES.out.index.first() + : ch_base_index.mix(ch_custom_index).first() + + // --- Run depletion --- + DEACON_DEPLETE(ch_reads.combine(ch_index)) + + emit: + reads = DEACON_DEPLETE.out.reads + stats = DEACON_DEPLETE.out.stats +} diff --git a/workflows/preprocess_reads.nf b/workflows/preprocess_reads.nf index d63ab0b..b97b274 100644 --- a/workflows/preprocess_reads.nf +++ b/workflows/preprocess_reads.nf @@ -1,5 +1,5 @@ include { DEDUP_WITH_CLUMPIFY ; TRIM_ADAPTERS ; FILTER_READS ; REPAIR_PAIRS } from "../modules/bbmap" -include { SCRUB_HOST_READS } from "../modules/stat" +include { HOST_DEPLETION } from "../subworkflows/host_depletion" workflow PREPROCESS_READS { take: @@ -29,13 +29,12 @@ workflow PREPROCESS_READS { ch_after_trim = ch_trimmed_illumina.mix(ch_branched_for_trim.other) - // 3. Host scrub (requires sra_human_db to be set) - ch_human_db = params.sra_human_db - ? Channel.fromPath(params.sra_human_db) - : Channel.empty() + // 3. Host scrub with deacon + // Requires at least one of: deacon_index, deacon_index_url, deacon_contaminants_fasta + def has_deacon_config = params.deacon_index || params.deacon_index_url || params.deacon_contaminants_fasta - ch_after_scrub = (should_scrub && params.sra_human_db) - ? SCRUB_HOST_READS(ch_after_trim.combine(ch_human_db)) + ch_after_scrub = (should_scrub && has_deacon_config) + ? HOST_DEPLETION(ch_after_trim).reads : ch_after_trim // 4. Quality/length filter (with platform-specific quality threshold) @@ -51,6 +50,7 @@ workflow PREPROCESS_READS { : ch_after_scrub // 5. Repair pairs (interleaved reads only) - fixes orphans from upstream steps + // Note: This works because deacon preserves CASAVA FASTQ headers ch_branched_for_repair = ch_after_filter.branch { _id, _platform, read_structure, _reads -> interleaved: read_structure == "interleaved" other: true