diff --git a/conf/modules.config b/conf/modules.config index 369b900d2..40a6398c4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -70,6 +70,10 @@ process { ] } + withName: GTFFILTER { + ext.suffix = "filtered.gtf" + } + withName: SEQKIT_SPLIT { ext.args = "-i --by-id-prefix \"\"" publishDir = [ diff --git a/modules.json b/modules.json index 3792423c3..8a2d096a1 100644 --- a/modules.json +++ b/modules.json @@ -80,6 +80,11 @@ "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", "installed_by": ["modules"] }, + "custom/gtffilter": { + "branch": "master", + "git_sha": "a0aee18374b7f072aa0f89f4d66f5a3a9f8176d2", + "installed_by": ["modules"] + }, "custom/tx2gene": { "branch": "master", "git_sha": "ec155021a9104441bf6a9bae3b55d1b5b0bfdb3a", diff --git a/modules/nf-core/custom/gtffilter/environment.yml b/modules/nf-core/custom/gtffilter/environment.yml new file mode 100644 index 000000000..115f41235 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_gtffilter" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "conda-forge::python=3.9.5" diff --git a/modules/nf-core/custom/gtffilter/main.nf b/modules/nf-core/custom/gtffilter/main.nf new file mode 100644 index 000000000..b682ff8c5 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/main.nf @@ -0,0 +1,37 @@ +process CUSTOM_GTFFILTER { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(gtf) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "gtf" + (gtf.extension == 'gz' ? '.gz' : '') + template 'gtffilter.py' + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "gtf" + (gtf.extension == 'gz' ? '.gz' : '') + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/gtffilter/meta.yml b/modules/nf-core/custom/gtffilter/meta.yml new file mode 100644 index 000000000..2c8692218 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/meta.yml @@ -0,0 +1,51 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_gtffilter" +description: Filter a gtf file to keep only regions that are located on a chromosome represented in a given fasta file +keywords: + - gtf + - fasta + - filter +tools: + - "gtffilter": + description: "Filter a gtf file to keep only regions that are located on a chromosome represented in a given fasta file" + tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/gtffilter/main.nf" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - gtf: + type: file + description: GTF file + pattern: "*.{gtf}" + + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fasta,fa}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - gtf: + type: file + description: Filtered GTF file + pattern: "*.{gtf}" + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@nictru" +maintainers: + - "@nictru" diff --git a/modules/nf-core/custom/gtffilter/templates/gtffilter.py b/modules/nf-core/custom/gtffilter/templates/gtffilter.py new file mode 100644 index 000000000..764ec2eff --- /dev/null +++ b/modules/nf-core/custom/gtffilter/templates/gtffilter.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning and Nico Trummer. + +# MIT License + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import logging +import re +import gzip +import statistics +import platform +from typing import Set + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger("fasta_gtf_filter") +logger.setLevel(logging.INFO) + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + +def extract_fasta_seq_names(fasta_name: str) -> Set[str]: + """Extracts the sequence names from a FASTA file.""" + + is_gz = fasta_name.endswith(".gz") + open_fn = gzip.open if is_gz else open + + with open_fn(fasta_name) as fasta: + sequences = set() + for line in fasta: + line = line.decode("utf-8") if is_gz else line + if line.startswith(">"): + sequences.add(line[1:].split(None, 1)[0]) + + return sequences + + +def tab_delimited(file: str) -> float: + """Check if file is tab-delimited and return median number of tabs.""" + with open(file, "r") as f: + data = f.read(102400) + return statistics.median(line.count("\\t") for line in data.split("\\n")) + + +def filter_gtf( + fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool +) -> None: + """Filter GTF file based on FASTA sequence names.""" + if tab_delimited(gtf_in) != 8: + raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") + + seq_names_in_genome = extract_fasta_seq_names(fasta) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug( + "All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)) + ) + + seq_names_in_gtf = set() + try: + is_gz = gtf_in.endswith(".gz") + open_fn = gzip.open if is_gz else open + with open_fn(gtf_in) as gtf, open_fn(filtered_gtf_out, "wb" if is_gz else "w") as out: + line_count = 0 + for line in gtf: + line = line.decode("utf-8") if is_gz else line + seq_name = line.split("\\t")[0] + seq_names_in_gtf.add(seq_name) # Add sequence name to the set + + if seq_name in seq_names_in_genome: + if skip_transcript_id_check or re.search( + r'transcript_id "([^"]+)"', line + ): + out.write(line.encode() if is_gz else line) + line_count += 1 + + if line_count == 0: + raise ValueError("All GTF lines removed by filters") + + except IOError as e: + logger.error(f"File operation failed: {e}") + return + + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info( + f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}" + ) + + +filter_gtf("${fasta}", "${gtf}", "${prefix}.${suffix}", False) + +# Versions + +versions = {"${task.process}": {"python": platform.python_version()}} + +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/nf-core/custom/gtffilter/tests/main.nf.test b/modules/nf-core/custom/gtffilter/tests/main.nf.test new file mode 100644 index 000000000..252d11a16 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/main.nf.test @@ -0,0 +1,115 @@ +nextflow_process { + + name "Test Process CUSTOM_GTFFILTER" + script "../main.nf" + process "CUSTOM_GTFFILTER" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/gtffilter" + + test("test_custom_gtffilter") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_custom_gtffilter_gzip") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_custom_gtffilter - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_custom_gtffilter_gzip - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap b/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap new file mode 100644 index 000000000..787dd42e1 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "test_custom_gtffilter_gzip": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "1": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "versions": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:11.091273747" + }, + "test_custom_gtffilter": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "1": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "versions": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:03.654104046" + }, + "test_custom_gtffilter_gzip - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:24.216284615" + }, + "test_custom_gtffilter - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:17.765499066" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/gtffilter/tests/tags.yml b/modules/nf-core/custom/gtffilter/tests/tags.yml new file mode 100644 index 000000000..34dda2178 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/tags.yml @@ -0,0 +1,2 @@ +custom/gtffilter: + - "modules/nf-core/custom/gtffilter/**" diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 473d314b7..76fe152dd 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -1,12 +1,13 @@ -include { SEQKIT_SPLIT } from '../../modules/local/seqkit/split' -include { BOWTIE_BUILD } from '../../modules/nf-core/bowtie/build' -include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build' -include { BWA_INDEX } from '../../modules/nf-core/bwa/index' -include { HISAT2_EXTRACTSPLICESITES } from '../../modules/nf-core/hisat2/extractsplicesites' -include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build' -include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' -include { GAWK as CLEAN_FASTA } from '../../modules/nf-core/gawk' -include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx' +include { CUSTOM_GTFFILTER as GTFFILTER } from '../../modules/nf-core/custom/gtffilter' +include { SEQKIT_SPLIT } from '../../modules/local/seqkit/split' +include { BOWTIE_BUILD } from '../../modules/nf-core/bowtie/build' +include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build' +include { BWA_INDEX } from '../../modules/nf-core/bwa/index' +include { HISAT2_EXTRACTSPLICESITES } from '../../modules/nf-core/hisat2/extractsplicesites' +include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build' +include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' +include { GAWK as CLEAN_FASTA } from '../../modules/nf-core/gawk' +include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx' workflow PREPARE_GENOME { @@ -26,6 +27,9 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(CLEAN_FASTA.out.versions) } + GTFFILTER(ch_gtf, ch_fasta) + ch_gtf = GTFFILTER.out.gtf + SEQKIT_SPLIT(ch_fasta) BOWTIE_BUILD(ch_fasta.map{ meta, fasta -> fasta }) @@ -43,7 +47,8 @@ workflow PREPARE_GENOME { SAMTOOLS_FAIDX(ch_fasta, [[], []]) // Collect versions - ch_versions = ch_versions.mix(SEQKIT_SPLIT.out.versions, + ch_versions = ch_versions.mix(GTFFILTER.out.versions, + SEQKIT_SPLIT.out.versions, BOWTIE_BUILD.out.versions, BOWTIE2_BUILD.out.versions, BWA_INDEX.out.versions, @@ -53,6 +58,7 @@ workflow PREPARE_GENOME { SAMTOOLS_FAIDX.out.versions) emit: + gtf = ch_gtf faidx = SAMTOOLS_FAIDX.out.fai bowtie = params.bowtie ?: BOWTIE_BUILD.out.index bowtie2 = params.bowtie2 ? Channel.value([[id: "bowtie2"], file(params.bowtie2, checkIfExists: true)]) : BOWTIE2_BUILD.out.index.collect() diff --git a/workflows/circrna/main.nf b/workflows/circrna/main.nf index a16fb7525..0d852ae7a 100644 --- a/workflows/circrna/main.nf +++ b/workflows/circrna/main.nf @@ -112,6 +112,7 @@ workflow CIRCRNA { ch_gtf ) + ch_gtf = PREPARE_GENOME.out.gtf bowtie_index = PREPARE_GENOME.out.bowtie bowtie2_index = PREPARE_GENOME.out.bowtie2 bwa_index = PREPARE_GENOME.out.bwa