Stitch (nf-core#3766)

* stitch module yaml * stitch * status * test path * test path * stitch seed * stitch seed * stitch seed * stitch seed * stitch seed * stitch seed * stitch test refactoring * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * more tests * check if bam * check if bam * check if bam * check if bam * check if bam * label to medium * plot hashes removed
rannick · Aug 24, 2023 · f732bfc · f732bfc
1 parent 1288cb1
commit f732bfc
Show file tree

Hide file tree

Showing 6 changed files with 486 additions and 0 deletions.
diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf
@@ -0,0 +1,86 @@
+process STITCH {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::r-stitch=1.6.10"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/r-stitch:1.6.10--r43h06b5641_0':
+        'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }"
+
+    input:
+    tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
+    tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist)
+    tuple val(meta3), path(fasta), path(fasta_fai)
+    val seed
+
+    output:
+    tuple val(meta), path("input", type: "dir") , emit: input
+    tuple val(meta), path("RData", type: "dir") , emit: rdata
+    tuple val(meta), path("plots", type: "dir") , emit: plots , optional: { generate_input_only }
+    tuple val(meta), path("*.vcf.gz")           , emit: vcf   , optional: { generate_input_only || bgen_output }
+    tuple val(meta), path("*.bgen")             , emit: bgen  , optional: { generate_input_only || !bgen_output }
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix               = task.ext.prefix ?: "${meta.id}"
+    def args                 = task.ext.args   ?: ""
+    def args2                = task.ext.args2  ?: ""
+    def generate_input_only  = args2.contains( "--generateInputOnly TRUE" )
+    def bgen_output          = args2.contains( "--output_format bgen" )
+    def reads_ext            = collected_crams             ? collected_crams.extension.unique()                                : []
+    def rsync_cmd            = rdata                       ? "rsync -rL ${rdata}/ RData"                                       : ""
+    def stitch_cmd           = seed                        ? "Rscript <(cat \$(which STITCH.R) | tail -n +2 | cat <(echo 'set.seed(${seed})') -)" : "STITCH.R"
+    def cramlist_cmd         = cramlist && reads_ext == ["cram"] ? "--cramlist ${cramlist}"                                    : ""
+    def bamlist_cmd          = cramlist && reads_ext == ["bam" ] ? "--bamlist ${cramlist}"                                     : ""
+    def reference_cmd        = fasta                       ? "--reference ${fasta}"                                            : ""
+    def regenerate_input_cmd = input && rdata && !cramlist ? "--regenerateInput FALSE --originalRegionName ${chromosome_name}" : ""
+    def rsync_version_cmd    = rdata                       ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync  version //; s/ .*\$//')" : ""
+    """
+    ${rsync_cmd} ${args}
+
+    ${stitch_cmd} \\
+        --chr ${chromosome_name} \\
+        --posfile ${posfile} \\
+        --outputdir . \\
+        --nCores ${task.cpus} \\
+        --K ${K} \\
+        --nGen ${nGen} \\
+        ${cramlist_cmd} \\
+        ${bamlist_cmd} \\
+        ${reference_cmd} \\
+        ${regenerate_input_cmd} \\
+        ${args2}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ${rsync_version_cmd}
+        r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])")
+        r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))")
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix               = task.ext.prefix     ?: "${meta.id}"
+    def args                 = task.ext.args       ?: ""
+    def args2                = task.ext.args2      ?: ""
+    def generate_input_only  = args2.contains( "--generateInputOnly TRUE" )
+    def generate_plots_cmd   = generate_input_only ? "mkdir plots"                                                                   : ""
+    def generate_vcf_cmd     = generate_input_only ? "touch ${prefix}.vcf.gz"                                                        : ""
+    def rsync_version_cmd    = rdata               ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync  version //; s/ .*\$//')" : ""
+    """
+    touch input
+    touch RData
+    ${generate_plots_cmd}
+    ${generate_vcf_cmd}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ${rsync_version_cmd}
+        r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])")
+        r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml
@@ -0,0 +1,140 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "stitch"
+description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format."
+keywords:
+  - imputation
+  - genomics
+  - vcf
+  - bgen
+  - cram
+  - bam
+  - sam
+tools:
+  - "stitch":
+      description: "STITCH - Sequencing To Imputation Through Constructing Haplotypes"
+      homepage: "https://github.com/rwdavies/stitch"
+      documentation: "https://github.com/rwdavies/stitch"
+      tool_dev_url: "https://github.com/rwdavies/stitch"
+      doi: "10.1038/ng.3594"
+      licence: "['GPL v3']"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing information about the set of positions to run the imputation over
+        e.g. `[ id:'test' ]`
+
+  - posfile:
+      type: file
+      description: |
+        Tab-separated file describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information.
+      pattern: "*.tsv"
+
+  - input:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "input"
+
+  - rdata:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "RData"
+
+  - chromosome_name:
+      type: string
+      description: Name of the chromosome to impute. Should match a chromosome name in the reference genome.
+
+  - K:
+      type: integer
+      description: Number of ancestral haplotypes to use for imputation. Refer to the documentation for the `--K` argument of STITCH for more information.
+
+  - nGen:
+      type: integer
+      description: Number of generations since founding of the population to use for imputation. Refer to the documentation for the `--nGen` argument of STITCH for more information.
+
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing information about the set of samples
+        e.g. `[ id:'test' ]`
+
+  - collected_crams:
+      type: file
+      description: List of sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+
+  - collected_crais:
+      type: file
+      description: List of BAM/CRAM/SAM index files
+      pattern: "*.{bai,crai,sai}"
+
+  - cramlist:
+      type: file
+      description: |
+        Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path.
+      pattern: "*.txt"
+
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing information about the reference genome used
+        e.g. `[ id:'test' ]`
+
+  - fasta:
+      type: file
+      description: FASTA reference genome file
+      pattern: "*.{fa,fasta}"
+
+  - fasta_fai:
+      type: file
+      description: FASTA index file
+      pattern: "*.{fai}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test' ]`
+
+  - input:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "input"
+
+  - rdata:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "RData"
+
+  - plots:
+      type: directory
+      description: |
+        Folder containing plots produced by STITCH during imputation. Which plots are produced depends on the command-line arguments passed to STITCH.
+      pattern: "plots"
+
+  - vcf:
+      type: file
+      description: |
+        Imputed genotype calls for the positions in `posfile`, in vcf format. This is the default output.
+      pattern: ".vcf.gz"
+
+  - bgen:
+      type: file
+      description: |
+        Imputed genotype calls for the positions in `posfile`, in vcf format. This is the produced if `--output_format bgen` is specified.
+      pattern: ".bgen"
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@saulpierotti"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
@@ -3640,6 +3640,10 @@ star/genomegenerate:
   - modules/nf-core/star/genomegenerate/**
   - tests/modules/nf-core/star/genomegenerate/**
 
+stitch:
+  - modules/nf-core/stitch/**
+  - tests/modules/nf-core/stitch/**
+
 stranger:
   - modules/nf-core/stranger/**
   - tests/modules/nf-core/stranger/**

diff --git a/tests/modules/nf-core/stitch/main.nf b/tests/modules/nf-core/stitch/main.nf
@@ -0,0 +1,135 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { STITCH as STITCH_ONE_STEP        } from '../../../../modules/nf-core/stitch/main.nf'
+include { STITCH as STITCH_GENERATE_INPUTS } from '../../../../modules/nf-core/stitch/main.nf'
+include { STITCH as STITCH_IMPUTE_ONLY     } from '../../../../modules/nf-core/stitch/main.nf'
+
+// positions and essential parameters
+def posfile             = file(params.test_data['homo_sapiens']['genome']['genome_21_stitch_posfile'], checkIfExists: true)
+def input_empty         = []
+def rdata_empty         = []
+def chromosome_name_val = "chr21"
+def K_val               = 2
+def nGen_val            = 1
+def stitch_input        = [ [ id: "test_positions" ], posfile, input_empty, rdata_empty, chromosome_name_val, K_val, nGen_val ]
+
+// sequencing data in cram format
+def crams_val = [
+    params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram' ],
+    params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_cram'],
+]
+def crais_val = [
+    params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai' ],
+    params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_cram_crai'],
+]
+def reads = [ [ id:"test_reads" ], crams_val, crais_val ]
+
+// sequencing data in bam format
+def bams_val = [
+    params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_bam' ],
+    params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_bam'],
+]
+def bais_val = [
+    params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_bam_bai' ],
+    params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_bam_bai'],
+]
+def reads_bam = [ [ id:"test_reads" ], bams_val, bais_val ]
+
+// reference genome
+def reference = [
+    [ id:"test_reference" ],
+    file(params.test_data['homo_sapiens']['genome']['genome_21_fasta']    , checkIfExists: true),
+    file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true),
+]
+
+// for reproducibility
+def seed = 1
+
+//
+// Test workflows
+//
+
+workflow GET_READS {
+    main:
+    cramlist = Channel.fromPath( crams_val )
+    .map { it[-1] as String } // get only filename
+    .collectFile( name: "cramlist.txt", newLine: true, sort: true )
+
+
+    emit:
+    Channel.of( reads ).combine( cramlist ).first()
+}
+
+workflow GET_READS_BAM {
+    main:
+    bamlist = Channel.fromPath( bams_val )
+    .map { it[-1] as String } // get only filename
+    .collectFile( name: "bamlist.txt", newLine: true, sort: true )
+
+
+    emit:
+    Channel.of( reads_bam ).combine( bamlist ).first()
+}
+
+
+workflow test_with_seed {
+    GET_READS()
+    STITCH_ONE_STEP (
+        stitch_input,
+        GET_READS.out,
+        reference,
+        seed,
+    )
+}
+
+workflow test_no_seed {
+    GET_READS()
+    STITCH_ONE_STEP (
+        stitch_input,
+        GET_READS.out,
+        reference,
+        [],
+    )
+}
+
+workflow test_two_stage_imputation {
+    GET_READS()
+    STITCH_GENERATE_INPUTS (
+        stitch_input,
+        GET_READS.out,
+        reference,
+        seed,
+    )
+
+    Channel.of( stitch_input )
+    .map {
+        meta, positions, input, rdata, chromosome_name, K, nGen ->
+        [ meta, positions ]
+    }
+    .join ( STITCH_GENERATE_INPUTS.out.input )
+    .join ( STITCH_GENERATE_INPUTS.out.rdata )
+    .map {
+        meta, positions, input, rdata ->
+        [ meta, positions, input, rdata, chromosome_name_val, K_val, nGen_val ]
+    }
+    .set { stitch_input_second_step }
+
+    STITCH_IMPUTE_ONLY(
+        stitch_input_second_step,
+        [[id: null], [], [], []],
+        [[id: null], [], []],
+        seed,
+    )
+}
+
+workflow test_bam {
+    GET_READS_BAM()
+    STITCH_ONE_STEP (
+        stitch_input,
+        GET_READS_BAM.out,
+        reference,
+        seed,
+    )
+}
diff --git a/tests/modules/nf-core/stitch/nextflow.config b/tests/modules/nf-core/stitch/nextflow.config
@@ -0,0 +1,12 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+    withName: STITCH_GENERATE_INPUTS {
+        ext.args2 = "--generateInputOnly TRUE"
+    }
+    withName: STITCH_IMPUTE_ONLY {
+        ext.args2 = "--regenerateInputWithDefaultValues TRUE"
+    }
+
+}