Skip to content

Commit

Permalink
Stitch (nf-core#3766)
Browse files Browse the repository at this point in the history
* stitch module yaml

* stitch

* status

* test path

* test path

* stitch seed

* stitch seed

* stitch seed

* stitch seed

* stitch seed

* stitch seed

* stitch test refactoring

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* more tests

* check if bam

* check if bam

* check if bam

* check if bam

* check if bam

* label to medium

* plot hashes removed
  • Loading branch information
saulpierotti authored Aug 24, 2023
1 parent 1288cb1 commit f732bfc
Show file tree
Hide file tree
Showing 6 changed files with 486 additions and 0 deletions.
86 changes: 86 additions & 0 deletions modules/nf-core/stitch/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
process STITCH {
tag "$meta.id"
label 'process_medium'

conda "bioconda::r-stitch=1.6.10"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/r-stitch:1.6.10--r43h06b5641_0':
'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }"

input:
tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist)
tuple val(meta3), path(fasta), path(fasta_fai)
val seed

output:
tuple val(meta), path("input", type: "dir") , emit: input
tuple val(meta), path("RData", type: "dir") , emit: rdata
tuple val(meta), path("plots", type: "dir") , emit: plots , optional: { generate_input_only }
tuple val(meta), path("*.vcf.gz") , emit: vcf , optional: { generate_input_only || bgen_output }
tuple val(meta), path("*.bgen") , emit: bgen , optional: { generate_input_only || !bgen_output }
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ""
def args2 = task.ext.args2 ?: ""
def generate_input_only = args2.contains( "--generateInputOnly TRUE" )
def bgen_output = args2.contains( "--output_format bgen" )
def reads_ext = collected_crams ? collected_crams.extension.unique() : []
def rsync_cmd = rdata ? "rsync -rL ${rdata}/ RData" : ""
def stitch_cmd = seed ? "Rscript <(cat \$(which STITCH.R) | tail -n +2 | cat <(echo 'set.seed(${seed})') -)" : "STITCH.R"
def cramlist_cmd = cramlist && reads_ext == ["cram"] ? "--cramlist ${cramlist}" : ""
def bamlist_cmd = cramlist && reads_ext == ["bam" ] ? "--bamlist ${cramlist}" : ""
def reference_cmd = fasta ? "--reference ${fasta}" : ""
def regenerate_input_cmd = input && rdata && !cramlist ? "--regenerateInput FALSE --originalRegionName ${chromosome_name}" : ""
def rsync_version_cmd = rdata ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : ""
"""
${rsync_cmd} ${args}
${stitch_cmd} \\
--chr ${chromosome_name} \\
--posfile ${posfile} \\
--outputdir . \\
--nCores ${task.cpus} \\
--K ${K} \\
--nGen ${nGen} \\
${cramlist_cmd} \\
${bamlist_cmd} \\
${reference_cmd} \\
${regenerate_input_cmd} \\
${args2}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
${rsync_version_cmd}
r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])")
r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))")
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
def args = task.ext.args ?: ""
def args2 = task.ext.args2 ?: ""
def generate_input_only = args2.contains( "--generateInputOnly TRUE" )
def generate_plots_cmd = generate_input_only ? "mkdir plots" : ""
def generate_vcf_cmd = generate_input_only ? "touch ${prefix}.vcf.gz" : ""
def rsync_version_cmd = rdata ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : ""
"""
touch input
touch RData
${generate_plots_cmd}
${generate_vcf_cmd}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
${rsync_version_cmd}
r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])")
r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))")
END_VERSIONS
"""
}
140 changes: 140 additions & 0 deletions modules/nf-core/stitch/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
name: "stitch"
description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format."
keywords:
- imputation
- genomics
- vcf
- bgen
- cram
- bam
- sam
tools:
- "stitch":
description: "STITCH - Sequencing To Imputation Through Constructing Haplotypes"
homepage: "https://github.com/rwdavies/stitch"
documentation: "https://github.com/rwdavies/stitch"
tool_dev_url: "https://github.com/rwdavies/stitch"
doi: "10.1038/ng.3594"
licence: "['GPL v3']"

input:
- meta:
type: map
description: |
Groovy Map containing information about the set of positions to run the imputation over
e.g. `[ id:'test' ]`
- posfile:
type: file
description: |
Tab-separated file describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information.
pattern: "*.tsv"

- input:
type: directory
description: |
Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
pattern: "input"

- rdata:
type: directory
description: |
Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
pattern: "RData"

- chromosome_name:
type: string
description: Name of the chromosome to impute. Should match a chromosome name in the reference genome.

- K:
type: integer
description: Number of ancestral haplotypes to use for imputation. Refer to the documentation for the `--K` argument of STITCH for more information.

- nGen:
type: integer
description: Number of generations since founding of the population to use for imputation. Refer to the documentation for the `--nGen` argument of STITCH for more information.

- meta2:
type: map
description: |
Groovy Map containing information about the set of samples
e.g. `[ id:'test' ]`
- collected_crams:
type: file
description: List of sorted BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"

- collected_crais:
type: file
description: List of BAM/CRAM/SAM index files
pattern: "*.{bai,crai,sai}"

- cramlist:
type: file
description: |
Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path.
pattern: "*.txt"

- meta3:
type: map
description: |
Groovy Map containing information about the reference genome used
e.g. `[ id:'test' ]`
- fasta:
type: file
description: FASTA reference genome file
pattern: "*.{fa,fasta}"

- fasta_fai:
type: file
description: FASTA index file
pattern: "*.{fai}"

output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'test' ]`
- input:
type: directory
description: |
Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
pattern: "input"

- rdata:
type: directory
description: |
Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
pattern: "RData"

- plots:
type: directory
description: |
Folder containing plots produced by STITCH during imputation. Which plots are produced depends on the command-line arguments passed to STITCH.
pattern: "plots"

- vcf:
type: file
description: |
Imputed genotype calls for the positions in `posfile`, in vcf format. This is the default output.
pattern: ".vcf.gz"

- bgen:
type: file
description: |
Imputed genotype calls for the positions in `posfile`, in vcf format. This is the produced if `--output_format bgen` is specified.
pattern: ".bgen"

- versions:
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@saulpierotti"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3640,6 +3640,10 @@ star/genomegenerate:
- modules/nf-core/star/genomegenerate/**
- tests/modules/nf-core/star/genomegenerate/**

stitch:
- modules/nf-core/stitch/**
- tests/modules/nf-core/stitch/**

stranger:
- modules/nf-core/stranger/**
- tests/modules/nf-core/stranger/**
Expand Down
135 changes: 135 additions & 0 deletions tests/modules/nf-core/stitch/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env nextflow

nextflow.enable.dsl = 2

include { STITCH as STITCH_ONE_STEP } from '../../../../modules/nf-core/stitch/main.nf'
include { STITCH as STITCH_GENERATE_INPUTS } from '../../../../modules/nf-core/stitch/main.nf'
include { STITCH as STITCH_IMPUTE_ONLY } from '../../../../modules/nf-core/stitch/main.nf'

// positions and essential parameters
def posfile = file(params.test_data['homo_sapiens']['genome']['genome_21_stitch_posfile'], checkIfExists: true)
def input_empty = []
def rdata_empty = []
def chromosome_name_val = "chr21"
def K_val = 2
def nGen_val = 1
def stitch_input = [ [ id: "test_positions" ], posfile, input_empty, rdata_empty, chromosome_name_val, K_val, nGen_val ]

// sequencing data in cram format
def crams_val = [
params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram' ],
params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_cram'],
]
def crais_val = [
params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai' ],
params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_cram_crai'],
]
def reads = [ [ id:"test_reads" ], crams_val, crais_val ]

// sequencing data in bam format
def bams_val = [
params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_bam' ],
params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_bam'],
]
def bais_val = [
params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_bam_bai' ],
params.test_data['homo_sapiens']['illumina']['test2_paired_end_recalibrated_sorted_bam_bai'],
]
def reads_bam = [ [ id:"test_reads" ], bams_val, bais_val ]

// reference genome
def reference = [
[ id:"test_reference" ],
file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'] , checkIfExists: true),
file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true),
]

// for reproducibility
def seed = 1

//
// Test workflows
//

workflow GET_READS {
main:
cramlist = Channel.fromPath( crams_val )
.map { it[-1] as String } // get only filename
.collectFile( name: "cramlist.txt", newLine: true, sort: true )


emit:
Channel.of( reads ).combine( cramlist ).first()
}

workflow GET_READS_BAM {
main:
bamlist = Channel.fromPath( bams_val )
.map { it[-1] as String } // get only filename
.collectFile( name: "bamlist.txt", newLine: true, sort: true )


emit:
Channel.of( reads_bam ).combine( bamlist ).first()
}


workflow test_with_seed {
GET_READS()
STITCH_ONE_STEP (
stitch_input,
GET_READS.out,
reference,
seed,
)
}

workflow test_no_seed {
GET_READS()
STITCH_ONE_STEP (
stitch_input,
GET_READS.out,
reference,
[],
)
}

workflow test_two_stage_imputation {
GET_READS()
STITCH_GENERATE_INPUTS (
stitch_input,
GET_READS.out,
reference,
seed,
)

Channel.of( stitch_input )
.map {
meta, positions, input, rdata, chromosome_name, K, nGen ->
[ meta, positions ]
}
.join ( STITCH_GENERATE_INPUTS.out.input )
.join ( STITCH_GENERATE_INPUTS.out.rdata )
.map {
meta, positions, input, rdata ->
[ meta, positions, input, rdata, chromosome_name_val, K_val, nGen_val ]
}
.set { stitch_input_second_step }

STITCH_IMPUTE_ONLY(
stitch_input_second_step,
[[id: null], [], [], []],
[[id: null], [], []],
seed,
)
}

workflow test_bam {
GET_READS_BAM()
STITCH_ONE_STEP (
stitch_input,
GET_READS_BAM.out,
reference,
seed,
)
}
12 changes: 12 additions & 0 deletions tests/modules/nf-core/stitch/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
process {

publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

withName: STITCH_GENERATE_INPUTS {
ext.args2 = "--generateInputOnly TRUE"
}
withName: STITCH_IMPUTE_ONLY {
ext.args2 = "--regenerateInputWithDefaultValues TRUE"
}

}
Loading

0 comments on commit f732bfc

Please sign in to comment.