diff --git a/Dockerfile b/Dockerfile index 5be9f7d..c2de092 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,8 +70,8 @@ COPY --from=bioinformatics_base ${star_dir}/source/STAR ${star_dir}/source/STAR #===========================# # Installing tools # #===========================# -RUN mkdir -p /scripts /resources /results -RUN chmod ugo+wx /results +RUN mkdir -p /scripts /resources /results /STAR_tmp +RUN chmod ugo+wx /results /STAR_tmp COPY ERVmapping.sh /scripts COPY templates/ERValign.sh /scripts COPY templates/ERVcount.sh /scripts diff --git a/README.md b/README.md index 1668c8f..7f09a5f 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,7 @@ This option can only have 3 values: { `ALL`, `STAR`, `BED` }: * `STAR` to only perform the alignment; * `BED` to only run the ERV quantification. -### Optional parameters (recommended) - +### Optional parameters (*recommended*) There are a few parameters that can be added to the ERVmap image to make the process more efficient. * `--cpus 20`: if you have a multi-core system (and you should have one), you can specify the number of CPUs to use (e.g. 20); @@ -86,22 +85,25 @@ There are also other parameters from Docker that should be included before `ervm ## Nextflow version To run this pipeline using [Nextflow](https://www.nextflow.io/), simply run the following: -`nextflow -C nextflow.config run main.nf` +`nextflow -C nextflow.config run main.nf` where `nextflow.config` include the minimum set of parameters to run ERVmap within the docker container. Specifically: ```bash params { - inputDir='' # external path of the input data - outputDir='' # external path of the output results - localOutDir='' # internal path of the results - outPrefix='test.' 
# [optional] it defines the prefix for the results - cpus=1 # Number of cpus/threads to use for the alignment - limitMemory=32000000 # memory limit for samtools - debug='off' # either [on|off] + genome='/path/to/genome' # external path to the indexed genome for the STAR aligner + inputDir='path/to/input/folder' # external path of the input data + inputPattern="*{1,2}.fastq.gz" # pattern to search for input FASTQ files + outputDir='/path/to/output/folder' # external path of the output results + starTmpDir='/path/to/STAR/temp/folder' # external path of the STAR aligner temporary folder. REQUIRED + localOutDir='.' # internal path of the results + cpus=20 # Number of cpus/threads to use for the alignment + limitMemory=1850861158 # memory limit for STAR + debug='off' # either [on|off] } ``` +**NOTE:** Adjust the memory settings of the docker container if needed, but recall that STAR requires about 32G of RAM (see [Optional Parameters](#optparam)). -**NOTE**: another critical parameter is the location of the indexed genome. This information must be included in the config as `containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro'` (replace `/path/to/genome` with the external location of the genome) +**NOTE:** If you need to keep the BAM files generated, please make sure to make a copy. By default, the BAMs remain in the nextflow `work` folder and only symbolic links are available in `outputDir`. By cleaning up the `work` folder, e.g. by running `nextflow clean`, the BAM files will be removed. The ERVmap results are copied into `outputDir` and thus are permanent. ---- diff --git a/main.nf b/main.nf index ef4b815..dcd1447 100644 --- a/main.nf +++ b/main.nf @@ -3,50 +3,56 @@ // check parameters if (!params.inputDir) { - exit 1, "inputDir parameter is missing." + exit 1, 'inputDir parameter is missing.' 
+} +if (!new File(params.inputDir).exists()) { + exit 1, "The input folder does not exists."+params.inputDir+"\n" } if (!params.outputDir) { exit 1, "outputDir parameter is missing." } -if ( !new File(params.inputDir).exists()) { - exit 1, "The input folder does not exists."+params.inputDir+"\n" +if (!params.starTmpDir) { + exit 1, "starTmpDir parameter is missing." +} + +if (!new File(params.starTmpDir).exists()) { + exit 1, 'The STAR temporary folder does not exists. ('+params.starTmpDir+')\n' } -if (!params.outPrefix) { - exit 1, "Output prefix parameter is missing." +if (!params.localOutputDir) { + params.localOutputDir='bam' } if (!params.debug) { exit 1, "Debug prefix parameter is missing." } -if (!params.localOutDir) { - params.localOutDir='bam' -} -pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/*{1,2}.fastq.gz", size: 2, checkIfExists: true ) + +pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/"+params.inputPattern, size: 2, checkIfExists: true ) + process ERValign { tag "${sample}" // executor configuration time '8h' - // memory '35.GB' + memory '35 GB' scratch true cpus params.cpus - storeDir params.outputDir + publishDir params.outputDir // other configuration echo true errorStrategy 'terminate' input: - val(outPrefix) from params.outPrefix - val(localOutDir) from params.localOutDir - val(limitMemory) from params.limitMemory - val(debug) from params.debug - tuple val(sample), file(reads) from pairFiles_ch + tuple val(sample), file (reads) from pairFiles_ch + val (localOutputDir) from params.localOutputDir + val (limitMemory) from params.limitMemory + val (debug) from params.debug output: - path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam" ) into bam_ch - path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam.bai" ) into bai_ch + path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam") into bam_ch + path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam.bai") into bai_ch + val (sample) into 
prefix_ch shell: template 'ERValign.sh' @@ -64,20 +70,19 @@ process ERVcount { // other configuration echo true errorStrategy 'terminate' - mode = 'BED' stageInMode 'symlink' input: - val(outPrefix) from params.outPrefix - val(debug) from params.debug + val (sample) from prefix_ch + val (debug) from params.debug path (bam) from bam_ch path (bai) from bai_ch output: - path (params.outPrefix+'ERVresults.txt') into final_results_ch + path ("${sample}"+'.ERVresults.txt') into final_results_ch shell: - template "ERVcount.sh" + template 'ERVcount.sh' } // ~~~~~~~~~~~~~~~ PIPELINE COMPLETION EVENTS ~~~~~~~~~~~~~~~~~~~ // diff --git a/nextflow.config b/nextflow.config index b2c6696..79dccf8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,19 +9,21 @@ docker { } params { - inputDir='' - outputDir='' - localOutDir='' - outPrefix='test.' - cpus=1 - limitMemory=32000000 + genome='/path/to/genome' + inputDir='path/to/input/folder' + inputPattern="*{1,2}.fastq.gz" + outputDir='/path/to/output/folder' + starTmpDir='/path/to/STAR/temp/folder' + localOutDir='.' 
+ cpus=3 + limitMemory=1850861158 debug='off' } process { withName: ERValign { container = 'eipm/ervmap:latest' - containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro' + containerOptions = "--memory 35G --memory-swap 100G -v ${params.genome}:/genome:ro -v ${params.starTmpDir}:/STAR_tmp" } withName: ERVcount { diff --git a/templates/ERValign.sh b/templates/ERValign.sh index 87b1f17..a678506 100644 --- a/templates/ERValign.sh +++ b/templates/ERValign.sh @@ -24,16 +24,12 @@ logMsg() { fi } -function usage() { - echo "Usage: ERValign.sh <-r1|--read1> SAMPLE_1.fastq.gz <-r2|--read2> SAMPLE_1.fastq.gz [-o|--output] results/SAMPLE [-c|--cpus] Ncpus [-l|--limit-ram] 35129075129 [-d|--debug {off|on}]\nA genome folder should be present in /genome" -} - # initializing parameters for STAR READS="!{reads}" CPUS=!{task.cpus} LIMIT_RAM=!{limitMemory} -OUT_PREFIX="!{outPrefix}" -LOCAL_OUTDIR="!{localOutDir}" +OUT_PREFIX="!{sample}." +LOCAL_OUTDIR="!{localOutputDir}" # checking the prefix of the output BAM if [ -z ${OUT_PREFIX+x} ];then @@ -46,7 +42,8 @@ if [ -z ${LIMIT_RAM+x} ];then export LIMIT_RAM=35129075129;fi [ -e "/genome/genomeParameters.txt" ] || logMsg "ERROR" "The indexed genome cannot be found. Check that it is present and you have read permissions." 
logMsg "DEBUG" "OUT_PREFIX:($OUT_PREFIX)" -logMsg "DEBUG" "Reads: ($READS)"l +logMsg "DEBUG" "Local OutDir: $(pwd)/$LOCAL_OUTDIR" +logMsg "DEBUG" "Reads: ($READS)" logMsg "DEBUG" "CPUs:($CPUS)" logMsg "DEBUG" "Limit RAM:($LIMIT_RAM)" @@ -55,7 +52,7 @@ logMsg "INFO" "-------- START ERValign ---------" BAM="$LOCAL_OUTDIR/$OUT_PREFIX""Aligned.sortedByCoord.out.bam" logMsg "INFO" "---- Alignment ----" -STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX +STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX --outTmpDir /STAR_tmp/$OUT_PREFIX""tmp [ $? == 0 ] || logMsg "ERROR" "The alignment didn't complete succesfully. Check the logs." logMsg "INFO" "---- Alignment Complete ----" diff --git a/templates/ERVcount.sh b/templates/ERVcount.sh index a0d0c34..38a33b8 100644 --- a/templates/ERVcount.sh +++ b/templates/ERVcount.sh @@ -1,9 +1,5 @@ #!/bin/bash -function usage() { - echo "Usage: ERVcount.sh <-b|--bam> Aligned.bam [-o|--output] results/SAMPLE [-d|--debug {off|on}]" -} - BAM="!{bam}" _DEBUG="!{debug}" @@ -28,7 +24,7 @@ logMsg() { fi } -OUT_PREFIX=!{outPrefix} +OUT_PREFIX="!{sample}." if [ -z ${OUT_PREFIX+x} ];then OUT_PREFIX="$RANDOM""_" logMsg "WARN" "This prefix will be used as output: $OUT_PREFIX"