diff --git a/.circleci/config.yml b/.circleci/config.yml index 47d815f3..e01a76b2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -24,7 +24,9 @@ jobs: - checkout - run: pip3 install -r tests/requirements.txt - - run: pytest --git-aware tests/subworkflows/genotyping_empirical/freebayes/polyrad + - run: pytest --git-aware tests/subworkflows/gatk_genotyping/ + - run: pytest --git-aware tests/subworkflows/stacks_genotyping/ + - run: pytest --git-aware tests/subworkflows/freebayes_genotyping/ release-to-github: <<: *machine_default diff --git a/.configurations/cromwell_mysql.conf b/.configurations/cromwell_mysql.conf index 9ada24b5..f90277b1 100644 --- a/.configurations/cromwell_mysql.conf +++ b/.configurations/cromwell_mysql.conf @@ -1,5 +1,5 @@ backend { - default = SlurmSingularity + default = Local providers { diff --git a/.dockerfiles/java-in-the-cloud/Dockerfile b/.dockerfiles/java-in-the-cloud/Dockerfile index abf78727..0227c496 100644 --- a/.dockerfiles/java-in-the-cloud/Dockerfile +++ b/.dockerfiles/java-in-the-cloud/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:7 +FROM ibmjava:jre COPY ./PedigreeSim.jar /usr/jars/ @@ -10,3 +10,10 @@ RUN apt-get update \ && make \ && mv vcf2diploid.jar /usr/jars/ + +RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \ + && unzip 5f68583d0f56.zip \ + && mkdir /usr/tassel \ + && mv tasseladmin-tassel-5-standalone-5f68583d0f56/* /usr/tassel + +RUN apt-get install -y samtools \ No newline at end of file diff --git a/.dockerfiles/tassel/Dockerfile b/.dockerfiles/tassel/Dockerfile new file mode 100644 index 00000000..dc6ce35d --- /dev/null +++ b/.dockerfiles/tassel/Dockerfile @@ -0,0 +1,11 @@ +FROM ibmjava:jre + +RUN apt update \ + && apt install -y parallel vcftools + +RUN conda install -y -c bioconda freebayes \ + && conda install -y -c bioconda vcflib + +RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \ + && unzip 5f68583d0f56.zip \ + && mv tasseladmin-tassel-5-standalone-5f68583d0f56/ /tassel diff --git a/.scripts/build_pipeline_release.sh b/.scripts/build_pipeline_release.sh index b94e011a..4b00a1e2 100755 --- a/.scripts/build_pipeline_release.sh +++ b/.scripts/build_pipeline_release.sh @@ -6,14 +6,16 @@ declare -r SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 & source ${SCRIPT_DIR}/common.sh declare ROOT_WDL="" +declare ROOT_JSON="" declare VERSION="" declare OUTPUT_DIR="" declare ENV="" -declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".options.json" +declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".inputs.json" function make_release() { local -r rootWdl=${ROOT_WDL} + local -r rootJSON=${ROOT_JSON} local -r wdlBasename=$(basename ${rootWdl} ${WDL_SUFFIX}) # Store the current working directory @@ -37,7 +39,8 @@ function make_release() { # Strip the paths out of the root WDL imports sed -E '/http/! s/import "(.*)\/(.*\'${WDL_SUFFIX}')"/import "\2"/g' ${rootWdl} > ${outputVersionedPrefix}${WDL_SUFFIX} - write_options ${rootWdl} ${outputVersionedPrefix} + sed -E '/http/! s/import "(.*)\/(.*\'${OPTIONS_SUFFIX}')"/import "\2"/g' ${rootJSON} > ${outputVersionedPrefix}${OPTIONS_SUFFIX} + # write_options ${rootWdl} ${outputVersionedPrefix} versioned_dependencies_zip=${outputVersionedPrefix}${ZIP_SUFFIX} @@ -91,6 +94,7 @@ function show_help() { echo "" echo "Arguments:" echo " -w The path to the workflow (.wdl) file" + echo " -j The path to the JSON input file template (.json) file" echo " -v The version of the workflow (used in building the release name)" echo " -o The directory into which the outputs will be written" echo " -e The environment (dev, staging, or prod)" @@ -98,7 +102,7 @@ function show_help() { echo "" } -while getopts "hw:v:o:e:" opt; do +while getopts "hw:j:v:o:e:" opt; do case ${opt} in h) show_help @@ -111,6 +115,13 @@ while getopts "hw:v:o:e:" opt; do fi ROOT_WDL=${OPTARG} ;; + j) + if [[ ! -f ${OPTARG} ]]; then + echo >&2 Error: ${OPTARG} does not exist! + exit 1 + fi + ROOT_JSON=${OPTARG} + ;; v) VERSION=${OPTARG} ;; diff --git a/.scripts/release_pipeline_to_github.sh b/.scripts/release_pipeline_to_github.sh index f01b338d..80c5b05b 100755 --- a/.scripts/release_pipeline_to_github.sh +++ b/.scripts/release_pipeline_to_github.sh @@ -103,9 +103,10 @@ function build_and_release_to_github() { local -r prerelease=${5} local -r pipelineName=$(basename ${pipeline} .wdl) local -r changelog=$(dirname ${pipeline})/${pipelineName}.changelog.md + local -r inputsName=$(dirname ${pipeline})/${pipelineName}.inputs.json stderr "Building artifacts for ${releaseName}" - ${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -e prod -v ${version} -o ${localReleaseDir} + ${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -j ${inputsName} -e prod -v ${version} -o ${localReleaseDir} stderr "Building release notes for ${releaseName}" local previousEntryStart @@ -197,8 +198,8 @@ function upload_to_github_as_draft() { ${pipelineName} \ ${version} \ ${releaseName} \ - "options.json" \ - "application/json" + "inputs.json" \ + "text/json" local -r dependenciesZip=${localReleaseDir}/${pipelineName}/${pipelineName}_${version}.zip @@ -308,4 +309,4 @@ localReleaseDir=$(mktemp -d) trap cleanup_failed_release EXIT -release_to_github ${PIPELINE_TO_RELEASE} ${ENV} +release_to_github ${PIPELINE_TO_RELEASE} ${ENV} \ No newline at end of file diff --git a/README.md b/README.md index d1435746..e384d086 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,32 @@ [![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) [![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map) -## Reads2Map -Reads2Map presents a collection of [WDL workflows](https://openwdl.org/) to build linkage maps from sequencing reads. Each workflow release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases). +

+
+ +
+

-The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. +Reads2Map is a collection of [WDL workflows](https://openwdl.org/) designed to facilitate the contruction of linkage maps from sequencing reads. You can find details of each workflow release on the Read2Map releases page, available [here](https://github.com/Cristianetaniguti/Reads2Map/releases). -By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build. +The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` used RADinitio software to simulate Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. -![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png) +The SNP calling step in Reads2Map currently includes the popular tools: GATK, Freebayes, TASSEL, and STACKs. For genotype/dosage calling, the workflow utilizes tools like updog, polyRAD, and SuperMASSA. Lastly, Reads2Map leverages OneMap, GUSMap, and MAPpoly for linkage map construction. + +For diploid data, you can visualize the results using the R package and shiny app called Reads2MapApp, available [here](https://github.com/Cristianetaniguti/Reads2MapApp). This package supports the visualization of linkage maps built using OneMap and GUSMap. + +The Reads2Map workflows perform the SNP and genotype/dosage calling for your complete data set. However, it builds the linkage map for only a single chromosome (reference genome is required) for each combination of software and parameters. The produced maps will probably still require improvements, but their characteristics will suggest which combination of SNP and genotype calling software and parameters you should use for your data. Once the pipeline is selected, you can input the respective VCF file in R and build the complete linkage map using OneMap or MAPpoly. Use [OneMap](https://statgen-esalq.github.io/tutorials/onemap/Outcrossing_Populations.html) or [MAPoly](https://rpubs.com/mmollin/tetra_mappoly_vignette) tutorials for guidance on building and improving the linkage map for the complete dataset. ## How to use -Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines). +Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines). -To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you which to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies). +In addition, we also suggest two wrappers: [cromwell-cli](https://github.com/lmtani/cromwell-cli) and [Caper](https://github.com/ENCODE-DCC/caper). Here is a tutorial on how to setup these tools and one example running the EmpiricalReads2Map: -## Documentation +* [Setup and run Reads2Map workflows](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html) + +To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you which to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies). Check the description of the inputs for the pipelines: @@ -25,14 +34,10 @@ Check the description of the inputs for the pipelines: * [SimulatedReads2Map](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html) -Check how to evaluate the workflows results in Reads2MapApp Shiny: +Check how to evaluate the workflows results in Reads2MapApp Shiny (so far only available for diploid datasets): * [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) -Once you selected the best pipeline using a subset of your data, you can build a complete high-density linkage map: - -* [A Guide to Build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html) - Check more information and examples of usage in: * [Taniguti, C. H., Taniguti, L. M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., & Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v2) @@ -44,6 +49,8 @@ Check more information and examples of usage in: - [ddRADseqTools](https://github.com/GGFHF/ddRADseqTools) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Set of applications useful to in silico design and testing of double digest RADseq (ddRADseq) experiments; - [Freebayes](https://github.com/ekg/freebayes) in [Cristaniguti/freebayes:0.0.1](): Variant call step; - [GATK](https://github.com/broadinstitute/gatk) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Variant call step using Haplotype Caller, GenomicsDBImport and GenotypeGVCFs; +- [TASSEL](https://www.maizegenetics.net/tassel) in [cristaniguti/java-in-the-cloud:0.0.2](https://hub.docker.com/repository/docker/cristaniguti/java-in-the-cloud/general): Variant Call +- [STACKs](https://catchenlab.life.illinois.edu/stacks/) in [cristaniguti/stacks:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/stacks/general): Variant Call - [PedigreeSim](https://github.com/PBR/pedigreeSim?files=1) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Simulates progeny genotypes from parents genotypes for different types of populations; - [picard](https://github.com/broadinstitute/picard) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files; - [pirs](https://github.com/galaxy001/pirs) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): To generate simulates paired-end reads from a reference genome; @@ -63,4 +70,9 @@ Check more information and examples of usage in: - [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data - [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids - [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results -- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations \ No newline at end of file +- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations +- [MAPpoly](https://github.com/mmollina/MAPpoly) in [cristaniguti/reads2map:0.0.5](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Build linkage maps for autopolyploid species + +### Funding + +This work was partially supported by the National Council for Scientific and Technological Development (CNPq - 313269/2021-1); by USDA, National Institute of Food and Agriculture (NIFA), Specialty Crop Research Initiative (SCRI) project “Tools for Genomics Assisted Breeding in Polyploids: Development of a Community Resource” (Award No. 2020-51181-32156); and by the Bill and Melinda Gates Foundation (OPP1213329) project SweetGAINS. \ No newline at end of file diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md index 05d8587f..7460fc67 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md @@ -1,3 +1,9 @@ +# 1.2.4 + +* runtimes adapted to run with Caper +* perform the genotype calling with updog, SuperMASSA and polyRAD with complete data set (not only for the selected chromosome) +* [new tutorial](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html) + # 1.2.3 * Supermassa has smaller probability threshold (bugfix) diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.inputs.json b/pipelines/EmpiricalMaps/EmpiricalMaps.inputs.json new file mode 100644 index 00000000..c18ba567 --- /dev/null +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.inputs.json @@ -0,0 +1,23 @@ +{ + "Maps.dataset": { + "parent2": "String", + "name": "String", + "parent1": "String", + "chromosome": "String", + "cross": "String", + "multiallelics": "Boolean" + }, + "Maps.max_cores": "Int", + "Maps.gatk_vcf_multi": "File? (optional)", + "Maps.gatk_mchap": "String", + "Maps.vcfs_counts_source": "Array[String]", + "Maps.filters": "String? (optional)", + "Maps.filt_segr": "String? (optional)", + "Maps.prob_thres": "Float? (optional)", + "Maps.ploidy": "Int", + "Maps.vcfs_software": "Array[String]", + "Maps.filter_noninfo": "Boolean", + "Maps.vcfs": "Array[File]", + "Maps.replaceADbyMissing": "String" +} + diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl index 6a90acdb..63630d34 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl @@ -20,6 +20,10 @@ workflow Maps { Array[File] vcfs Array[String] vcfs_software Array[String] vcfs_counts_source + Boolean run_updog = true + Boolean run_supermassa = false + Boolean run_polyrad = true + Boolean run_gusmap = false Boolean filter_noninfo String replaceADbyMissing File? gatk_vcf_multi @@ -35,10 +39,10 @@ workflow Maps { call utils.ApplyRandomFiltersArray { input: vcfs = vcfs, - vcfs_software = vcfs_software, - vcfs_counts_source = vcfs_counts_source, - filters = filters, - chromosome = dataset.chromosome + vcfs_SNPCall_software = vcfs_software, + vcfs_Counts_source = vcfs_counts_source, + vcfs_GenoCall_software = range(length(vcfs_software)), + filters = filters } } @@ -66,64 +70,71 @@ workflow Maps { File vcf_up = select_first([RemoveNonInformative.vcf_filtered, splitgeno.biallelics]) if(ploidy == 2) { - call genotyping.onemapMapsEmp as updogMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "updog", - CountsFrom = vcfs_counts_source[idx], - cross = dataset.cross, - parent1 = dataset.parent1, - parent2 = dataset.parent2, - chromosome = dataset.chromosome, - multiallelics = dataset.multiallelics, - multiallelics_file = splitgeno.multiallelics, - max_cores = max_cores, - ploidy = ploidy + if(run_updog){ + call genotyping.onemapMapsEmp as updogMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "updog", + CountsFrom = vcfs_counts_source[idx], + cross = dataset.cross, + parent1 = dataset.parent1, + parent2 = dataset.parent2, + chromosome = dataset.chromosome, + multiallelics = dataset.multiallelics, + multiallelics_file = splitgeno.multiallelics, + max_cores = max_cores, + ploidy = ploidy + } } - call genotyping.onemapMapsEmp as supermassaMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "supermassa", - CountsFrom = vcfs_counts_source[idx], - cross = dataset.cross, - parent1 = dataset.parent1, - parent2 = dataset.parent2, - chromosome = dataset.chromosome, - multiallelics = dataset.multiallelics, - multiallelics_file = splitgeno.multiallelics, - max_cores = max_cores, - ploidy = ploidy + if(run_supermassa){ + call genotyping.onemapMapsEmp as supermassaMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "supermassa", + CountsFrom = vcfs_counts_source[idx], + cross = dataset.cross, + parent1 = dataset.parent1, + parent2 = dataset.parent2, + chromosome = dataset.chromosome, + multiallelics = dataset.multiallelics, + multiallelics_file = splitgeno.multiallelics, + max_cores = max_cores, + ploidy = ploidy + } } - call genotyping.onemapMapsEmp as polyradMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "polyrad", - CountsFrom = vcfs_counts_source[idx], - cross = dataset.cross, - parent1 = dataset.parent1, - parent2 = dataset.parent2, - chromosome = dataset.chromosome, - multiallelics = dataset.multiallelics, - multiallelics_file = splitgeno.multiallelics, - max_cores = max_cores, - ploidy = ploidy + if(run_polyrad){ + call genotyping.onemapMapsEmp as polyradMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "polyrad", + CountsFrom = vcfs_counts_source[idx], + cross = dataset.cross, + parent1 = dataset.parent1, + parent2 = dataset.parent2, + chromosome = dataset.chromosome, + multiallelics = dataset.multiallelics, + multiallelics_file = splitgeno.multiallelics, + max_cores = max_cores, + ploidy = ploidy + } } - # Build maps with GUSMap - call gusmap.gusmapMapsEmp { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - CountsFrom = vcfs_counts_source[idx], - GenotypeCall_program = "gusmap", - parent1 = dataset.parent1, - parent2 = dataset.parent2, - max_cores = max_cores + if(run_gusmap){ + call gusmap.gusmapMapsEmp { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + CountsFrom = vcfs_counts_source[idx], + GenotypeCall_program = "gusmap", + parent1 = dataset.parent1, + parent2 = dataset.parent2, + max_cores = max_cores + } } if(vcfs_counts_source[idx] != "bam"){ @@ -147,52 +158,58 @@ workflow Maps { } if(ploidy > 2){ - call mappoly_sub.MappolyMapsEmp as updogPolyMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "updog", - CountsFrom = vcfs_counts_source[idx], - cross = "F1", - parent1 = dataset.parent1, - parent2 = dataset.parent2, - max_cores = max_cores, - ploidy = ploidy, - prob_thres = prob_thres, - filt_segr = filt_segr + if(run_updog){ + call mappoly_sub.MappolyMapsEmp as updogPolyMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "updog", + CountsFrom = vcfs_counts_source[idx], + cross = "F1", + parent1 = dataset.parent1, + parent2 = dataset.parent2, + max_cores = max_cores, + ploidy = ploidy, + prob_thres = prob_thres, + filt_segr = filt_segr + } } - call mappoly_sub.MappolyMapsEmp as polyradPolyMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "polyrad", - CountsFrom = vcfs_counts_source[idx], - cross = "F1", - parent1 = dataset.parent1, - parent2 = dataset.parent2, - max_cores = max_cores, - ploidy = ploidy, - prob_thres = prob_thres, - filt_segr = filt_segr + if(run_polyrad){ + call mappoly_sub.MappolyMapsEmp as polyradPolyMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "polyrad", + CountsFrom = vcfs_counts_source[idx], + cross = "F1", + parent1 = dataset.parent1, + parent2 = dataset.parent2, + max_cores = max_cores, + ploidy = ploidy, + prob_thres = prob_thres, + filt_segr = filt_segr + } } - - call mappoly_sub.MappolyMapsEmp as supermassaPolyMaps { - input: - vcf_file = vcf_up, - SNPCall_program = vcfs_software[idx], - GenotypeCall_program = "supermassa", - CountsFrom = vcfs_counts_source[idx], - cross = "F1", - parent1 = dataset.parent1, - parent2 = dataset.parent2, - max_cores = max_cores, - ploidy = ploidy, - prob_thres = prob_thres, - filt_segr = filt_segr + + if(run_supermassa){ + call mappoly_sub.MappolyMapsEmp as supermassaPolyMaps { + input: + vcf_file = vcf_up, + SNPCall_program = vcfs_software[idx], + GenotypeCall_program = "supermassa", + CountsFrom = vcfs_counts_source[idx], + cross = "F1", + parent1 = dataset.parent1, + parent2 = dataset.parent2, + max_cores = max_cores, + ploidy = ploidy, + prob_thres = prob_thres, + filt_segr = filt_segr + } } - if(vcfs_counts_source[idx] != "bam"){ + if(vcfs_counts_source[idx] != "bam" && vcfs_software[idx] != "stacks" && vcfs_software[idx] != "tassel"){ call mappoly_task.MappolyReport { input: vcf_file = vcf_up, @@ -210,35 +227,51 @@ workflow Maps { } } - if(ploidy == 2){ - Array[File] snpcaller_results = select_all(SNPCallerMapsEmp.tar_gz_report) - - # Compress files - call reports.JointReports { - input: - SNPCaller = snpcaller_results, - updog = updogMaps.tar_gz_report, - polyrad = polyradMaps.tar_gz_report, - supermassa = supermassaMaps.tar_gz_report, - gusmap = gusmapMapsEmp.tar_gz_report, - max_cores = max_cores - } + if(defined(updogMaps.tar_gz_report)) { + Array[File] snpcaller_results = select_all(SNPCallerMapsEmp.tar_gz_report) } - - if(ploidy > 2){ - - call reports.JointReportsPoly { - input: - SNPCaller = select_all(MappolyReport.results), - updog = select_all(updogPolyMaps.tar_gz_report), - polyrad = select_all(polyradPolyMaps.tar_gz_report), - supermassa = select_all(supermassaPolyMaps.tar_gz_report) - } + if(defined(updogMaps.tar_gz_report)) { + Array[File] updog_results = select_all(updogMaps.tar_gz_report) + } + if(defined(supermassaMaps.tar_gz_report)){ + Array[File] supermassa_results = select_all(supermassaMaps.tar_gz_report) + } + if(defined(polyradMaps.tar_gz_report)){ + Array[File] polyrad_results = select_all(polyradMaps.tar_gz_report) + } + if(defined(gusmapMapsEmp.tar_gz_report)){ + Array[File] gusmap_results = select_all(gusmapMapsEmp.tar_gz_report) + } + if(defined(MappolyReport.results)){ + Array[File] snpcaller_poly_results = select_all(MappolyReport.results) + } + if(defined(updogPolyMaps.tar_gz_report)){ + Array[File] updog_poly_results = select_all(updogPolyMaps.tar_gz_report) + } + if(defined(polyradPolyMaps.tar_gz_report)){ + Array[File] polyrad_poly_results = select_all(polyradPolyMaps.tar_gz_report) + } + if(defined(supermassaPolyMaps.tar_gz_report)){ + Array[File] supermassa_poly_results = select_all(supermassaPolyMaps.tar_gz_report) + } + + # Compress files + call reports.JointAllReports { + input: + SNPCallerMapsEmp = snpcaller_results, + updogMaps = updog_results, + polyradMaps = polyrad_results, + supermassaMaps = supermassa_results, + gusmapMapsEmp = gusmap_results, + SNPCallerPolyMapsEmp = snpcaller_poly_results, + updogPolyMaps = updog_poly_results, + polyradPolyMaps = polyrad_poly_results, + supermassaPolyMaps = supermassa_poly_results, + max_cores = max_cores, + ploidy = ploidy } - - File Empirical_results_sele = select_first([JointReports.EmpiricalReads_results, JointReportsPoly.EmpiricalReads_results]) output { - File EmpiricalReads_results = Empirical_results_sele + File EmpiricalReads_results = JointAllReports.EmpiricalReads_results } } diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md index 06756f9a..f85a429f 100644 --- a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md +++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md @@ -1,3 +1,24 @@ +# 1.4.0 + +* STACKs included +* support to pair-end reads +* defined defaults +* runtimes adapted to run with Caper +* [new tutorial](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html) + +# 1.3.0 + +* TASSEL 5.0 included +* releases include the input files template +* new parameter to control maximum ram memory used by GATK and TASSEL + +# 1.2.0 + +* Add MAPpoly to build linkage maps for polyploid species +* Run freebayes parallelizing in nodes according to chromosomes and cores splitting in genomic regions +* Adjust runtimes +* Add polyploid dataset for tests + # 1.0.0 Initial release diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.inputs.json b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.inputs.json new file mode 100644 index 00000000..9945e780 --- /dev/null +++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.inputs.json @@ -0,0 +1,49 @@ +{ + "EmpiricalReads.replaceAD": "Boolean (optional, default = true)", + "EmpiricalReads.filters": "String? (optional)", + "EmpiricalReads.Maps.run_updog": "Boolean (optional, default = true)", + "EmpiricalReads.filt_segr": "String? (optional)", + "EmpiricalReads.max_ram": "Int", + "EmpiricalReads.replaceADbyMissing": "String (optional, default = \"TRUE\")", + "EmpiricalReads.Maps.run_supermassa": "Boolean (optional, default = false)", + "EmpiricalReads.run_tassel": "Boolean (optional, default = true)", + "EmpiricalReads.chunk_size": "Int", + "EmpiricalReads.Maps.run_gusmap": "Boolean (optional, default = false)", + "EmpiricalReads.gatk_mchap": "Boolean (optional, default = false)", + "EmpiricalReads.rm_dupli": "Boolean (optional, default = false)", + "EmpiricalReads.SNPCalling.max_ram": "Int", + "EmpiricalReads.dataset": { + "parent2": "String", + "name": "String", + "parent1": "String", + "chromosome": "String", + "cross": "String", + "multiallelics": "Boolean" + }, + "EmpiricalReads.ploidy": "Int (optional, default = 2)", + "EmpiricalReads.n_chrom": "Int", + "EmpiricalReads.SNPCalling.chunk_size": "Int", + "EmpiricalReads.prob_thres": "Float? (optional)", + "EmpiricalReads.hardfilters": "Boolean (optional, default = true)", + "EmpiricalReads.run_freebayes": "Boolean (optional, default = true)", + "EmpiricalReads.enzyme": "String? (optional)", + "EmpiricalReads.max_cores": "Int", + "EmpiricalReads.references": { + "ref_fasta": "File", + "ref_dict": "File", + "ref_ann": "File", + "ref_sa": "File", + "ref_amb": "File", + "ref_pac": "File", + "ref_bwt": "File", + "ref_fasta_index": "File" + }, + "EmpiricalReads.Maps.run_polyrad": "Boolean (optional, default = true)", + "EmpiricalReads.SNPCalling.run_stacks": "Boolean (optional, default = true)", + "EmpiricalReads.Maps.filter_noninfo": "Boolean", + "EmpiricalReads.SNPCalling.pop_map": "File? (optional)", + "EmpiricalReads.samples_info": "File", + "EmpiricalReads.pair_end": "Boolean (optional, default = false)", + "EmpiricalReads.run_gatk": "Boolean (optional, default = true)" +} + diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl index dd3f408c..20f07cc4 100644 --- a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl +++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl @@ -13,6 +13,7 @@ workflow EmpiricalReads { ReferenceFasta references Dataset dataset Int max_cores + Int max_ram Int chunk_size Boolean rm_dupli = false Boolean gatk_mchap = false @@ -21,6 +22,9 @@ workflow EmpiricalReads { String replaceADbyMissing = "TRUE" # Boolean inside R Boolean run_gatk = true Boolean run_freebayes = true + Boolean run_tassel = true + Boolean pair_end = false + String? enzyme Int ploidy = 2 Int n_chrom String? filters @@ -41,8 +45,11 @@ workflow EmpiricalReads { replaceAD = replaceAD, run_gatk = run_gatk, run_freebayes = run_freebayes, + run_tassel = run_tassel, ploidy = ploidy, - n_chrom = n_chrom + n_chrom = n_chrom, + enzyme = enzyme, + pair_end = pair_end } call maps.Maps { diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md index a039a9c8..f51a21ca 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md @@ -1,3 +1,17 @@ +# 1.4.0 + +* STACKs included +* support to pair-end reads +* defined defaults +* runtimes adapted to run with Caper +* [new tutorial](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html) + +# 1.3.0 + +* TASSEL 5.0 included +* releases include the input files template +* new parameter to control maximum ram memory used by GATK and TASSEL + # 1.2.1 * Adjustments in runtime diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.inputs.json b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.inputs.json new file mode 100644 index 00000000..2688353a --- /dev/null +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.inputs.json @@ -0,0 +1,36 @@ +{ + "SNPCalling.enzyme": "String? (optional)", + "SNPCalling.P1": "String? (optional)", + "SNPCalling.max_cores": "Int", + "SNPCalling.pair_end": "Boolean (optional, default = false)", + "SNPCalling.ploidy": "Int", + "SNPCalling.rm_dupli": "Boolean (optional, default = false)", + "SNPCalling.GatkGenotyping.vcf_simu": "File? (optional)", + "SNPCalling.replaceAD": "Boolean (optional, default = false)", + "SNPCalling.run_gatk": "Boolean (optional, default = true)", + "SNPCalling.pop_map": "File? (optional)", + "SNPCalling.run_tassel": "Boolean (optional, default = true)", + "SNPCalling.n_chrom": "Int", + "SNPCalling.chunk_size": "Int", + "SNPCalling.GatkGenotyping.depth": "Int? (optional)", + "SNPCalling.samples_info": "File", + "SNPCalling.P2": "String? (optional)", + "SNPCalling.max_ram": "Int", + "SNPCalling.FreebayesGenotyping.vcf_simu": "File? (optional)", + "SNPCalling.gatk_mchap": "Boolean (optional, default = false)", + "SNPCalling.references": { + "ref_fasta": "File", + "ref_dict": "File", + "ref_ann": "File", + "ref_sa": "File", + "ref_amb": "File", + "ref_pac": "File", + "ref_bwt": "File", + "ref_fasta_index": "File" + }, + "SNPCalling.run_freebayes": "Boolean (optional, default = false)", + "SNPCalling.GatkGenotyping.seed": "Int? (optional)", + "SNPCalling.run_stacks": "Boolean (optional, default = true)", + "SNPCalling.hardfilters": "Boolean (optional, default = true)" +} + diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.wdl b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.wdl index 30a723a4..a501579a 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.wdl +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.wdl @@ -5,6 +5,8 @@ import "../../structs/dna_seq_structs.wdl" import "../../subworkflows/create_alignment_from_families_files.wdl" as fam import "../../subworkflows/gatk_genotyping.wdl" as gatk import "../../subworkflows/freebayes_genotyping.wdl" as freebayes +import "../../subworkflows/tassel_genotyping.wdl" as tassel +import "../../subworkflows/stacks_genotyping.wdl" as stacks workflow SNPCalling { @@ -13,6 +15,7 @@ workflow SNPCalling { File samples_info ReferenceFasta references Int max_cores + Int max_ram Int chunk_size Boolean rm_dupli = false String? P1 @@ -21,7 +24,12 @@ workflow SNPCalling { Boolean hardfilters = true Boolean replaceAD = false Boolean run_gatk = true - Boolean run_freebayes = true + Boolean run_freebayes = false + Boolean run_tassel = true + Boolean run_stacks = true + Boolean pair_end = false + File? pop_map + String? enzyme Int ploidy Int n_chrom } @@ -33,7 +41,8 @@ workflow SNPCalling { max_cores = max_cores, rm_dupli = rm_dupli, chunk_size = chunk_size, - gatk_mchap = gatk_mchap + gatk_mchap = gatk_mchap, + pair_end = pair_end } if(run_gatk){ @@ -46,6 +55,7 @@ workflow SNPCalling { ploidy = ploidy, program="gatk", max_cores = max_cores, + max_ram = max_ram, merged_bams = CreateAlignmentFromFamilies.merged_bam, P1 = P1, P2 = P2, @@ -68,15 +78,36 @@ workflow SNPCalling { } } - Array[Array[File]] vcfs_sele = select_all([GatkGenotyping.vcfs, FreebayesGenotyping.vcfs]) - Array[Array[String]] software_sele = select_all([GatkGenotyping.vcfs_software, FreebayesGenotyping.vcfs_software]) - Array[Array[String]] source_sele = select_all([GatkGenotyping.vcfs_counts_source, FreebayesGenotyping.vcfs_counts_source]) + if(run_tassel) { + call tassel.TasselGenotyping{ + input: + families_info = samples_info, + references = references, + max_cores = max_cores, + max_ram = max_ram, + enzyme = enzyme + } + } + + if(run_stacks) { + call stacks.StacksGenotyping { + input: + bams = CreateAlignmentFromFamilies.bam, + pop_map = pop_map, + max_cores = max_cores + } + } + + Array[Array[File]] vcfs_sele = select_all([GatkGenotyping.vcfs, FreebayesGenotyping.vcfs, TasselGenotyping.vcfs, StacksGenotyping.vcfs]) + Array[Array[String]] software_sele = select_all([GatkGenotyping.vcfs_software, FreebayesGenotyping.vcfs_software, TasselGenotyping.software_sele, StacksGenotyping.software_sele]) + Array[Array[String]] source_sele = select_all([GatkGenotyping.vcfs_counts_source, FreebayesGenotyping.vcfs_counts_source, TasselGenotyping.source_sele, StacksGenotyping.source_sele]) output { Array[File] vcfs = flatten(vcfs_sele) Array[String] vcfs_software = flatten(software_sele) Array[String] vcfs_counts_source = flatten(source_sele) File? gatk_multi_vcf = GatkGenotyping.vcf_multi + File? stacks_multiallelics = StacksGenotyping.stacks_multiallelics File? gatk_vcfEval = GatkGenotyping.vcfEval File? Plots = GatkGenotyping.Plots File? freebayes_vcfEval = FreebayesGenotyping.vcfEval diff --git a/pipelines/PreprocessingReads/PreprocessingReads.changelog.md b/pipelines/PreprocessingReads/PreprocessingReads.changelog.md index cb359355..fc66cc7a 100644 --- a/pipelines/PreprocessingReads/PreprocessingReads.changelog.md +++ b/pipelines/PreprocessingReads/PreprocessingReads.changelog.md @@ -1,3 +1,7 @@ +# 1.0.1 + +* runtimes adapted to run with Caper + # 1.0.0 Initial release diff --git a/pipelines/PreprocessingReads/PreprocessingReads.inputs.json b/pipelines/PreprocessingReads/PreprocessingReads.inputs.json new file mode 100644 index 00000000..67a40fd1 --- /dev/null +++ b/pipelines/PreprocessingReads/PreprocessingReads.inputs.json @@ -0,0 +1,10 @@ +{ + "PreprocessingReads.spec": { + "barcodes": "File? (optional)", + "adapter": "String", + "raw_dict": "File", + "enzyme": "String", + "enzyme2": "String? (optional)" + } +} + diff --git a/pipelines/SimulatedReads2Map/SimulatedReads2Map.changelog.md b/pipelines/SimulatedReads2Map/SimulatedReads2Map.changelog.md index 4f9ef674..08373c0c 100644 --- a/pipelines/SimulatedReads2Map/SimulatedReads2Map.changelog.md +++ b/pipelines/SimulatedReads2Map/SimulatedReads2Map.changelog.md @@ -1,3 +1,7 @@ +# 1.0.1 + +* runtimes adapted to run with Caper + # 1.0.0 Initial release diff --git a/pipelines/SimulatedReads2Map/SimulatedReads2Map.inputs.json b/pipelines/SimulatedReads2Map/SimulatedReads2Map.inputs.json new file mode 100644 index 00000000..ac689dc8 --- /dev/null +++ b/pipelines/SimulatedReads2Map/SimulatedReads2Map.inputs.json @@ -0,0 +1,49 @@ +{ + "SimulatedReads.filters": "String? (optional)", + "SimulatedReads.hardfilters": "Boolean (optional, default = true)", + "SimulatedReads.family": { + "cmBymb": "Float? (optional)", + "seed": "Int", + "doses": "File? (optional)", + "cross": "String", + "popsize": "Int", + "ploidy": "Int" + }, + "SimulatedReads.references": { + "ref_fasta": "File", + "ref_dict": "File", + "ref_ann": "File", + "ref_sa": "File", + "ref_amb": "File", + "ref_pac": "File", + "ref_bwt": "File", + "ref_fasta_index": "File" + }, + "SimulatedReads.n_chrom": "Int", + "SimulatedReads.chunk_size": "Int (optional, default = 5)", + "SimulatedReads.max_cores": "Int", + "SimulatedReads.number_of_families": "Int", + "SimulatedReads.gatk_mchap": "Boolean (optional, default = false)", + "SimulatedReads.global_seed": "Int", + "SimulatedReads.sequencing": { + "emp_vcf": "File", + "enzyme1": "String", + "pcr_cycles": "Int? (optional)", + "vcf_parent1": "String", + "library_type": "String", + "depth_parents": "Int", + "mapsize": "Int", + "rm_dupli": "String", + "insert_size": "Int? (optional)", + "depth": "Int", + "insert_size_dev": "Int? (optional)", + "ref_map": "File? (optional)", + "enzyme2": "String? (optional)", + "multiallelics": "String", + "vcf_parent2": "String", + "read_length": "Int? (optional)", + "emp_bam": "File? (optional)", + "chromosome": "String" + } +} + diff --git a/subworkflows/SimulatedSingleFamily.wdl b/subworkflows/SimulatedSingleFamily.wdl index 54d716ad..80718f4e 100644 --- a/subworkflows/SimulatedSingleFamily.wdl +++ b/subworkflows/SimulatedSingleFamily.wdl @@ -86,8 +86,9 @@ workflow SimulatedSingleFamily { call utils.ApplyRandomFiltersArray { input: vcfs = flatten(vcfs_sele), - vcfs_software = flatten(software_sele), - vcfs_counts_source = flatten(source_sele), + vcfs_SNPCall_software = flatten(software_sele), + vcfs_Counts_source = flatten(source_sele), + vcfs_GenoCall_software = range(length(flatten(software_sele))), filters = filters, chromosome = sequencing.chromosome } diff --git a/subworkflows/create_alignment_from_families_files.wdl b/subworkflows/create_alignment_from_families_files.wdl index b284de7f..79b03230 100644 --- a/subworkflows/create_alignment_from_families_files.wdl +++ b/subworkflows/create_alignment_from_families_files.wdl @@ -11,6 +11,7 @@ workflow CreateAlignmentFromFamilies { Int max_cores Boolean rm_dupli Boolean gatk_mchap + Boolean pair_end Int chunk_size } @@ -24,11 +25,17 @@ workflow CreateAlignmentFromFamilies { Array[Array[String]] sample_file = read_tsv(chunk) + if(pair_end) { + Array[File] pair = sample_file[3] + } + call alg.RunBwaAlignment { input: sampleName = sample_file[1], - reads = sample_file[0], + reads1 = sample_file[0], + reads2 = pair, libraries = sample_file[2], + pair_end = pair_end, references = references, max_cores = max_cores, rm_dupli = rm_dupli diff --git a/subworkflows/gatk_genotyping.wdl b/subworkflows/gatk_genotyping.wdl index 1a4375d0..4421d3d1 100644 --- a/subworkflows/gatk_genotyping.wdl +++ b/subworkflows/gatk_genotyping.wdl @@ -29,6 +29,7 @@ workflow GatkGenotyping { File? merged_bams String? P1 String? P2 + Int? max_ram } call chunk_lists.CreateChunksBam { @@ -49,7 +50,8 @@ workflow GatkGenotyping { reference_fai = references.ref_fasta_index, reference_dict = references.ref_dict, ploidy = ploidy, - chunk_size = chunk_size + chunk_size = chunk_size, + max_ram = max_ram } } @@ -63,7 +65,8 @@ workflow GatkGenotyping { reference_fasta=references.ref_fasta, reference_fai=references.ref_fasta_index, reference_dict=references.ref_dict, - interval = interval + interval = interval, + max_ram = max_ram } call gatk.GenotypeGVCFs { @@ -72,7 +75,8 @@ workflow GatkGenotyping { interval = interval, reference_fasta = references.ref_fasta, reference_fai = references.ref_fasta_index, - reference_dict = references.ref_dict + reference_dict = references.ref_dict, + max_ram = max_ram } } diff --git a/subworkflows/genotyping_empirical.wdl b/subworkflows/genotyping_empirical.wdl index 646b56ac..f08ddf62 100644 --- a/subworkflows/genotyping_empirical.wdl +++ b/subworkflows/genotyping_empirical.wdl @@ -23,6 +23,8 @@ workflow onemapMapsEmp { call utilsR.ReGenotyping { input: vcf_file = vcf_file, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, GenotypeCall_program = GenotypeCall_program, cross = cross, parent1 = parent1, @@ -31,15 +33,39 @@ workflow onemapMapsEmp { ploidy = ploidy } + call utils.ApplyRandomFiltersArray as FilterBi{ + input: + vcfs = [ReGenotyping.regeno_vcf], + vcfs_SNPCall_software = [SNPCall_program], + vcfs_Counts_source = [CountsFrom], + vcfs_GenoCall_software = [GenotypeCall_program + "_biallelic"], + chromosome = chromosome + } + if (multiallelics) { + + Array[File] array_vcf = select_all([multiallelics_file]) + + call utils.ApplyRandomFiltersArray as FilterMulti{ + input: + vcfs = array_vcf, + vcfs_SNPCall_software = [SNPCall_program], + vcfs_Counts_source = [CountsFrom], + vcfs_GenoCall_software = [GenotypeCall_program + "_multiallelic"], + chromosome = chromosome + } + call utils.JointMarkers { input: - biallelic_vcf = ReGenotyping.regeno_vcf, - multiallelic_vcf = multiallelics_file + biallelic_vcf = FilterBi.vcfs_filt[0], + multiallelic_vcf = FilterMulti.vcfs_filt[0], + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, + GenotypeCall_program = GenotypeCall_program } } - File updated_vcf = select_first([JointMarkers.merged_vcf, ReGenotyping.regeno_vcf]) + File updated_vcf = select_first([JointMarkers.merged_vcf, FilterBi.vcfs_filt[0]]) call utilsR.SetProbs { input: diff --git a/subworkflows/genotyping_simulated.wdl b/subworkflows/genotyping_simulated.wdl index b3f1f3bf..d67d5041 100644 --- a/subworkflows/genotyping_simulated.wdl +++ b/subworkflows/genotyping_simulated.wdl @@ -28,6 +28,8 @@ workflow onemapMaps { input: vcf_file = vcf_file, GenotypeCall_program = genotyping_program, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, cross = cross, parent1 = "P1", parent2 = "P2", @@ -39,7 +41,10 @@ workflow onemapMaps { call utils.JointMarkers { input: biallelic_vcf = ReGenotyping.regeno_vcf, - multiallelic_vcf = multiallelics_file + multiallelic_vcf = multiallelics_file, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, + GenotypeCall_program = genotyping_program } } diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl index 5b02c5a9..91dfefa2 100644 --- a/subworkflows/mappoly_maps_empirical.wdl +++ b/subworkflows/mappoly_maps_empirical.wdl @@ -23,6 +23,8 @@ workflow MappolyMapsEmp { input: vcf_file = vcf_file, GenotypeCall_program = GenotypeCall_program, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, cross = cross, parent1 = parent1, parent2 = parent2, diff --git a/subworkflows/snpcaller_maps_empirical.wdl b/subworkflows/snpcaller_maps_empirical.wdl index 0e24aa25..ddc5f680 100644 --- a/subworkflows/snpcaller_maps_empirical.wdl +++ b/subworkflows/snpcaller_maps_empirical.wdl @@ -25,7 +25,10 @@ workflow SNPCallerMapsEmp { call utils.JointMarkers { input: biallelic_vcf = vcf_file, - multiallelic_vcf = multiallelics_file + multiallelic_vcf = multiallelics_file, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, + GenotypeCall_program = GenotypeCall_program } } diff --git a/subworkflows/snpcaller_maps_simulated.wdl b/subworkflows/snpcaller_maps_simulated.wdl index 96563505..c0138bb0 100644 --- a/subworkflows/snpcaller_maps_simulated.wdl +++ b/subworkflows/snpcaller_maps_simulated.wdl @@ -27,7 +27,10 @@ workflow SNPCallerMaps { call utils.JointMarkers { input: biallelic_vcf = vcf_file, - multiallelic_vcf = multiallelics_file + multiallelic_vcf = multiallelics_file, + SNPCall_program = SNPCall_program, + CountsFrom = CountsFrom, + GenotypeCall_program = GenotypeCall_program } } diff --git a/subworkflows/stacks_genotyping.wdl b/subworkflows/stacks_genotyping.wdl new file mode 100644 index 00000000..baaf7c15 --- /dev/null +++ b/subworkflows/stacks_genotyping.wdl @@ -0,0 +1,35 @@ +version 1.0 + +import "../tasks/stacks.wdl" + + +workflow StacksGenotyping { + input { + Array[File] bams + File? pop_map + Int max_cores + } + + if(!defined(pop_map)){ + call stacks.CreatePopMapFile { + input: + bams = bams + } + } + + File pop_map_sele = select_first([pop_map, CreatePopMapFile.pop_map]) + + call stacks.RefMap { + input: + bams = bams, + pop_map = pop_map_sele, + max_cores = max_cores + } + + output { + Array[File] vcfs = [RefMap.stacks_vcf] + File stacks_multiallelics = RefMap.stacks_multiallelics + Array[String] software_sele = ["stacks"] + Array[String] source_sele = ["vcf"] + } +} \ No newline at end of file diff --git a/subworkflows/tassel_genotyping.wdl b/subworkflows/tassel_genotyping.wdl new file mode 100644 index 00000000..1e0d5ff1 --- /dev/null +++ b/subworkflows/tassel_genotyping.wdl @@ -0,0 +1,65 @@ +version 1.0 + +import "../structs/dna_seq_structs.wdl" +import "../tasks/tassel.wdl" +import "../tasks/BWA.wdl" +import "../tasks/utils.wdl" as utils + + +workflow TasselGenotyping { + input { + File families_info + ReferenceFasta references + Int max_cores + Int max_ram + String enzyme = "ApeKI" + } + + call tassel.transposeSamples { + input: + families_info = families_info + } + + Array[Array[String]] sample_file = read_tsv(transposeSamples.transpose_samples) + + call tassel.BarcodeFaker { + input: + fastq = sample_file[0], + FullSampleName = sample_file[2] + } + + call tassel.TasselBeforeAlign { + input: + fastq = BarcodeFaker.barcode_fastq, + enzyme = enzyme, + key_file = BarcodeFaker.key_file, + max_ram = max_ram + } + + call BWA.RunBwaAlignment { + input: + sampleName = ["tasselSample"], + reads1 = [TasselBeforeAlign.fastq_align], + pair_end = false, + libraries = ["tasselSample"], + references = references, + max_cores = max_cores, + rm_dupli = false + } + + call tassel.TasselAfterAlign { + input: + tassel_database = TasselBeforeAlign.tassel_database, + bam = RunBwaAlignment.bam[0], + max_ram = max_ram, + enzyme = enzyme, + key_file = BarcodeFaker.key_file, + fastq = BarcodeFaker.barcode_fastq + } + + output { + Array[File] vcfs = [TasselAfterAlign.tassel_vcf] + Array[String] software_sele = ["tassel"] + Array[String] source_sele = ["vcf"] + } +} \ No newline at end of file diff --git a/tasks/BWA.wdl b/tasks/BWA.wdl index 60749b68..86129fff 100644 --- a/tasks/BWA.wdl +++ b/tasks/BWA.wdl @@ -9,27 +9,38 @@ task RunBwaAlignment { input { Array[String] sampleName - Array[File] reads + Array[File] reads1 + Array[File]? reads2 Array[String] libraries ReferenceFasta references + Boolean pair_end Int max_cores String rm_dupli } - Int disk_size = ceil(size(reads, "GiB") * 2 + size(references.ref_fasta, "GiB") + 20) + Int disk_size = ceil(size(reads1, "GiB") * 2 + size(references.ref_fasta, "GiB") + 20) Int memory_size = 4000 * max_cores command <<< mkdir tmp - reads_list=( ~{sep=" " reads} ) + reads1_list=( ~{sep=" " reads1} ) + reads2_list=( ~{sep=" " reads2} ) lib_list=( ~{sep=" " libraries} ) sampleName_list=( ~{sep=" " sampleName}) BAMS=() - for index in ${!reads_list[*]}; do - echo "${reads_list[$index]} is in ${lib_list[$index]}" + for index in ${!reads1_list[*]}; do + echo "${reads1_list[$index]} is in ${lib_list[$index]}" + + if "~{pair_end}" == "true" + then + reads=( "${reads1_list[$index]} ${reads2_list[$index]}" ) + else + reads=( "${reads1_list[$index]}" ) + fi + bwa_header="@RG\tID:${sampleName_list[$index]}.${lib_list[$index]}\tLB:lib-${lib_list[$index]}\tPL:illumina\tSM:${sampleName_list[$index]}\tPU:FLOWCELL1.LANE1.${lib_list[$index]}" - /usr/gitc/./bwa mem -t ~{max_cores} -R "${bwa_header}" ~{references.ref_fasta} "${reads_list[$index]}" | \ + /usr/gitc/./bwa mem -t ~{max_cores} -R "${bwa_header}" ~{references.ref_fasta} "$reads" | \ java -jar /usr/gitc/picard.jar SortSam \ I=/dev/stdin \ O="${sampleName_list[$index]}.${lib_list[$index]}.sorted.bam" \ @@ -92,6 +103,7 @@ task RunBwaAlignment { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -100,7 +112,7 @@ task RunBwaAlignment { # Slurm job_name: "RunBwaAlignment" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -185,6 +197,7 @@ task RunBwaAlignmentSimu { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -193,7 +206,7 @@ task RunBwaAlignmentSimu { # Slurm job_name: "RunBwaAlignmentSimu" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -227,6 +240,7 @@ task CreateChunksFastq { runtime { docker: "ubuntu:20.04" + singularity:"docker://ubuntu:20.04" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -234,7 +248,7 @@ task CreateChunksFastq { # Slurm job_name: "CreateChunksFastq" mem:"~{memory_size}M" - time:"00:05:00" + time: 1 } meta { diff --git a/tasks/JointReports.wdl b/tasks/JointReports.wdl index 2d7a69aa..ae76562c 100644 --- a/tasks/JointReports.wdl +++ b/tasks/JointReports.wdl @@ -1,16 +1,171 @@ version 1.0 + +task JointAllReports{ + input{ + Array[File]? SNPCallerMapsEmp + Array[File]? updogMaps + Array[File]? polyradMaps + Array[File]? supermassaMaps + Array[File]? gusmapMapsEmp + Array[File]? SNPCallerPolyMapsEmp + Array[File]? updogPolyMaps + Array[File]? polyradPolyMaps + Array[File]? supermassaPolyMaps + Int max_cores + Int ploidy + } + + Int memory_size = 4000 + + command <<< + R --vanilla --no-save < 2){ + + SNPCallerPolyMapsEmp <- str_split("~{sep=';' SNPCallerPolyMapsEmp}", ";", simplify = TRUE) + updogPolyMaps <- str_split("~{sep=';' updogPolyMaps}", ";", simplify = TRUE) + polyradPolyMaps <- str_split("~{sep=';' polyradPolyMaps}", ";", simplify = TRUE) + supermassaPolyMaps <- str_split("~{sep=';' supermassaPolyMaps}", ";", simplify = TRUE) + + all_files <- c(SNPCallerPolyMapsEmp, updogPolyMaps, polyradPolyMaps, supermassaPolyMaps) + all_files <- all_files[-which(all_files == "")] + + system("mkdir results_all") + + for(i in 1:length(all_files)) { + system(paste("tar -xvf ", all_files[i])) + system("mv results/* results_all") + system("rm -r results") + } + + system("tar -czvf EmpiricalReads_results.tar.gz results_all") + + } else { + + SNPCallerMapsEmp <- str_split("~{sep=';' SNPCallerMapsEmp}", ";", simplify = TRUE) + updogMaps <- str_split("~{sep=';' updogMaps}", ";", simplify = TRUE) + polyradMaps <- str_split("~{sep=';' polyradMaps}", ";", simplify = TRUE) + supermassaMaps <- str_split("~{sep=';' supermassaMaps}", ";", simplify = TRUE) + gusmapMapsEmp <- str_split("~{sep=';' gusmapMapsEmp}", ";", simplify = TRUE) + + files <- list(SNPCallerMapsEmp, updogMaps, polyradMaps, supermassaMaps, gusmapMapsEmp) + files <- files[-which(files == "")] + + path_dir <- tempdir() + system(paste0("mkdir ", paste0(path_dir, c("/maps", "/filters", "/errors", "/times", "/RDatas"), collapse = " "))) + for(i in 1:length(files)){ + for(j in 1:length(files[[i]])){ + untar(files[[i]][[j]], exdir = path_dir) + list_files <- untar(files[[i]][[j]], exdir = path_dir, list = T) + system(paste0("mv ",path_dir, "/",list_files[1], "*_map_report.tsv.gz ", path_dir, "/maps")) + system(paste0("mv ",path_dir, "/",list_files[1], "*_times_report.tsv.gz ", path_dir, "/times")) + system(paste0("mv ",path_dir, "/",list_files[1], "*.RData ", path_dir, "/RDatas")) + if(!grepl("gusmap", list_files[1])){ + system(paste0("mv ",path_dir, "/",list_files[1], "*_filters_report.tsv.gz ", path_dir, "/filters")) + system(paste0("mv ",path_dir, "/",list_files[1], "*_errors_report.tsv.gz ", path_dir, "/errors")) + } + } + } + + files <- system(paste0("ls ", path_dir, "/maps/"), intern = T) + files <- paste0(path_dir, "/maps/", files) + maps <- vroom(files, num_threads = ~{max_cores}) + + files <- system(paste0("ls ", path_dir, "/filters/"), intern = T) + files <- paste0(path_dir, "/filters/", files) + filters <- vroom(files, num_threads = ~{max_cores}) + + files <- system(paste0("ls ", path_dir, "/errors/"), intern = T) + files <- paste0(path_dir, "/errors/", files) + errors <- vroom(files, num_threads = ~{max_cores}) + + files <- system(paste0("ls ", path_dir, "/times/"), intern = T) + files <- paste0(path_dir, "/times/", files) + times <- vroom(files, num_threads = ~{max_cores}) + + files <- system(paste0("ls ", path_dir, "/RDatas/"), intern = T) + files <- paste0(path_dir, "/RDatas/", files) + + all_RDatas <- list() + for(i in 1:length(files)){ + map_temp <- load(files[i]) + all_RDatas[[i]] <- get(map_temp) + } + + names(all_RDatas) <- basename(files) + + if(any(grepl("gusmap", names(all_RDatas)))){ + gusmap_RDatas <- all_RDatas[grep("gusmap", names(all_RDatas))] + save(gusmap_RDatas, file = "gusmap_RDatas.RData") + RDatas <- all_RDatas[-grep("gusmap", names(all_RDatas))] + } else RDatas <- all_RDatas + + # # Converting onemap sequencig objects to list. LargeList do not accept other class + # # Also because of this gusmap is separated, because the developers worked with enviroments, not classes + for(i in 1:length(RDatas)){ + class(RDatas[[i]]) <- "list" + } + + saveList(RDatas, file = "sequences_emp.llo", append=FALSE, compress=TRUE) + + new_names <- names(all_RDatas) + vroom_write(as.data.frame(new_names), "names.tsv.gz") + + # Outputs + vroom_write(errors, "data1_depths_geno_prob.tsv.gz", num_threads = ~{max_cores}) + vroom_write(maps, "data2_maps.tsv.gz", num_threads = ~{max_cores}) + vroom_write(filters, "data3_filters.tsv.gz", num_threads = ~{max_cores}) + vroom_write(times, "data4_times.tsv.gz", num_threads = ~{max_cores}) + + system("mkdir EmpiricalReads_results") + + if(any(grepl("gusmap", names(all_RDatas)))){ + system("mv gusmap_RDatas.RData sequences_emp.llo data1_depths_geno_prob.tsv.gz data2_maps.tsv.gz data3_filters.tsv.gz data4_times.tsv.gz names.tsv.gz EmpiricalReads_results") + } else system("mv sequences_emp.llo data1_depths_geno_prob.tsv.gz data2_maps.tsv.gz data3_filters.tsv.gz data4_times.tsv.gz names.tsv.gz EmpiricalReads_results") + + system("tar -czvf EmpiricalReads_results.tar.gz EmpiricalReads_results") + + } + + RSCRIPT + >>> + + runtime{ + docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" + cpu: max_cores + # Cloud + memory:"~{memory_size} MiB" + disks:"local-disk 2 GiB HDD" + # Slurm + job_name: "JointAllReports" + mem:"~{memory_size}M" + time: 2 + } + + output { + File EmpiricalReads_results = "EmpiricalReads_results.tar.gz" + } +} + + task JointReports{ input{ - Array[File?] SNPCaller - Array[File?] updog - Array[File?] polyrad - Array[File?] supermassa - Array[File?] gusmap + Array[File]? SNPCaller + Array[File]? updog + Array[File]? polyrad + Array[File]? supermassa + Array[File]? gusmap Int max_cores } - Int disk_size = ceil(size(SNPCaller, "GiB") * 1.5 + size(updog, "GiB") * 1.5 + size(polyrad, "GiB") * 1.5 + size(supermassa, "GiB") * 1.5 + size(gusmap, "GiB") * 1.5) + Int disk_size = 4000 Int memory_size = 4000 command <<< @@ -104,6 +259,7 @@ task JointReports{ runtime{ docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -111,7 +267,7 @@ task JointReports{ # Slurm job_name: "JointReports" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } output{ @@ -247,6 +403,7 @@ task JointReportsSimu { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -254,7 +411,7 @@ task JointReportsSimu { # Slurm job_name: "JointReports" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } meta { @@ -370,6 +527,7 @@ task JointTablesSimu{ runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -377,7 +535,7 @@ task JointTablesSimu{ # Slurm job_name: "JointTables" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } output { @@ -435,6 +593,7 @@ task JointReportsPoly{ runtime{ docker:"ubuntu:20.04" + singularity: "docker://ubuntu:20.04" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -442,7 +601,7 @@ task JointReportsPoly{ # Slurm job_name: "JointReports" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } output{ diff --git a/tasks/bcftools.wdl b/tasks/bcftools.wdl index 160fe460..b0f22141 100644 --- a/tasks/bcftools.wdl +++ b/tasks/bcftools.wdl @@ -13,7 +13,7 @@ task BiallelicNormalization { } Int disk_size = ceil(size(vcf_file, "GiB") + size(reference, "GiB") + 2) - Int memory_size = 7000 + Int memory_size = 4000 + ceil(size(vcf_file, "MiB") + size(reference, "MiB") + 2) command <<< @@ -30,6 +30,7 @@ task BiallelicNormalization { runtime { docker: "lifebitai/bcftools:1.10.2" + singularity:"docker://lifebitai/bcftools:1.10.2" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -37,7 +38,7 @@ task BiallelicNormalization { # Slurm job_name: "BiallelicNormalization" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { diff --git a/tasks/chunk_lists.wdl b/tasks/chunk_lists.wdl index 689126d7..0ad7c3d4 100644 --- a/tasks/chunk_lists.wdl +++ b/tasks/chunk_lists.wdl @@ -7,11 +7,14 @@ task SepareChunksFastqString { } Int disk_size = ceil(size(families_info, "GiB") * 2) - Int memory_size = 1000 + Int memory_size = 4000 + ceil(size(families_info, "MiB") * 2) command <<< R --vanilla --no-save < 3) { #pair-end + df <- df[,c(1,3,4,2)] + } split_df <- split.data.frame(df, df[,2]) n_chunk <- as.integer(length(split_df)/~{chunk_size}) @@ -22,6 +25,7 @@ task SepareChunksFastqString { for(i in 1:length(chunk_sep)){ df <- do.call(rbind, unlist(chunk_sep[i], recursive = F)) df <- t(df) + print(df) write.table(df, file = paste0("chunk_",i, ".txt"), quote = F, col.names = F, row.names = F, sep="\t") } @@ -31,6 +35,7 @@ task SepareChunksFastqString { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -38,7 +43,7 @@ task SepareChunksFastqString { # Slurm job_name: "SepareChunksIndividuals" mem:"~{memory_size}M" - time:"00:10:00" + time: 1 } meta { @@ -60,7 +65,7 @@ task SepareChunksFastq { } Int disk_size = ceil(size(fastqs, "GiB") * 2) - Int memory_size = 1000 + Int memory_size = 1000 + ceil(size(fastqs, "MiB") * 2) command <<< R --vanilla --no-save <>> runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -52,7 +54,7 @@ task HaplotypeCaller { # Slurm job_name: "HaplotypeCaller" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -75,12 +77,13 @@ task ImportGVCFs { File reference_fai File reference_dict String interval + Int max_ram = 26000 } Int disk_size = ceil(size(vcfs, "GiB") * 1.5 + size(reference_fasta, "GiB") * 1.5) - Int memory_max = 2300 - Int memory_min = 2000 - Int memory_size = 26000 + Int memory_max = ceil(max_ram - 1000) + Int memory_min = ceil(max_ram/3.85) + Int memory_size = max_ram command <<< set -euo pipefail @@ -103,6 +106,7 @@ task ImportGVCFs { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 4 # Cloud memory:"~{memory_size} MiB" @@ -110,7 +114,7 @@ task ImportGVCFs { # Slurm job_name: "ImportGVCFs" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -131,12 +135,13 @@ task GenotypeGVCFs { File reference_fai File reference_dict String interval + Int max_ram = 26000 } Int disk_size = ceil(size(reference_fasta, "GiB") * 1.5 + size(workspace_tar, "GiB") * 1.5) - Int memory_max = 2300 - Int memory_min = 2000 - Int memory_size = 26000 + Int memory_max = ceil(max_ram - 1000) + Int memory_min = ceil(max_ram/3.85) + Int memory_size = max_ram command <<< set -euo pipefail @@ -155,6 +160,7 @@ task GenotypeGVCFs { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 2 # Cloud memory:"~{memory_size} MiB" @@ -162,7 +168,7 @@ task GenotypeGVCFs { # Slurm job_name: "GenotypeGVCFs" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -185,8 +191,8 @@ task MergeVCFs { } Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 - Int memory_max = 2800 - Int memory_min = 2600 + Int memory_max = 2600 + Int memory_min = 2500 Int memory_size = 3000 command <<< @@ -199,6 +205,7 @@ task MergeVCFs { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -206,7 +213,7 @@ task MergeVCFs { # Slurm job_name: "MergeVCFs" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -231,7 +238,7 @@ task VariantsToTable { } Int disk_size = ceil(size(reference, "GB") + size(vcf_file, "GB") + 2) - Int memory_size = 2500 + Int memory_size = 2500 + ceil(size(reference, "MB") + size(vcf_file, "MB") + 2) command <<< @@ -250,6 +257,7 @@ task VariantsToTable { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -257,7 +265,7 @@ task VariantsToTable { # Slurm job_name: "VariantsToTable" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } meta { @@ -282,7 +290,7 @@ task VariantFiltration { } Int disk_size = ceil(size(vcf_file, "GB") + size(reference, "GB") + 1) - Int memory_size = 3000 + Int memory_size = 3000 + ceil(size(vcf_file, "MB") + size(reference, "MB") + 1) command <<< /usr/gitc/gatk4/./gatk VariantFiltration \ @@ -306,6 +314,7 @@ task VariantFiltration { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -313,7 +322,7 @@ task VariantFiltration { # Slurm job_name: "VariantFiltration" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -339,7 +348,7 @@ task VariantsToTableForHardFilteringSimulated { } Int disk_size = ceil(size(reference, "GB") + size(vcf_file, "GB") + size(simu_vcf, "GB") + 2) - Int memory_size = 3000 + Int memory_size = 3000 + ceil(size(reference, "MB") + size(vcf_file, "MB") + size(simu_vcf, "MB") + 2) command <<< /usr/gitc/./bgzip -c ~{simu_vcf} > ~{simu_vcf}.gz @@ -393,6 +402,7 @@ task VariantsToTableForHardFilteringSimulated { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity:"docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -400,7 +410,7 @@ task VariantsToTableForHardFilteringSimulated { # Slurm job_name: "VariantsToTable" mem:"~{memory_size}M" - time:"01:40:00" + time: 2 } meta { @@ -428,7 +438,7 @@ task VariantEval { } Int disk_size = ceil(size(vcf_norm, "GiB") + size(reference, "GiB") + size(vcf_simu, "GiB") + 2) - Int memory_size = 3000 + Int memory_size = 3000 + ceil(size(vcf_norm, "MiB") + size(reference, "MiB") + size(vcf_simu, "MiB") + 2) command <<< java -jar /usr/gitc/GATK35.jar -T VariantEval -R ~{reference} -eval ~{vcf_norm} ~{"-D " + vcf_simu} -EV ValidationReport -EV CountVariants -ploidy ~{ploidy} -o vcfEval.txt @@ -436,6 +446,7 @@ task VariantEval { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity: "docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -443,7 +454,7 @@ task VariantEval { # Slurm job_name: "VariantEval" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { diff --git a/tasks/gusmap.wdl b/tasks/gusmap.wdl index 471e3b4e..67290d9a 100644 --- a/tasks/gusmap.wdl +++ b/tasks/gusmap.wdl @@ -47,6 +47,7 @@ task GusmapReport { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -54,7 +55,7 @@ task GusmapReport { # Slurm job_name: "GusmapReport" mem:"~{memory_size}G" - time:"24:00:00" + time: 24 } meta { @@ -154,6 +155,7 @@ task GusmapReportForSimulated { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -161,7 +163,7 @@ task GusmapReportForSimulated { # Slurm job_name: "GusmapReport" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -200,6 +202,7 @@ task CompressGusmap { runtime { docker:"ubuntu:20.04" + singularity: "docker://ubuntu:20.04" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -207,7 +210,7 @@ task CompressGusmap { # Slurm job_name: "CompressGusmap" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -244,6 +247,7 @@ task CompressGusmapSimu { runtime { docker:"ubuntu:20.04" + singularity: "docker://ubuntu:20.04" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -251,7 +255,7 @@ task CompressGusmapSimu { # Slurm job_name: "CompressGusmap" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index f5bc1d8c..377d2d5b 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -136,6 +136,7 @@ task MappolyReport { runtime { docker:"cristaniguti/reads2map:0.0.5" + singularity: "docker://cristaniguti/reads2map:0.0.5" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -143,7 +144,7 @@ task MappolyReport { # Slurm job_name: "MappolyReport" mem:"~{memory_size}G" - time:"24:00:00" + time: 24 } meta { @@ -153,6 +154,6 @@ task MappolyReport { } output { - File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_results.tar.gz" + File results = "~{SNPCall_program}_~{GenotypeCall_program}Poly_~{CountsFrom}_results.tar.gz" } } \ No newline at end of file diff --git a/tasks/mchap.wdl b/tasks/mchap.wdl index e5486d46..57526da8 100644 --- a/tasks/mchap.wdl +++ b/tasks/mchap.wdl @@ -13,7 +13,7 @@ task OneMCHap { } Int disk_size = ceil(size(bams, "GiB") * 1.5 + size(bed, "GiB") * 1.5 + size(vcf_file, "GiB") * 1.5 + size(reference, "GiB")) - Int memory_size = 3000 + Int memory_size = 3000 + ceil(size(bams, "MiB") * 1.5 + size(bed, "MiB") * 1.5 + size(vcf_file, "MiB") * 1.5 + size(reference, "MiB")) command <<< @@ -46,6 +46,7 @@ task OneMCHap { runtime { docker: "cristaniguti/mchap:0.0.1" + singularity: "docker://cristaniguti/mchap:0.0.1" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -53,7 +54,7 @@ task OneMCHap { # Slurm job_name: "MCHap" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -77,7 +78,7 @@ task OneMCHap_recall { } Int disk_size = ceil(size(bams, "GiB") * 1.25 + size(vcf_file, "GiB") * 1.5) - Int memory_size = 3000 + Int memory_size = 3000 + ceil(size(bams, "MiB") * 1.25 + size(vcf_file, "MiB") * 1.5) command <<< @@ -104,6 +105,7 @@ task OneMCHap_recall { runtime { docker: "cristaniguti/mchap:0.0.1" + singularity: "docker://cristaniguti/mchap:0.0.1" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -111,7 +113,7 @@ task OneMCHap_recall { # Slurm job_name: "OneMCHap_recall" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { diff --git a/tasks/pedigree_simulator.wdl b/tasks/pedigree_simulator.wdl index 49d8ce3a..a1f56744 100644 --- a/tasks/pedigree_simulator.wdl +++ b/tasks/pedigree_simulator.wdl @@ -23,6 +23,7 @@ task RunPedigreeSimulator { runtime { docker: "cristaniguti/java-in-the-cloud:0.0.1" + singularity: "docker://cristaniguti/java-in-the-cloud:0.0.1" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -30,7 +31,7 @@ task RunPedigreeSimulator { # Slurm job_name: "RunPedigreeSimulator" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { diff --git a/tasks/pedigree_simulator_utils.wdl b/tasks/pedigree_simulator_utils.wdl index 669c7ffb..975d36bb 100644 --- a/tasks/pedigree_simulator_utils.wdl +++ b/tasks/pedigree_simulator_utils.wdl @@ -173,6 +173,7 @@ task CreatePedigreeSimulatorInputs { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -180,7 +181,7 @@ task CreatePedigreeSimulatorInputs { # Slurm job_name: "CreatePedigreeSimulatorInputs" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -277,6 +278,7 @@ task ConvertPedigreeSimulationToVcf { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -284,7 +286,7 @@ task ConvertPedigreeSimulationToVcf { # Slurm job_name: "ConvertPedigreeSimulationToVcf" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -349,6 +351,7 @@ task Vcf2PedigreeSimulator{ runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -356,7 +359,7 @@ task Vcf2PedigreeSimulator{ # Slurm job_name: "Vcf2PedigreeSimulator" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -395,6 +398,7 @@ task ProduceFamiliesSeeds { runtime { docker:"python:3.7" + singularity: "docker://python:3.7" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -402,7 +406,7 @@ task ProduceFamiliesSeeds { # Slurm job_name: "ProduceFamiliesSeeds" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } output { diff --git a/tasks/pirs.wdl b/tasks/pirs.wdl index 7fde5ef0..9d2fb76f 100644 --- a/tasks/pirs.wdl +++ b/tasks/pirs.wdl @@ -16,6 +16,7 @@ task GenerateAlternativeGenome { runtime { docker: "cristaniguti/pirs-ddrad-cutadapt:0.0.1" + singularity: "docker://cristaniguti/pirs-ddrad-cutadapt:0.0.1" cpu:1 # Cloud memory:"3000 MiB" @@ -23,7 +24,7 @@ task GenerateAlternativeGenome { # Slurm job_name: "GenerateAlternativeGenome" mem:"3000M" - time:"01:00:00" + time: 1 } meta { diff --git a/tasks/radinitio.wdl b/tasks/radinitio.wdl index af6a1183..d2cbbce5 100644 --- a/tasks/radinitio.wdl +++ b/tasks/radinitio.wdl @@ -99,6 +99,7 @@ task RADinitioSimulation { runtime { docker: "cristaniguti/radinitio:0.0.1" + singularity: "docker://cristaniguti/radinitio:0.0.1" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -108,7 +109,7 @@ task RADinitioSimulation { # Slurm job_name: "RADinitioSimulation" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { diff --git a/tasks/simuscop.wdl b/tasks/simuscop.wdl index 1f588f17..62320687 100644 --- a/tasks/simuscop.wdl +++ b/tasks/simuscop.wdl @@ -37,6 +37,7 @@ task SimuscopProfile { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -44,7 +45,7 @@ task SimuscopProfile { # Slurm job_name: "SimuscopProfile" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -114,6 +115,7 @@ task SimuscopSimulation { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -121,7 +123,7 @@ task SimuscopSimulation { # Slurm job_name: "SimuscopSimulation" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { diff --git a/tasks/stacks.wdl b/tasks/stacks.wdl index 23c8117a..e2fa5fb2 100644 --- a/tasks/stacks.wdl +++ b/tasks/stacks.wdl @@ -9,7 +9,7 @@ task ProcessRadTags { } Int disk_size = ceil(size(fq_files, "GiB") * 2) - Int memory_size = 6000 + Int memory_size = 4000 + ceil(size(fq_files, "MiB") * 2) command <<< mkdir raw process_radtags_results @@ -24,6 +24,7 @@ task ProcessRadTags { runtime { docker:"cristaniguti/stacks:0.0.1" + singularity: "docker://cristaniguti/stacks:0.0.1" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -31,10 +32,100 @@ task ProcessRadTags { # Slurm job_name: "ProcessRadTags" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } output { Array[File] seq_results = glob("process_radtags_results/*.fq.gz") } } + +task CreatePopMapFile { + input { + Array[File] bams + } + + command <<< + R --vanilla --no-save <>> + + runtime { + docker:"cristaniguti/r-samtools:latest" + singularity: "docker://cristaniguti/r-samtools:latest" + cpu:1 + # Cloud + memory:"1000 MiB" + disks:"local-disk 1 GiB HDD" + # Slurm + job_name: "CreatePopMapFile" + mem:"1000M" + time: 10 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Reads a SAM file to determine the potential positions of Tags against the reference genome. Identifies SNPs from the aligned tags" + } + + output { + File pop_map = "pop_map.tsv" + } +} + +task RefMap { + input { + Array[File] bams + File pop_map + Int max_cores + } + + Int disk_size = ceil(size(bams, "GiB") * 2) + Int memory_size = 4000 + ceil(size(bams, "MiB") * 2) + + command <<< + + mkdir aligned stacks + ln -s ~{sep=" " bams} aligned + + gstacks -I aligned/ -M ~{pop_map} -O stacks/ -t ~{max_cores} + + populations -P stacks/ -M ~{pop_map} --vcf -t ~{max_cores} + + >>> + + runtime { + docker:"cristaniguti/stacks:0.0.1" + singularity: "docker://cristaniguti/stacks:0.0.1" + cpu:1 + # Cloud + memory:"~{memory_size} MiB" + disks:"local-disk " + disk_size + " HDD" + # Slurm + job_name: "ProcessRadTags" + mem:"~{memory_size}M" + time: 10 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Reads a SAM file to determine the potential positions of Tags against the reference genome. Identifies SNPs from the aligned tags" + } + + output { + File stacks_vcf = "stacks/populations.snps.vcf" + File stacks_multiallelics = "stacks/populations.haps.vcf" + } +} diff --git a/tasks/tassel.wdl b/tasks/tassel.wdl new file mode 100644 index 00000000..daf78c3e --- /dev/null +++ b/tasks/tassel.wdl @@ -0,0 +1,323 @@ +version 1.0 + + +task transposeSamples { + input { + File families_info + } + + command <<< + R --vanilla --no-save <>> + + runtime { + docker: "cristaniguti/r-samtools:latest" + singularity: "docker://cristaniguti/r-samtools:latest" + cpu: 1 + # Cloud + memory:"1 GiB" + disks:"local-disk 3 GiB HDD" + # Slurm + job_name: "transposeSamples" + mem:"1G" + time: 4 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "transpose the sample info" + } + + output { + File transpose_samples = "transpose_sample.tsv" + } +} + +task BarcodeFaker { + input { + Array[File] fastq + Array[String] FullSampleName + } + + command <<< + R --vanilla --no-save < 0]) # subset the real and positive roots, and round up + if(r < 4){r <- 4} # keep a minimum barcode length of 4, probably unnecessary + + fake_barcodes <- barcode_inventor(fake_barcodes, r, fastqs) + + #make the fake perfect quality scores matching the barcode length + fake_quals <- rep(paste(rep("E", times = r), collapse = ""), times = length(fastqs)) + + + #make fake header files for FASTQ reads + #headers are totally fake (do not refer to input file) + #fake headers allows interoperability between TASSEL and demultiplexing software FASTQ formats + fake_flowcell_no <- seq(from = 1, to = length(fastqs), by = 1) + fake_headers <- paste("D00553R:56:", "C8B56ANXX", fake_flowcell_no, ":3:1101:1203:2037 1:N:0:3", sep = "") + fake_flowcells <- paste("C8B56ANXX", fake_flowcell_no, sep = "") + fake_lanes <- rep(3, times = length(fastqs)) + + #make fake file names that appropriately reference fake flowcell and lane + newfile_names <- paste(fake_flowcells, "_", "3","_","fastq.fastq", sep = "") + + #stop if files with the fake file names already exist + workdir <- getwd() + dirfil <- list.files(path = workdir, full.names = FALSE) + if(sum(newfile_names %in% dirfil) != 0){ + stop("Files in your working directory seem to have same names as those + output by this function. Please move them, delete them, or choose a new working directory.") + } + + #write a key of barcode, flowcell, lane and new file names + key <- cbind(Flowcell = fake_flowcells, Lane = fake_lanes, Barcode = fake_barcodes, FullSampleName) + write.table(key, "fakebarcodes_key.txt", row.names = FALSE, sep = "\t", quote = FALSE) + + #read 400 lines per file at a time, paste on the barcodes and quality scores, make unique flowcells, + #and write them out with unique names + for(i in 1:length(fastqs)){ + incon <- file(fastqs[i], open = "r") + outcon <- file(newfile_names[i], open = "w") + while(length(mylines <- readLines(incon, n = 400, warn = FALSE))){ + head_pos <- seq(1, length(mylines), by = 4) + seq_pos <- seq(2, length(mylines), by = 4) + qual_pos <- seq(4, length(mylines), by = 4) + + for(j in 1:length(mylines)){ + mylines[head_pos[j]] <- fake_headers[i] + mylines[seq_pos[j]] <- paste(fake_barcodes[i], mylines[seq_pos[j]], sep = "") + mylines[qual_pos[j]] <- paste(fake_quals[i], mylines[qual_pos[j]], sep = "") + } + writeLines(mylines, outcon) + } + print(paste("Finished writing File", i, "...", sep = " ")) + #close the connections + close(outcon, warn = FALSE) + close(incon, warn = FALSE) + } + } + + file_names <- "~{sep=',' fastq}" + file_names <- unlist(strsplit(file_names, ",")) + is_gz <- basename(file_names[1]) + if(grepl("gz", is_gz)) system(paste("gunzip", file_names)) + dir_name <- dirname(file_names[1]) + + sample_names <- "~{sep=',' FullSampleName}" + sample_names <- unlist(strsplit(sample_names, ",")) + + barcode_faker(fastq_dir = dir_name, FullSampleName = sample_names) + + RSCRIPT + >>> + + runtime { + docker: "cristaniguti/r-samtools:latest" + singularity: "docker://cristaniguti/r-samtools:latest" + cpu: 1 + # Cloud + memory:"2 GiB" + disks:"local-disk 3 GiB HDD" + # Slurm + job_name: "BarcodeFaker" + mem:"2G" + time: 4 + } + + meta { + author: "Marlee Labroo function adapted by Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Add fake barcode adaptors to fastq sequences to be input for TASSEL" + } + + output { + File key_file = "fakebarcodes_key.txt" + Array[File] barcode_fastq = glob("*.fastq") + } +} + +task TasselBeforeAlign { + input { + Array[File] fastq + String enzyme = "ApeKI" + File key_file + Int max_ram = 10000 + } + + Int disk_size = ceil(size(fastq, "GiB")) + Int memory_min = ceil(max_ram/2) + + command <<< + + mkdir fastqs + ln -s ~{sep=" " fastq} fastqs + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -GBSSeqToTagDBPlugin -e ~{enzyme} -i fastqs \ + -db GBSV2.db \ + -k ~{key_file} \ + -kmerLength 64 -minKmerL 20 -mnQS 20 -mxKmerNum 100000000 -endPlugin -runfork1 + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -TagExportToFastqPlugin -db GBSV2.db \ + -o tagsForAlign.fa.gz -c 1 -endPlugin -runfork1 + + >>> + + runtime { + docker: "cristaniguti/java-in-the-cloud:0.0.2" + singularity: "docker://cristaniguti/java-in-the-cloud:0.0.2" + cpu: 1 + # Cloud + memory:"~{max_ram} MiB" + disks:"local-disk " + disk_size + " HDD" + # Slurm + job_name: "GBSSeqToTagDBPlugin" + mem:"~{max_ram}M" + time: 24 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "GBSSeqToTagDBPlugin takes fastQ files as input, identifies tags and the taxa in which they appear, and stores this data to a local database. Retrieves distinct tags stored in the database and reformats them to a FASTQ file" + } + + output { + File tassel_database = "GBSV2.db" + File fastq_align = "tagsForAlign.fa.gz" + } +} + +task TasselAfterAlign { + input { + File tassel_database + File bam + Int max_ram = 10000 + String enzyme + File key_file + Array[File] fastq + } + + Int disk_size = ceil(size(tassel_database, "GiB")) + Int memory_min = ceil(max_ram/2) + + command <<< + + samtools view -h ~{bam} > file.sam + mv ~{tassel_database} . + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -SAMToGBSdbPlugin \ + -i file.sam \ + -db GBSV2.db \ + -aProp 0.0 -aLen 0 -endPlugin -runfork1 + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -DiscoverySNPCallerPluginV2 \ + -db GBSV2.db \ + -mnLCov 0.1 -mnMAF 0.01 -deleteOldData true -endPlugin -runfork1 + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -SNPQualityProfilerPlugin \ + -db GBSV2.db \ + -deleteOldData true -endPlugin -runfork1 + + mkdir fastqs + ln -s ~{sep=" " fastq} fastqs + + /usr/tassel/run_pipeline.pl -Xms~{memory_min}m -Xmx~{max_ram}m -fork1 -ProductionSNPCallerPluginV2 \ + -db GBSV2.db \ + -e ~{enzyme} -i fastqs \ + -k ~{key_file} \ + -kmerLength 64 \ + -o tassel.vcf -endPlugin -runfork1 + + >>> + + runtime { + docker: "cristaniguti/java-in-the-cloud:0.0.2" + singularity: "docker://cristaniguti/java-in-the-cloud:0.0.2" + cpu: 1 + # Cloud + memory:"~{max_ram} MiB" + disks:"local-disk " + disk_size + " HDD" + # Slurm + job_name: "SAMToGBSdbPlugin" + mem:"~{max_ram}M" + time: 24 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Reads a SAM file to determine the potential positions of Tags against the reference genome. Identifies SNPs from the aligned tags" + } + + output { + File tassel_vcf = "tassel.vcf" + } + +} diff --git a/tasks/utils.wdl b/tasks/utils.wdl index ee3fd387..e98428a5 100644 --- a/tasks/utils.wdl +++ b/tasks/utils.wdl @@ -6,7 +6,7 @@ task mergeVCFs { } Int disk_size = ceil(size(haplo_vcf, "GiB") * 2) - Int memory_size = 5000 + Int memory_size = ceil(3000 + size(haplo_vcf, "MiB") * 2) command <<< @@ -32,6 +32,7 @@ task mergeVCFs { runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -39,7 +40,7 @@ task mergeVCFs { # Slurm job_name: "mergeVCFs" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -62,7 +63,7 @@ task GenerateSampleNames { # TODO: probably a name like 'ReadSamplesNamesInVcf' } Int disk_size = ceil(size(simulated_vcf, "GiB") * 2) - Int memory_size = 1000 + Int memory_size = ceil(1000 + size(simulated_vcf, "MiB") * 2) command <<< export PATH=$PATH:/opt/conda/bin @@ -80,6 +81,7 @@ task GenerateSampleNames { # TODO: probably a name like 'ReadSamplesNamesInVcf' runtime { docker: "cristaniguti/miniconda-alpine:0.0.1" + singularity: "docker://cristaniguti/miniconda-alpine:0.0.1" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -87,7 +89,7 @@ task GenerateSampleNames { # TODO: probably a name like 'ReadSamplesNamesInVcf' # Slurm job_name: "GenerateSampleNames" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -112,7 +114,7 @@ task ApplyRandomFilters { } Int disk_size = ceil(size(gatk_vcf, "GiB") * 2 + size(freebayes_vcf, "GiB") * 2 + size(gatk_vcf_bam_counts, "GiB") * 2 + size(freebayes_vcf_bam_counts, "GiB") * 2) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(gatk_vcf, "MiB") * 2 + size(freebayes_vcf, "MiB") * 2 + size(gatk_vcf_bam_counts, "MiB") * 2 + size(freebayes_vcf_bam_counts, "MiB") * 2) command <<< # Required update to deal with polyploids @@ -128,6 +130,7 @@ task ApplyRandomFilters { runtime { docker:"cristaniguti/split_markers:0.0.1" + singularity: "docker://cristaniguti/split_markers:0.0.1" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -135,7 +138,7 @@ task ApplyRandomFilters { # Slurm job_name: "ApplyRandomFilters" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -155,34 +158,44 @@ task ApplyRandomFilters { task ApplyRandomFiltersArray { input{ Array[File] vcfs - Array[String] vcfs_software - Array[String] vcfs_counts_source + Array[String] vcfs_SNPCall_software + Array[String] vcfs_Counts_source + Array[String] vcfs_GenoCall_software String? filters String? chromosome } Int disk_size = ceil(size(vcfs, "GiB") * 2) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(vcfs, "MiB") * 2) command <<< vcfs=(~{sep = " " vcfs}) - vcfs_software=(~{sep=" " vcfs_software}) - vcfs_counts_source=(~{sep=" " vcfs_counts_source}) + vcfs_snp_software=(~{sep=" " vcfs_SNPCall_software}) + vcfs_counts_source=(~{sep=" " vcfs_Counts_source}) + vcfs_geno_software=(~{sep=" " vcfs_GenoCall_software}) for index in ${!vcfs[*]}; do - cp ${vcfs[$index]} temp.vcf - tabix -p vcf temp.vcf - bcftools view temp.vcf ~{filters} -r ~{chromosome} \ - -o vcf_filt_${vcfs_software[$index]}_${vcfs_counts_source[$index]}.vcf.gz - rm temp.vcf temp.vcf.tbi - echo vcf_filt_${vcfs_software[$index]}_${vcfs_counts_source[$index]}.vcf.gz >> outputs.txt + if [[ ${vcfs[$index]} != *.gz ]]; then + cp ${vcfs[$index]} temp.vcf + bgzip temp.vcf + else + cp ${vcfs[$index]} temp.vcf.gz + fi + + tabix -p vcf temp.vcf.gz + bcftools view temp.vcf.gz ~{filters} ~{" -r " + chromosome} \ + -o vcf_filt_${vcfs_snp_software[$index]}_${vcfs_counts_source[$index]}_${vcfs_geno_software[$index]}.vcf + bgzip vcf_filt_${vcfs_snp_software[$index]}_${vcfs_counts_source[$index]}_${vcfs_geno_software[$index]}.vcf + rm temp.vcf.gz temp.vcf.gz.tbi + echo vcf_filt_${vcfs_snp_software[$index]}_${vcfs_counts_source[$index]}_${vcfs_geno_software[$index]}.vcf.gz >> outputs.txt done >>> runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -190,7 +203,7 @@ task ApplyRandomFiltersArray { # Slurm job_name: "ApplyRandomFilters" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -211,7 +224,7 @@ task SplitMarkers { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(vcf_file, "MiB") * 2) command <<< bcftools view --max-alleles 2 --min-alleles 2 --output-type z --output-file biallelics.vcf.gz ~{vcf_file} @@ -220,6 +233,7 @@ task SplitMarkers { runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -227,7 +241,7 @@ task SplitMarkers { # Slurm job_name: "SplitMarkers" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -244,12 +258,15 @@ task SplitMarkers { task JointMarkers { input{ + String SNPCall_program + String CountsFrom + String GenotypeCall_program File biallelic_vcf File? multiallelic_vcf } Int disk_size = ceil(size(biallelic_vcf, "GiB") * 2 + size(multiallelic_vcf, "GiB") * 2) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(biallelic_vcf, "MiB") * 2 + size(multiallelic_vcf, "MiB") * 2) command <<< @@ -270,13 +287,14 @@ task JointMarkers { tabix -p vcf ~{multiallelic_vcf} bcftools view -S samples.txt ~{multiallelic_vcf} > multiallelic_sort.vcf - bcftools concat biallelic_sort.vcf multiallelic_sort.vcf --output merged.vcf - bgzip merged.vcf + bcftools concat biallelic_sort.vcf multiallelic_sort.vcf --output ~{SNPCall_program}_~{CountsFrom}_~{GenotypeCall_program}.vcf + bgzip ~{SNPCall_program}_~{CountsFrom}_~{GenotypeCall_program}.vcf >>> runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -284,7 +302,7 @@ task JointMarkers { # Slurm job_name: "JointMarkers" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -294,7 +312,7 @@ task JointMarkers { } output { - File merged_vcf = "merged.vcf.gz" + File merged_vcf = "~{SNPCall_program}_~{CountsFrom}_~{GenotypeCall_program}.vcf.gz" } } @@ -340,6 +358,7 @@ task ReplaceAD { runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -347,7 +366,7 @@ task ReplaceAD { # Slurm job_name: "ReplaceAD" mem:"~{memory_size}M" - time:"24:00:00" + time: 24 } meta { @@ -375,7 +394,7 @@ task Compress { } Int disk_size = ceil(size(RDatas, "GiB") + size(maps_report, "GiB") + size(times, "GiB") + size(filters_report, "GiB") + size(errors_report, "GiB")) - Int memory_size = 1000 + Int memory_size = ceil(2000 + size(RDatas, "MiB") + size(maps_report, "MiB") + size(times, "MiB") + size(filters_report, "MiB") + size(errors_report, "MiB")) command <<< @@ -391,6 +410,7 @@ task Compress { runtime { docker:"ubuntu:20.04" + singularity: "docker://ubuntu:20.04" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -398,7 +418,7 @@ task Compress { # Slurm job_name: "Compress" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -425,7 +445,7 @@ task GetMarkersPos { } Int disk_size = ceil(size(true_vcf, "GiB") * 1.5 + size(filtered_gatk_vcf, "GiB") * 1.5 + size(filtered_gatk_vcf_bamcounts, "GiB") + size(filtered_freebayes_vcf, "GiB") + size(filtered_freebayes_vcf_bamcounts, "GiB")) - Int memory_size = 5000 + Int memory_size = ceil(3000 + size(true_vcf, "MiB") * 1.5 + size(filtered_gatk_vcf, "MiB") * 1.5 + size(filtered_gatk_vcf_bamcounts, "MiB") + size(filtered_freebayes_vcf, "MiB") + size(filtered_freebayes_vcf_bamcounts, "MiB")) command <<< @@ -442,6 +462,7 @@ task GetMarkersPos { runtime { docker:"lifebitai/bcftools:1.10.2" + singularity: "docker://lifebitai/bcftools:1.10.2" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -449,7 +470,7 @@ task GetMarkersPos { # Slurm job_name: "GetMarkerPos" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -469,7 +490,7 @@ task MergeBams{ } Int disk_size = ceil(size(bam_files, "GiB") * 2) - Int memory_size = ceil(size(bam_files, "MiB") * 2) + Int memory_size = ceil(3000 + size(bam_files, "MiB") * 2) command <<< samtools merge merged.bam ~{sep=" " bam_files} @@ -477,6 +498,7 @@ task MergeBams{ runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity: "docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -484,7 +506,7 @@ task MergeBams{ # Slurm job_name: "MergeBams" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -505,7 +527,7 @@ task TarFiles { } Int disk_size = ceil(size(sequences, "GiB") * 2) - Int memory_size = 6000 + Int memory_size = ceil(4000 + size(sequences, "MiB") * 2) command <<< mkdir results @@ -515,6 +537,7 @@ task TarFiles { runtime { docker:"kfdrc/cutadapt" + singularity: "docker://kfdrc/cutadapt" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -522,7 +545,7 @@ task TarFiles { # Slurm job_name: "TarFiles" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } output { @@ -540,7 +563,7 @@ task VariantFiltration { } Int disk_size = ceil(size(vcf_file, "GB") + size(reference, "GB") + 1) - Int memory_size = 5000 + Int memory_size = ceil(5000 + size(vcf_file, "MiB") * 2) command <<< /usr/gitc/gatk4/./gatk VariantFiltration \ @@ -564,6 +587,7 @@ task VariantFiltration { runtime { docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" + singularity: "docker://us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -571,7 +595,7 @@ task VariantFiltration { # Slurm job_name: "VariantFiltration" mem:"~{memory_size}M" - time:"01:00:00" + time: 1 } meta { @@ -592,7 +616,7 @@ task BamToBed { } Int disk_size = ceil(size(merged_bams, "GiB") * 1.5) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(merged_bams, "MiB") * 1.5) command <<< bamToBed -i ~{merged_bams} > file.bed @@ -601,6 +625,7 @@ task BamToBed { runtime { docker: "biocontainers/bedtools:v2.27.1dfsg-4-deb_cv1" + singularity: "docker://biocontainers/bedtools:v2.27.1dfsg-4-deb_cv1" cpu: 1 # Cloud memory:"~{memory_size} MiB" @@ -608,7 +633,7 @@ task BamToBed { # Slurm job_name: "BamToBed" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { diff --git a/tasks/utilsR.wdl b/tasks/utilsR.wdl index e30cb8f8..a03061a9 100644 --- a/tasks/utilsR.wdl +++ b/tasks/utilsR.wdl @@ -45,6 +45,7 @@ task vcf2onemap { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity: "docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -52,7 +53,7 @@ task vcf2onemap { # Slurm job_name: "vcf2onemap" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -99,6 +100,7 @@ task FiltersReport { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -106,7 +108,7 @@ task FiltersReport { # Slurm job_name: "Filters" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -151,6 +153,7 @@ task FiltersReportEmp { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -158,7 +161,7 @@ task FiltersReportEmp { # Slurm job_name: "Filters" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -257,6 +260,7 @@ task MapsReport { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:4 # Cloud memory:"~{memory_size} MiB" @@ -264,7 +268,7 @@ task MapsReport { # Slurm job_name: "MapsReport" mem:"~{memory_size}M" - time:"48:00:00" + time: 48 } meta { @@ -346,6 +350,7 @@ task ErrorsReport { runtime { docker: "cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -353,7 +358,7 @@ task ErrorsReport { # Slurm job_name: "ErrorsReport" mem:"~{memory_size}M" - time:"05:00:00" + time: 5 } meta { @@ -410,6 +415,7 @@ task CheckDepths { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -417,7 +423,7 @@ task CheckDepths { # Slurm job_name: "CheckDepths" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -473,6 +479,7 @@ task MapsReportEmp { runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:max_cores # Cloud memory:"~{memory_size} MiB" @@ -480,7 +487,7 @@ task MapsReportEmp { # Slurm job_name: "MapsEmp" mem:"~{memory_size}M" - time:"48:00:00" + time: 48 } meta { @@ -500,6 +507,8 @@ task MapsReportEmp { # Exclusive for biallelic markers, input VCF file are normalized with -m-any task ReGenotyping { input { + String SNPCall_program + String CountsFrom String GenotypeCall_program File vcf_file String cross @@ -526,7 +535,7 @@ task ReGenotyping { stop("Invalid crosstype") } - out_vcf <- "regeno.vcf.gz" + out_vcf <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_regeno.vcf.gz" if (method == "updog") { updog_genotype_vcf(vcf="~{vcf_file}", @@ -560,7 +569,7 @@ task ReGenotyping { out_vcf = out_vcf) } - system("gunzip regeno.vcf.gz") + system("gunzip ~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_regeno.vcf.gz") RSCRIPT @@ -568,6 +577,7 @@ task ReGenotyping { runtime { docker:"cristaniguti/reads2map:0.0.5" + singularity:"docker://cristaniguti/reads2map:0.0.5" cpu: max_cores # Cloud memory:"~{memory_size} MiB" @@ -575,7 +585,7 @@ task ReGenotyping { # Slurm job_name: "ReGenotyping" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -585,7 +595,7 @@ task ReGenotyping { } output { - File regeno_vcf = "regeno.vcf" + File regeno_vcf = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_regeno.vcf" } } @@ -654,6 +664,7 @@ task SetProbs { >>> runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -661,7 +672,7 @@ task SetProbs { # Slurm job_name: "SetProbs" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -745,6 +756,7 @@ task SetProbsDefault { >>> runtime { docker:"cristaniguti/reads2map:0.0.4" + singularity:"docker://cristaniguti/reads2map:0.0.4" cpu:1 # Cloud memory:"~{memory_size} MiB" @@ -752,7 +764,7 @@ task SetProbsDefault { # Slurm job_name: "SetProbsDefault" mem:"~{memory_size}M" - time:"10:00:00" + time: 10 } meta { @@ -778,7 +790,7 @@ task RemoveNonInformative { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = 3000 + Int memory_size = ceil(3000 + size(vcf_file, "MiB") * 2) command <<< R --vanilla --no-save <