Skip to content

Commit

Permalink
Merge pull request #101 from Cristianetaniguti/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
Cristianetaniguti authored Jun 1, 2023
2 parents 2bc89df + 32dc71e commit 0674caa
Show file tree
Hide file tree
Showing 92 changed files with 1,597 additions and 372 deletions.
4 changes: 3 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ jobs:
- checkout

- run: pip3 install -r tests/requirements.txt
- run: pytest --git-aware tests/subworkflows/genotyping_empirical/freebayes/polyrad
- run: pytest --git-aware tests/subworkflows/gatk_genotyping/
- run: pytest --git-aware tests/subworkflows/stacks_genotyping/
- run: pytest --git-aware tests/subworkflows/freebayes_genotyping/

release-to-github:
<<: *machine_default
Expand Down
2 changes: 1 addition & 1 deletion .configurations/cromwell_mysql.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
backend {
default = SlurmSingularity
default = Local

providers {

Expand Down
9 changes: 8 additions & 1 deletion .dockerfiles/java-in-the-cloud/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM openjdk:7
FROM ibmjava:jre

COPY ./PedigreeSim.jar /usr/jars/

Expand All @@ -10,3 +10,10 @@ RUN apt-get update \
&& make \
&& mv vcf2diploid.jar /usr/jars/


RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \
&& unzip 5f68583d0f56.zip \
&& mkdir /usr/tassel \
&& mv tasseladmin-tassel-5-standalone-5f68583d0f56/* /usr/tassel

RUN apt-get install -y samtools
11 changes: 11 additions & 0 deletions .dockerfiles/tassel/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
FROM ibmjava:jre

RUN apt update \
&& apt install -y parallel vcftools

RUN conda install -y -c bioconda freebayes \
&& conda install -y -c bioconda vcflib

RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \
&& unzip 5f68583d0f56.zip \
&& mv tasseladmin-tassel-5-standalone-5f68583d0f56/ /tassel
17 changes: 14 additions & 3 deletions .scripts/build_pipeline_release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ declare -r SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 &
source ${SCRIPT_DIR}/common.sh

declare ROOT_WDL=""
declare ROOT_JSON=""
declare VERSION=""
declare OUTPUT_DIR=""
declare ENV=""

declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".options.json"
declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".inputs.json"

function make_release() {
local -r rootWdl=${ROOT_WDL}
local -r rootJSON=${ROOT_JSON}
local -r wdlBasename=$(basename ${rootWdl} ${WDL_SUFFIX})

# Store the current working directory
Expand All @@ -37,7 +39,8 @@ function make_release() {
# Strip the paths out of the root WDL imports
sed -E '/http/! s/import "(.*)\/(.*\'${WDL_SUFFIX}')"/import "\2"/g' ${rootWdl} > ${outputVersionedPrefix}${WDL_SUFFIX}

write_options ${rootWdl} ${outputVersionedPrefix}
sed -E '/http/! s/import "(.*)\/(.*\'${OPTIONS_SUFFIX}')"/import "\2"/g' ${rootJSON} > ${outputVersionedPrefix}${OPTIONS_SUFFIX}
# write_options ${rootWdl} ${outputVersionedPrefix}

versioned_dependencies_zip=${outputVersionedPrefix}${ZIP_SUFFIX}

Expand Down Expand Up @@ -91,14 +94,15 @@ function show_help() {
echo ""
echo "Arguments:"
echo " -w The path to the workflow (.wdl) file"
echo " -j The path to the JSON input file template (.json) file"
echo " -v The version of the workflow (used in building the release name)"
echo " -o The directory into which the outputs will be written"
echo " -e The environment (dev, staging, or prod)"
echo " -h print this helpful message"
echo ""
}

while getopts "hw:v:o:e:" opt; do
while getopts "hw:j:v:o:e:" opt; do
case ${opt} in
h)
show_help
Expand All @@ -111,6 +115,13 @@ while getopts "hw:v:o:e:" opt; do
fi
ROOT_WDL=${OPTARG}
;;
j)
if [[ ! -f ${OPTARG} ]]; then
echo >&2 Error: ${OPTARG} does not exist!
exit 1
fi
ROOT_JSON=${OPTARG}
;;
v)
VERSION=${OPTARG}
;;
Expand Down
9 changes: 5 additions & 4 deletions .scripts/release_pipeline_to_github.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,10 @@ function build_and_release_to_github() {
local -r prerelease=${5}
local -r pipelineName=$(basename ${pipeline} .wdl)
local -r changelog=$(dirname ${pipeline})/${pipelineName}.changelog.md
local -r inputsName=$(dirname ${pipeline})/${pipelineName}.inputs.json

stderr "Building artifacts for ${releaseName}"
${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -e prod -v ${version} -o ${localReleaseDir}
${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -j ${inputsName} -e prod -v ${version} -o ${localReleaseDir}

stderr "Building release notes for ${releaseName}"
local previousEntryStart
Expand Down Expand Up @@ -197,8 +198,8 @@ function upload_to_github_as_draft() {
${pipelineName} \
${version} \
${releaseName} \
"options.json" \
"application/json"
"inputs.json" \
"text/json"

local -r dependenciesZip=${localReleaseDir}/${pipelineName}/${pipelineName}_${version}.zip

Expand Down Expand Up @@ -308,4 +309,4 @@ localReleaseDir=$(mktemp -d)

trap cleanup_failed_release EXIT

release_to_github ${PIPELINE_TO_RELEASE} ${ENV}
release_to_github ${PIPELINE_TO_RELEASE} ${ENV}
40 changes: 26 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,38 +1,43 @@
[![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg)
[![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map)

## Reads2Map

Reads2Map presents a collection of [WDL workflows](https://openwdl.org/) to build linkage maps from sequencing reads. Each workflow release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases).
<p align="center">
<br>
<img src="https://github.com/Cristianetaniguti/Reads2Map/assets/7572527/6074320a-0eba-44b9-88e1-b89eda8aad70" width="450"/>
<br>
<p/>

The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building.
Reads2Map is a collection of [WDL workflows](https://openwdl.org/) designed to facilitate the construction of linkage maps from sequencing reads. You can find details of each workflow release on the Read2Map releases page, available [here](https://github.com/Cristianetaniguti/Reads2Map/releases).

By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build.
The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` uses the RADinitio software to simulate Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building.

![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png)
The SNP calling step in Reads2Map currently includes the popular tools: GATK, Freebayes, TASSEL, and STACKs. For genotype/dosage calling, the workflow utilizes tools like updog, polyRAD, and SuperMASSA. Lastly, Reads2Map leverages OneMap, GUSMap, and MAPpoly for linkage map construction.

For diploid data, you can visualize the results using the R package and shiny app called Reads2MapApp, available [here](https://github.com/Cristianetaniguti/Reads2MapApp). This package supports the visualization of linkage maps built using OneMap and GUSMap.

The Reads2Map workflows perform the SNP and genotype/dosage calling for your complete data set. However, they build the linkage map for only a single chromosome (reference genome is required) for each combination of software and parameters. The produced maps will probably still require improvements, but their characteristics will suggest which combination of SNP and genotype calling software and parameters you should use for your data. Once the pipeline is selected, you can input the respective VCF file in R and build the complete linkage map using OneMap or MAPpoly. Use [OneMap](https://statgen-esalq.github.io/tutorials/onemap/Outcrossing_Populations.html) or [MAPpoly](https://rpubs.com/mmollin/tetra_mappoly_vignette) tutorials for guidance on building and improving the linkage map for the complete dataset.

## How to use

Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines).
Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines).

To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you wish to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies).
In addition, we also suggest two wrappers: [cromwell-cli](https://github.com/lmtani/cromwell-cli) and [Caper](https://github.com/ENCODE-DCC/caper). Here is a tutorial on how to set up these tools, with an example running the EmpiricalReads2Map:

## Documentation
* [Setup and run Reads2Map workflows](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html)

To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you wish to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies).

Check the description of the inputs for the pipelines:

* [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Reads2Map/EmpiricalReads.html)

* [SimulatedReads2Map](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html)

Check how to evaluate the workflows results in Reads2MapApp Shiny:
Check how to evaluate the workflows' results in the Reads2MapApp Shiny app (so far only available for diploid datasets):

* [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp)

Once you have selected the best pipeline using a subset of your data, you can build a complete high-density linkage map:

* [A Guide to Build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html)

Check more information and examples of usage in:

* [Taniguti, C. H., Taniguti, L. M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., &#38; Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v2)
Expand All @@ -44,6 +49,8 @@ Check more information and examples of usage in:
- [ddRADseqTools](https://github.com/GGFHF/ddRADseqTools) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Set of applications useful to in silico design and testing of double digest RADseq (ddRADseq) experiments;
- [Freebayes](https://github.com/ekg/freebayes) in [Cristaniguti/freebayes:0.0.1](): Variant call step;
- [GATK](https://github.com/broadinstitute/gatk) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Variant call step using Haplotype Caller, GenomicsDBImport and GenotypeGVCFs;
- [TASSEL](https://www.maizegenetics.net/tassel) in [cristaniguti/java-in-the-cloud:0.0.2](https://hub.docker.com/repository/docker/cristaniguti/java-in-the-cloud/general): Variant Call
- [STACKs](https://catchenlab.life.illinois.edu/stacks/) in [cristaniguti/stacks:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/stacks/general): Variant Call
- [PedigreeSim](https://github.com/PBR/pedigreeSim?files=1) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Simulates progeny genotypes from parents' genotypes for different types of populations;
- [picard](https://github.com/broadinstitute/picard) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files;
- [pirs](https://github.com/galaxy001/pirs) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): To generate simulates paired-end reads from a reference genome;
Expand All @@ -63,4 +70,9 @@ Check more information and examples of usage in:
- [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data
- [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids
- [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results
- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations
- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations
- [MAPpoly](https://github.com/mmollina/MAPpoly) in [cristaniguti/reads2map:0.0.5](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Build linkage maps for autopolyploid species

### Funding

This work was partially supported by the National Council for Scientific and Technological Development (CNPq - 313269/2021-1); by USDA, National Institute of Food and Agriculture (NIFA), Specialty Crop Research Initiative (SCRI) project “Tools for Genomics Assisted Breeding in Polyploids: Development of a Community Resource” (Award No. 2020-51181-32156); and by the Bill and Melinda Gates Foundation (OPP1213329) project SweetGAINS.
6 changes: 6 additions & 0 deletions pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 1.2.4

* runtimes adapted to run with Caper
* perform the genotype calling with updog, SuperMASSA and polyRAD with complete data set (not only for the selected chromosome)
* [new tutorial](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html)

# 1.2.3

* Supermassa has smaller probability threshold (bugfix)
Expand Down
23 changes: 23 additions & 0 deletions pipelines/EmpiricalMaps/EmpiricalMaps.inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"Maps.dataset": {
"parent2": "String",
"name": "String",
"parent1": "String",
"chromosome": "String",
"cross": "String",
"multiallelics": "Boolean"
},
"Maps.max_cores": "Int",
"Maps.gatk_vcf_multi": "File? (optional)",
"Maps.gatk_mchap": "String",
"Maps.vcfs_counts_source": "Array[String]",
"Maps.filters": "String? (optional)",
"Maps.filt_segr": "String? (optional)",
"Maps.prob_thres": "Float? (optional)",
"Maps.ploidy": "Int",
"Maps.vcfs_software": "Array[String]",
"Maps.filter_noninfo": "Boolean",
"Maps.vcfs": "Array[File]",
"Maps.replaceADbyMissing": "String"
}

Loading

0 comments on commit 0674caa

Please sign in to comment.