Skip to content

Commit

Permalink
Merge pull request #101 from Cristianetaniguti/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
Cristianetaniguti authored Jun 1, 2023
2 parents 2bc89df + 32dc71e commit 0674caa
Show file tree
Hide file tree
Showing 92 changed files with 1,597 additions and 372 deletions.
4 changes: 3 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ jobs:
- checkout

- run: pip3 install -r tests/requirements.txt
- run: pytest --git-aware tests/subworkflows/genotyping_empirical/freebayes/polyrad
- run: pytest --git-aware tests/subworkflows/gatk_genotyping/
- run: pytest --git-aware tests/subworkflows/stacks_genotyping/
- run: pytest --git-aware tests/subworkflows/freebayes_genotyping/

release-to-github:
<<: *machine_default
Expand Down
2 changes: 1 addition & 1 deletion .configurations/cromwell_mysql.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
backend {
default = SlurmSingularity
default = Local

providers {

Expand Down
9 changes: 8 additions & 1 deletion .dockerfiles/java-in-the-cloud/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM openjdk:7
FROM ibmjava:jre

COPY ./PedigreeSim.jar /usr/jars/

Expand All @@ -10,3 +10,10 @@ RUN apt-get update \
&& make \
&& mv vcf2diploid.jar /usr/jars/


RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \
&& unzip 5f68583d0f56.zip \
&& mkdir /usr/tassel \
&& mv tasseladmin-tassel-5-standalone-5f68583d0f56/* /usr/tassel

RUN apt-get install -y samtools
11 changes: 11 additions & 0 deletions .dockerfiles/tassel/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
FROM ibmjava:jre

RUN apt update \
&& apt install -y parallel vcftools

RUN conda install -y -c bioconda freebayes \
&& conda install -y -c bioconda vcflib

RUN wget https://bitbucket.org/tasseladmin/tassel-5-standalone/get/5f68583d0f56.zip \
&& unzip 5f68583d0f56.zip \
&& mv tasseladmin-tassel-5-standalone-5f68583d0f56/ /tassel
17 changes: 14 additions & 3 deletions .scripts/build_pipeline_release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ declare -r SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 &
source ${SCRIPT_DIR}/common.sh

declare ROOT_WDL=""
declare ROOT_JSON=""
declare VERSION=""
declare OUTPUT_DIR=""
declare ENV=""

declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".options.json"
declare -r ZIP_SUFFIX=".zip" WDL_SUFFIX=".wdl" OPTIONS_SUFFIX=".inputs.json"

function make_release() {
local -r rootWdl=${ROOT_WDL}
local -r rootJSON=${ROOT_JSON}
local -r wdlBasename=$(basename ${rootWdl} ${WDL_SUFFIX})

# Store the current working directory
Expand All @@ -37,7 +39,8 @@ function make_release() {
# Strip the paths out of the root WDL imports
sed -E '/http/! s/import "(.*)\/(.*\'${WDL_SUFFIX}')"/import "\2"/g' ${rootWdl} > ${outputVersionedPrefix}${WDL_SUFFIX}

write_options ${rootWdl} ${outputVersionedPrefix}
sed -E '/http/! s/import "(.*)\/(.*\'${OPTIONS_SUFFIX}')"/import "\2"/g' ${rootJSON} > ${outputVersionedPrefix}${OPTIONS_SUFFIX}
# write_options ${rootWdl} ${outputVersionedPrefix}

versioned_dependencies_zip=${outputVersionedPrefix}${ZIP_SUFFIX}

Expand Down Expand Up @@ -91,14 +94,15 @@ function show_help() {
echo ""
echo "Arguments:"
echo " -w The path to the workflow (.wdl) file"
echo " -j The path to the JSON input file template (.json) file"
echo " -v The version of the workflow (used in building the release name)"
echo " -o The directory into which the outputs will be written"
echo " -e The environment (dev, staging, or prod)"
echo " -h print this helpful message"
echo ""
}

while getopts "hw:v:o:e:" opt; do
while getopts "hw:j:v:o:e:" opt; do
case ${opt} in
h)
show_help
Expand All @@ -111,6 +115,13 @@ while getopts "hw:v:o:e:" opt; do
fi
ROOT_WDL=${OPTARG}
;;
j)
if [[ ! -f ${OPTARG} ]]; then
echo >&2 Error: ${OPTARG} does not exist!
exit 1
fi
ROOT_JSON=${OPTARG}
;;
v)
VERSION=${OPTARG}
;;
Expand Down
9 changes: 5 additions & 4 deletions .scripts/release_pipeline_to_github.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,10 @@ function build_and_release_to_github() {
local -r prerelease=${5}
local -r pipelineName=$(basename ${pipeline} .wdl)
local -r changelog=$(dirname ${pipeline})/${pipelineName}.changelog.md
local -r inputsName=$(dirname ${pipeline})/${pipelineName}.inputs.json

stderr "Building artifacts for ${releaseName}"
${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -e prod -v ${version} -o ${localReleaseDir}
${SCRIPT_DIR}/build_pipeline_release.sh -w ${pipeline} -j ${inputsName} -e prod -v ${version} -o ${localReleaseDir}

stderr "Building release notes for ${releaseName}"
local previousEntryStart
Expand Down Expand Up @@ -197,8 +198,8 @@ function upload_to_github_as_draft() {
${pipelineName} \
${version} \
${releaseName} \
"options.json" \
"application/json"
"inputs.json" \
"text/json"

local -r dependenciesZip=${localReleaseDir}/${pipelineName}/${pipelineName}_${version}.zip

Expand Down Expand Up @@ -308,4 +309,4 @@ localReleaseDir=$(mktemp -d)

trap cleanup_failed_release EXIT

release_to_github ${PIPELINE_TO_RELEASE} ${ENV}
release_to_github ${PIPELINE_TO_RELEASE} ${ENV}
40 changes: 26 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,38 +1,43 @@
[![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg)
[![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map)

## Reads2Map

Reads2Map presents a collection of [WDL workflows](https://openwdl.org/) to build linkage maps from sequencing reads. Each workflow release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases).
<p align="center">
<br>
<img src="https://github.com/Cristianetaniguti/Reads2Map/assets/7572527/6074320a-0eba-44b9-88e1-b89eda8aad70" width="450"/>
<br>
<p/>

The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building.
Reads2Map is a collection of [WDL workflows](https://openwdl.org/) designed to facilitate the construction of linkage maps from sequencing reads. You can find details of each workflow release on the Read2Map releases page, available [here](https://github.com/Cristianetaniguti/Reads2Map/releases).

By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build.
The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` uses the RADinitio software to simulate Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building.

![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png)
The SNP calling step in Reads2Map currently includes the popular tools: GATK, Freebayes, TASSEL, and STACKs. For genotype/dosage calling, the workflow utilizes tools like updog, polyRAD, and SuperMASSA. Lastly, Reads2Map leverages OneMap, GUSMap, and MAPpoly for linkage map construction.

For diploid data, you can visualize the results using the R package and shiny app called Reads2MapApp, available [here](https://github.com/Cristianetaniguti/Reads2MapApp). This package supports the visualization of linkage maps built using OneMap and GUSMap.

The Reads2Map workflows perform the SNP and genotype/dosage calling for your complete data set. However, they build the linkage map for only a single chromosome (reference genome is required) for each combination of software and parameters. The produced maps will probably still require improvements, but their characteristics will suggest which combination of SNP and genotype calling software and parameters you should use for your data. Once the pipeline is selected, you can input the respective VCF file in R and build the complete linkage map using OneMap or MAPpoly. Use [OneMap](https://statgen-esalq.github.io/tutorials/onemap/Outcrossing_Populations.html) or [MAPpoly](https://rpubs.com/mmollin/tetra_mappoly_vignette) tutorials for guidance on building and improving the linkage map for the complete dataset.

## How to use

Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines).
Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines).

To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you wish to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies).
In addition, we also suggest two wrappers: [cromwell-cli](https://github.com/lmtani/cromwell-cli) and [Caper](https://github.com/ENCODE-DCC/caper). Here is a tutorial on how to set up these tools, with an example running the EmpiricalReads2Map:

## Documentation
* [Setup and run Reads2Map workflows](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html)

To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you wish to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies).

Check the description of the inputs for the pipelines:

* [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Reads2Map/EmpiricalReads.html)

* [SimulatedReads2Map](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html)

Check how to evaluate the workflows results in Reads2MapApp Shiny:
Check how to evaluate the workflows' results in the Reads2MapApp Shiny app (so far only available for diploid datasets):

* [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp)

Once you have selected the best pipeline using a subset of your data, you can build a complete high-density linkage map:

* [A Guide to Build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html)

Check more information and examples of usage in:

* [Taniguti, C. H., Taniguti, L. M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., &#38; Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v2)
Expand All @@ -44,6 +49,8 @@ Check more information and examples of usage in:
- [ddRADseqTools](https://github.com/GGFHF/ddRADseqTools) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Set of applications useful to in silico design and testing of double digest RADseq (ddRADseq) experiments;
- [Freebayes](https://github.com/ekg/freebayes) in [Cristaniguti/freebayes:0.0.1](): Variant call step;
- [GATK](https://github.com/broadinstitute/gatk) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Variant call step using Haplotype Caller, GenomicsDBImport and GenotypeGVCFs;
- [TASSEL](https://www.maizegenetics.net/tassel) in [cristaniguti/java-in-the-cloud:0.0.2](https://hub.docker.com/repository/docker/cristaniguti/java-in-the-cloud/general): Variant Call
- [STACKs](https://catchenlab.life.illinois.edu/stacks/) in [cristaniguti/stacks:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/stacks/general): Variant Call
- [PedigreeSim](https://github.com/PBR/pedigreeSim?files=1) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Simulates progeny genotypes from parents' genotypes for different types of populations;
- [picard](https://github.com/broadinstitute/picard) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files;
- [pirs](https://github.com/galaxy001/pirs) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): To generate simulates paired-end reads from a reference genome;
Expand All @@ -63,4 +70,9 @@ Check more information and examples of usage in:
- [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data
- [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids
- [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results
- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations
- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations
- [MAPpoly](https://github.com/mmollina/MAPpoly) in [cristaniguti/reads2map:0.0.5](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Build linkage maps for autopolyploid species

### Funding

This work was partially supported by the National Council for Scientific and Technological Development (CNPq - 313269/2021-1); by USDA, National Institute of Food and Agriculture (NIFA), Specialty Crop Research Initiative (SCRI) project “Tools for Genomics Assisted Breeding in Polyploids: Development of a Community Resource” (Award No. 2020-51181-32156); and by the Bill and Melinda Gates Foundation (OPP1213329) project SweetGAINS.
6 changes: 6 additions & 0 deletions pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 1.2.4

* runtimes adapted to run with Caper
* perform the genotype calling with updog, SuperMASSA and polyRAD with complete data set (not only for the selected chromosome)
* [new tutorial](https://cristianetaniguti.github.io/Tutorials/Reads2Map/Setup_and_run_Reads2Map_workflows.html)

# 1.2.3

* Supermassa has smaller probability threshold (bugfix)
Expand Down
23 changes: 23 additions & 0 deletions pipelines/EmpiricalMaps/EmpiricalMaps.inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"Maps.dataset": {
"parent2": "String",
"name": "String",
"parent1": "String",
"chromosome": "String",
"cross": "String",
"multiallelics": "Boolean"
},
"Maps.max_cores": "Int",
"Maps.gatk_vcf_multi": "File? (optional)",
"Maps.gatk_mchap": "String",
"Maps.vcfs_counts_source": "Array[String]",
"Maps.filters": "String? (optional)",
"Maps.filt_segr": "String? (optional)",
"Maps.prob_thres": "Float? (optional)",
"Maps.ploidy": "Int",
"Maps.vcfs_software": "Array[String]",
"Maps.filter_noninfo": "Boolean",
"Maps.vcfs": "Array[File]",
"Maps.replaceADbyMissing": "String"
}

Loading

0 comments on commit 0674caa

Please sign in to comment.