Merge pull request #66 from Cristianetaniguti/develop

Develop
Cristianetaniguti · Jan 21, 2023 · 05f5913 · 05f5913
2 parents acca101 + 09c04ef
commit 05f5913
Show file tree

Hide file tree

Showing 117 changed files with 43,398 additions and 311 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -24,8 +24,7 @@ jobs:
       - checkout
 
       - run: pip3 install -r tests/requirements.txt
-      - run: pytest --git-aware --tag freebayes tests/
-      - run: pytest --git-aware --tag gatk tests/
+      - run: pytest --git-aware tests/subworkflows/genotyping_empirical/freebayes/polyrad
 
   release-to-github:
     <<: *machine_default

diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,5 @@ figures_gabaritos
 **/__pycache__/**
 import_paths.txt
 releases/
+2023*
+_LAST
diff --git a/README.md b/README.md
@@ -63,4 +63,4 @@ Check more information and examples of usage in:
 - [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data
 - [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids
 - [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results
-- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations.
+- [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations
diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
@@ -1,3 +1,18 @@
+# 1.2.0
+
+* Add MAPpoly to build linkage maps for polyploid species
+* Adjust runtimes 
+* Add polyploid dataset for tests
+
+# 1.1.0
+
+2022-12-05
+
+* Include optional values to skip some of the sub-workflows
+* Accepts any number of VCF files as input
+* Dosage calling with updog, polyRAD and SuperMASSA adapted to polyploids
+* More tests and example data sets included
+
 # 1.0.0
 
 Initial release

diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
@@ -6,166 +6,230 @@ import "../../structs/population_structs.wdl"
 import "../../tasks/utils.wdl" as utils
 import "../../tasks/utilsR.wdl" as utilsR
 import "../../tasks/JointReports.wdl" as reports
+import "../../tasks/mappoly.wdl" as mappoly_task
 
 import "../../subworkflows/genotyping_empirical.wdl" as genotyping
 import "../../subworkflows/snpcaller_maps_empirical.wdl" as snpcaller
 import "../../subworkflows/gusmap_maps_empirical.wdl" as gusmap
+import "../../subworkflows/mappoly_maps_empirical.wdl" as mappoly_sub
 
 workflow Maps {
 
     input {
         Dataset dataset
+        Array[File] vcfs
+        Array[String] vcfs_software
+        Array[String] vcfs_counts_source
+        Boolean filter_noninfo
+        String replaceADbyMissing
         File? gatk_vcf_multi
         String gatk_mchap
-        File? gatk_vcf
-        File? freebayes_vcf
-        File? gatk_vcf_bam_counts
-        File? freebayes_vcf_bam_counts
         String? filters
         Int max_cores
+        Int ploidy
     }
 
     if (defined(filters)) {
-        call utils.ApplyRandomFilters {
+        call utils.ApplyRandomFiltersArray {
             input:
-                gatk_vcf = gatk_vcf,
-                freebayes_vcf = freebayes_vcf,
-                gatk_vcf_bam_counts = gatk_vcf_bam_counts,
-                freebayes_vcf_bam_counts = freebayes_vcf_bam_counts,
+                vcfs = vcfs,
+                vcfs_software = vcfs_software,
+                vcfs_counts_source = vcfs_counts_source,
                 filters = filters,
                 chromosome = dataset.chromosome
         }
     }
 
-    File filtered_gatk_vcf = select_first([ApplyRandomFilters.gatk_vcf_filt, gatk_vcf])
-    File filtered_gatk_vcf_bamcounts = select_first([ApplyRandomFilters.gatk_vcf_bam_counts_filt, gatk_vcf_bam_counts])
-    File filtered_freebayes_vcf = select_first([ApplyRandomFilters.freebayes_vcf_filt, freebayes_vcf])
-    File filtered_freebayes_vcf_bamcounts = select_first([ApplyRandomFilters.freebayes_vcf_bam_counts_filt, freebayes_vcf_bam_counts])
-
-    PopulationAnalysis gatk_processing = {"method": "gatk", "vcf": filtered_gatk_vcf, "bam": filtered_gatk_vcf_bamcounts}
-    PopulationAnalysis freebayes_processing = {"method": "freebayes", "vcf": filtered_freebayes_vcf, "bam": filtered_freebayes_vcf_bamcounts}
+    Array[File] filtered_vcfs = select_first([ApplyRandomFiltersArray.vcfs_filt, vcfs])
 
     # Re-Genotyping with updog, supermassa and polyrad; and building maps with onemap
-    scatter (analysis in [gatk_processing, freebayes_processing]) {
-
-        Map[String, File] vcfs = {"vcf": analysis.vcf, "bam": analysis.bam}
-
-        scatter (origin in ["vcf", "bam"]) {
+    scatter (idx in range(length(filtered_vcfs))) {
 
-            call utils.SplitMarkers as splitgeno {
-                 input:
-                    vcf_file = vcfs[origin]
-            }
-
-            # Suggestion to improve performance of SuperMASSA, polyRAD and updog
-            call utilsR.FilterSegregation {
+        call utils.SplitMarkers as splitgeno {
              input:
+                vcf_file = filtered_vcfs[idx]
+        }
+
+        # Suggestion to improve performance of SuperMASSA, polyRAD and updog
+        if(filter_noninfo){
+            call utilsR.RemoveNonInformative {
+            input:
                 vcf_file = splitgeno.biallelics,
                 parent1 = dataset.parent1,
-                parent2 = dataset.parent2
+                parent2 = dataset.parent2,
+                replaceADbyMissing = replaceADbyMissing
             }
+        }
+
+        File vcf_up = select_first([RemoveNonInformative.vcf_filtered, splitgeno.biallelics])
 
+        if(ploidy == 2) {
             call genotyping.onemapMapsEmp as updogMaps {
                 input:
-                    vcf_file = FilterSegregation.vcf_filtered,
-                    # vcf_file = splitgeno.biallelics,
-                    SNPCall_program = analysis.method,
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
                     GenotypeCall_program = "updog",
-                    CountsFrom = origin,
+                    CountsFrom = vcfs_counts_source[idx],
                     cross = dataset.cross,
                     parent1 = dataset.parent1,
                     parent2 = dataset.parent2,
                     chromosome = dataset.chromosome,
                     multiallelics = dataset.multiallelics,
                     multiallelics_file = splitgeno.multiallelics,
-                    max_cores = max_cores
+                    max_cores = max_cores,
+                    ploidy = ploidy
             }
 
             call genotyping.onemapMapsEmp as supermassaMaps {
                 input:
-                    vcf_file = FilterSegregation.vcf_filtered,
-                    # vcf_file = splitgeno.biallelics,
-                    SNPCall_program = analysis.method,
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
                     GenotypeCall_program = "supermassa",
-                    CountsFrom = origin,
+                    CountsFrom = vcfs_counts_source[idx],
                     cross = dataset.cross,
                     parent1 = dataset.parent1,
                     parent2 = dataset.parent2,
                     chromosome = dataset.chromosome,
                     multiallelics = dataset.multiallelics,
                     multiallelics_file = splitgeno.multiallelics,
-                    max_cores = max_cores
+                    max_cores = max_cores,
+                    ploidy = ploidy
             }
 
             call genotyping.onemapMapsEmp as polyradMaps {
                 input:
-                    vcf_file = FilterSegregation.vcf_filtered,
-                    # vcf_file = splitgeno.biallelics,
-                    SNPCall_program = analysis.method,
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
                     GenotypeCall_program = "polyrad",
-                    CountsFrom = origin,
+                    CountsFrom = vcfs_counts_source[idx],
                     cross = dataset.cross,
                     parent1 = dataset.parent1,
                     parent2 = dataset.parent2,
                     chromosome = dataset.chromosome,
                     multiallelics = dataset.multiallelics,
                     multiallelics_file = splitgeno.multiallelics,
+                    max_cores = max_cores,
+                    ploidy = ploidy
+            }
+
+            # Build maps with GUSMap
+            call gusmap.gusmapMapsEmp {
+                    input:
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
+                    CountsFrom = vcfs_counts_source[idx],
+                    GenotypeCall_program = "gusmap",
+                    parent1 = dataset.parent1,
+                    parent2 = dataset.parent2,
                     max_cores = max_cores
             }
+
+            if(vcfs_counts_source[idx] != "bam"){
+                call snpcaller.SNPCallerMapsEmp {
+                    input:
+                        vcf_file = splitgeno.biallelics,
+                        cross = dataset.cross,
+                        SNPCall_program = vcfs_software[idx],
+                        GenotypeCall_program = "SNPCaller",
+                        CountsFrom = vcfs_counts_source[idx],
+                        parent1 = dataset.parent1,
+                        parent2 = dataset.parent2,
+                        chromosome = dataset.chromosome,
+                        multiallelics = dataset.multiallelics,
+                        max_cores = max_cores,
+                        multiallelics_file = splitgeno.multiallelics,
+                        multiallelics_mchap = gatk_vcf_multi,
+                        mchap = gatk_mchap
+                }
+            }
         }
 
-       call utils.SplitMarkers as splitvcf {
-            input:
-              vcf_file = analysis.vcf
-       }
+        if(ploidy > 2){
+            call mappoly_sub.MappolyMapsEmp as updogPolyMaps {
+                input:
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
+                    GenotypeCall_program = "updog",
+                    CountsFrom = vcfs_counts_source[idx],
+                    cross = "F1",
+                    parent1 = dataset.parent1,
+                    parent2 = dataset.parent2,
+                    max_cores = max_cores,
+                    ploidy = ploidy
+            }
 
-       call utils.SplitMarkers as splitbam {
-            input:
-              vcf_file = analysis.bam
-       }
+            call mappoly_sub.MappolyMapsEmp as polyradPolyMaps {
+                input:
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
+                    GenotypeCall_program = "polyrad",
+                    CountsFrom = vcfs_counts_source[idx],
+                    cross = "F1",
+                    parent1 = dataset.parent1,
+                    parent2 = dataset.parent2,
+                    max_cores = max_cores,
+                    ploidy = ploidy
+            }
 
-       # Build maps with GUSMap
-       call gusmap.gusmapMapsEmp {
-            input:
-              vcf_file = splitvcf.biallelics,
-              vcf_bam_file = splitbam.biallelics,
-              SNPCall_program = analysis.method,
-              GenotypeCall_program = "gusmap",
-              parent1 = dataset.parent1,
-              parent2 = dataset.parent2,
-              max_cores = max_cores
+            call mappoly_sub.MappolyMapsEmp as supermassaPolyMaps {
+                input:
+                    vcf_file = vcf_up,
+                    SNPCall_program = vcfs_software[idx],
+                    GenotypeCall_program = "supermassa",
+                    CountsFrom = vcfs_counts_source[idx],
+                    cross = "F1",
+                    parent1 = dataset.parent1,
+                    parent2 = dataset.parent2,
+                    max_cores = max_cores,
+                    ploidy = ploidy
+            }
+
+            if(vcfs_counts_source[idx] != "bam"){
+                call mappoly_task.MappolyReport {
+                    input:
+                        vcf_file = vcf_up,
+                        SNPCall_program = vcfs_software[idx],
+                        GenotypeCall_program = "SNPCaller",
+                        CountsFrom = vcfs_counts_source[idx],
+                        parent1 = dataset.parent1,
+                        parent2 = dataset.parent2,
+                        max_cores = max_cores,
+			            ploidy = ploidy
+                }
+            }
         }
+    }
+
+    if(ploidy == 2){
+        Array[File] snpcaller_results = select_all(SNPCallerMapsEmp.tar_gz_report)
 
-        call snpcaller.SNPCallerMapsEmp {
+        # Compress files
+        call reports.JointReports {
             input:
-                vcf_file = splitvcf.biallelics,
-                cross = dataset.cross,
-                SNPCall_program = analysis.method,
-                GenotypeCall_program = "SNPCaller",
-                CountsFrom = "vcf",
-                parent1 = dataset.parent1,
-                parent2 = dataset.parent2,
-                chromosome = dataset.chromosome,
-                multiallelics = dataset.multiallelics,
-                max_cores = max_cores,
-                multiallelics_file = splitvcf.multiallelics,
-                multiallelics_mchap = gatk_vcf_multi,
-                mchap = gatk_mchap
+                SNPCaller = snpcaller_results,
+                updog = updogMaps.tar_gz_report,
+                polyrad = polyradMaps.tar_gz_report,
+                supermassa = supermassaMaps.tar_gz_report,
+                gusmap = gusmapMapsEmp.tar_gz_report,
+                max_cores = max_cores
         }
     }
 
-    # Compress files
-    call reports.JointReports {
-        input:
-            SNPCaller = SNPCallerMapsEmp.tar_gz_report,
-            updog = flatten(updogMaps.tar_gz_report),
-            polyrad = flatten(polyradMaps.tar_gz_report),
-            supermassa = flatten(supermassaMaps.tar_gz_report),
-            gusmap = gusmapMapsEmp.tar_gz_report,
-            max_cores = max_cores
+    if(ploidy > 2){
+        Array[File] snpcaller_results_poly = select_all(MappolyReport.results)
+
+        call reports.JointReportsPoly {
+            input:
+                SNPCaller = snpcaller_results_poly,
+                updog = updogPolyMaps.tar_gz_report,
+                polyrad = polyradPolyMaps.tar_gz_report,
+                supermassa = supermassaPolyMaps.tar_gz_report
+        }
     }
 
+    File Empirical_results_sele = select_first([JointReports.EmpiricalReads_results, JointReportsPoly.EmpiricalReads_results])
+
     output {
-        File EmpiricalReads_results = JointReports.EmpiricalReads_results
+        File EmpiricalReads_results = Empirical_results_sele
     }
 }
diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.wdl
@@ -18,6 +18,7 @@ workflow EmpiricalReads {
         Boolean gatk_mchap = false
         Boolean hardfilters = true
         Boolean replaceAD = true
+        String replaceADbyMissing = "TRUE" # Boolean inside R
         Boolean run_gatk = true
         Boolean run_freebayes = true
         Int ploidy = 2
@@ -45,14 +46,15 @@ workflow EmpiricalReads {
     call maps.Maps {
         input:
             dataset = dataset,
+            vcfs = SNPCalling.vcfs,
+            vcfs_software = SNPCalling.vcfs_software,
+            vcfs_counts_source = SNPCalling.vcfs_counts_source,
             gatk_vcf_multi = SNPCalling.gatk_multi_vcf,
             gatk_mchap = gatk_mchap,
-            gatk_vcf = SNPCalling.gatk_vcf,
-            freebayes_vcf = SNPCalling.freebayes_vcf,
-            gatk_vcf_bam_counts = SNPCalling.gatk_vcf_bam_count,
-            freebayes_vcf_bam_counts = SNPCalling.freebayes_vcf_bam_count,
             filters = filters,
-            max_cores = max_cores
+            max_cores = max_cores,
+            replaceADbyMissing = replaceADbyMissing,
+            ploidy = ploidy
     }
 
     output {