From 56135a9b085ea566e48d945a9f413bcfea95efed Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Sun, 10 Dec 2023 14:19:03 -0600
Subject: [PATCH 01/10] adjust ram

---
 tasks/mappoly.wdl | 20 +++++++-------------
 tasks/stacks.wdl  |  4 ++++
 tasks/utilsR.wdl  |  2 +-
 3 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index 9fbbd83..8ea71ed 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -16,7 +16,7 @@ task MappolyReport {
   }
 
   Int disk_size = ceil(size(vcf_file, "GiB") * 2)
-  Int memory_size = ceil(size(vcf_file, "MiB") + 2000)
+  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
 
   command <<<
     R --vanilla --no-save <<RSCRIPT
@@ -57,12 +57,8 @@ task MappolyReport {
         seq.filt <- rf_snp_filter(input.twopt = tpt, diagnostic.plot = FALSE, probs = c(0.05, 0.95))
         mat2 <- make_mat_mappoly(mat, seq.filt)
 
-        # Run MDS for ordering according to recombination fractions
-        seq_test_mds <- mds_mappoly(mat2)
-        seq_mds <- make_seq_mappoly(seq_test_mds)
-
         # Sequence with genomic order
-        geno_order <- get_genomic_order(seq_mds)
+        geno_order <- get_genomic_order(seq.filt)
         seq_geno_order <- make_seq_mappoly(geno_order)
 
         init.map.list <- framework_map(input.seq = seq_geno_order,
@@ -91,16 +87,14 @@ task MappolyReport {
              map_error[[i]] <- est_full_hmm_with_global_error(res[[2]][[1]][[iter]], error = as.numeric(global_errors[i]), verbose = FALSE)
              saveRDS(map_error, file= paste0("~{SNPCall_program}_~{GenotypeCall_program}",global_errors[i], "_~{CountsFrom}_map.rds"))
         }
-        map_prob <- est_full_hmm_with_prior_prob(res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE)
-
-        # Diagnostic graphics - overall from plot(dat)
-        # Heatmap of mds with plot(mat2, ord=seq_mds)
-        # Heatmap of genomic order order plot(mat2, ord = map_error$info$mk.names)
-        # Relation between mds and genome plot(seq_mds$genome.pos)
 
+	if(!is.null(dat[["geno"]])){
+ 	 map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE)
+    	 saveRDS(map_prob, file= "freebayes_SNPCaller_vcf_map.rds")
+        }
+	
         saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
         saveRDS(mat2, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
-        saveRDS(seq_mds, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_seq_mds.rds")
         saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds")
 
         system("mkdir results")
diff --git a/tasks/stacks.wdl b/tasks/stacks.wdl
index 9244c80..ff2b54b 100644
--- a/tasks/stacks.wdl
+++ b/tasks/stacks.wdl
@@ -103,6 +103,10 @@ task RefMap {
         
     populations -P stacks/ -M ~{pop_map} --vcf -t ~{max_cores}
 
+    # Fix sample names
+    sed  -i -e 's/.sorted.merged.filtered//g' stacks/populations.snps.vcf
+    sed  -i -e 's/.sorted.merged.filtered//g' stacks/populations.haps.vcf
+
   >>>
 
    runtime {
diff --git a/tasks/utilsR.wdl b/tasks/utilsR.wdl
index 3105971..94ad505 100644
--- a/tasks/utilsR.wdl
+++ b/tasks/utilsR.wdl
@@ -511,7 +511,7 @@ task ReGenotyping {
   }
 
   Int disk_size = ceil((size(vcf_file, "GiB") * 4))
-  Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 4000)
+  Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 5000*max_cores)
 
   command <<<
      R --vanilla --no-save <<RSCRIPT

From 8a730163b017c77a99a40f8c141f00e1f2bb7edd Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Thu, 14 Dec 2023 09:15:08 -0600
Subject: [PATCH 02/10] adjust ram and rm mds

---
 tasks/mappoly.wdl | 19 +++++++++----------
 tasks/tassel.wdl  |  7 ++++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index 8ea71ed..ebaf581 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -36,10 +36,10 @@ task MappolyReport {
         dat <- filter_missing(input.data = dat, type = "marker", 
                             filter.thres = 0.25, inter = FALSE)
 
-        dat <- filter_missing(dat, type = 'individual', filter.thres = 0.1, inter = FALSE)
+        dat <- filter_missing(dat, type = 'individual', filter.thres = 0.25, inter = FALSE)
 
         if("~{filt_segr}"){
-          pval.bonf <- 0.05/dat[[3]]
+          pval.bonf <- 0.05/dat[["n.mrk"]]
           mrks.chi.filt <- filter_segregation(dat, 
                                               chisq.pval.thres =  pval.bonf, 
                                               inter = FALSE)
@@ -64,8 +64,8 @@ task MappolyReport {
         init.map.list <- framework_map(input.seq = seq_geno_order,
                                        twopt = tpt,
                                        start.set = 5,
-                                       inflation.lim.p1 = 10,
-                                       inflation.lim.p2 = 10,
+                                       inflation.lim.p1 = 100,
+                                       inflation.lim.p2 = 100,
                                        verbose = FALSE)
 
         res <- update_framework_map(input.map.list = init.map.list,
@@ -75,7 +75,7 @@ task MappolyReport {
                                     init.LOD = 100,
                                     max.rounds = 3,
                                     size.rem.cluster = 3,
-                                    gap.threshold = 20,
+                                    gap.threshold = 30,
                                     verbose = FALSE)
 
         # Get last interaction
@@ -88,14 +88,13 @@ task MappolyReport {
              saveRDS(map_error, file= paste0("~{SNPCall_program}_~{GenotypeCall_program}",global_errors[i], "_~{CountsFrom}_map.rds"))
         }
 
-	if(!is.null(dat[["geno"]])){
- 	 map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE)
-    	 saveRDS(map_prob, file= "freebayes_SNPCaller_vcf_map.rds")
+        if(!is.null(dat[["geno"]])){
+         map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE)
+         saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds")
         }
-	
+        
         saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
         saveRDS(mat2, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
-        saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds")
 
         system("mkdir results")
         system("mv *.rds  results")
diff --git a/tasks/tassel.wdl b/tasks/tassel.wdl
index baf4c0c..6329335 100644
--- a/tasks/tassel.wdl
+++ b/tasks/tassel.wdl
@@ -169,7 +169,8 @@ task BarcodeFaker {
      is_gz <- basename(file_names[1])
      if(grepl(".gz", is_gz)) {
         for(i in 1:length(file_names)){
-           system(paste("gunzip", file_names[i]))
+           cat(paste("Uncompressing",file_names[i]))
+           system(paste("gunzip -f", file_names[i]))
         }
      }
      dir_name <- dirname(file_names[1])
@@ -191,8 +192,8 @@ task BarcodeFaker {
     disks:"local-disk 3 GiB HDD"
     # Slurm
     job_name: "BarcodeFaker"
-    mem:"2G"
-    time: 24
+    mem:"5G"
+    time: 50
   }
 
   meta {

From 0fd18ea1092917c79e3e8177ee01640d3f97a4a2 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Sat, 16 Dec 2023 12:51:25 -0600
Subject: [PATCH 03/10] resample mappoly

---
 .dockerfiles/reads2map/Dockerfile         |   2 +-
 pipelines/EmpiricalMaps/EmpiricalMaps.wdl |   6 +-
 subworkflows/mappoly_maps_empirical.wdl   |   4 +-
 tasks/mappoly.wdl                         | 404 +++++++++++++++-------
 4 files changed, 282 insertions(+), 134 deletions(-)

diff --git a/.dockerfiles/reads2map/Dockerfile b/.dockerfiles/reads2map/Dockerfile
index 63ea5ca..1063957 100644
--- a/.dockerfiles/reads2map/Dockerfile
+++ b/.dockerfiles/reads2map/Dockerfile
@@ -60,4 +60,4 @@ RUN Rscript -e 'remotes::install_github("Cristianetaniguti/onemap")'
 # Still privates
 RUN Rscript -e 'remotes::install_github("Cristianetaniguti/simuscopR")'
 RUN Rscript -e 'remotes::install_github("Cristianetaniguti/Reads2MapTools")'
-RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly")'
+RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly" )'
diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
index 590b780..087602c 100644
--- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
+++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
@@ -226,7 +226,7 @@ workflow Maps {
                         ploidy = ploidy,
                         prob_thres = prob_thres,
                         filt_segr = filt_segr,
-			global_errors = global_errors
+                        global_errors = global_errors
                 }
             }
 
@@ -243,7 +243,9 @@ workflow Maps {
                         ploidy = ploidy,
                         prob_thres = prob_thres,
                         filt_segr = filt_segr,
-                        global_errors = global_errors
+                        global_errors = global_errors,
+                        repetitions = repetitions,
+                        sample_size = sample_size
                 }
             }
         }
diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl
index 6c531f6..a01f26f 100644
--- a/subworkflows/mappoly_maps_empirical.wdl
+++ b/subworkflows/mappoly_maps_empirical.wdl
@@ -1,10 +1,10 @@
 version 1.0
 
+
 import "../tasks/utilsR.wdl" as utilsR
 import "../tasks/mappoly.wdl" as mappolyTasks
 
 workflow MappolyMapsEmp {
-
   input {
     File vcf_file
     String SNPCall_program
@@ -52,4 +52,4 @@ workflow MappolyMapsEmp {
       File tar_gz_report = MappolyReport.results
       File regeno_vcf = ReGenotyping.regeno_vcf
    }
-}
+}
\ No newline at end of file
diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index ebaf581..881c749 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -1,130 +1,276 @@
-version 1.0
-
-task MappolyReport {
-  input {
-    File vcf_file
-    String SNPCall_program
-    String GenotypeCall_program
-    String CountsFrom
-    String parent1
-    String parent2
-    Float prob_thres = 0.8 
-    Int max_cores
-    Int ploidy
-    String filt_segr = "TRUE"
-    Array[String] global_errors = ["0.05"]
-  }
-
-  Int disk_size = ceil(size(vcf_file, "GiB") * 2)
-  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
-
-  command <<<
-    R --vanilla --no-save <<RSCRIPT
-
-        library(mappoly)
-
-        if("~{GenotypeCall_program}" == "supermassa") prob.thres = ~{prob_thres} - ~{prob_thres}*0.3 else prob.thres = ~{prob_thres}
-        
-        dat <- read_vcf(file = "~{vcf_file}", 
-                        parent.1 = "~{parent1}", 
-                        parent.2 = "~{parent2}", 
-                        verbose = FALSE, 
-                        read.geno.prob = TRUE, 
-                        prob.thres = prob.thres, 
-                        ploidy = ~{ploidy})
-
-        dat <- filter_missing(input.data = dat, type = "marker", 
-                            filter.thres = 0.25, inter = FALSE)
-
-        dat <- filter_missing(dat, type = 'individual', filter.thres = 0.25, inter = FALSE)
-
-        if("~{filt_segr}"){
-          pval.bonf <- 0.05/dat[["n.mrk"]]
-          mrks.chi.filt <- filter_segregation(dat, 
-                                              chisq.pval.thres =  pval.bonf, 
-                                              inter = FALSE)
-
-          seq.init <- make_seq_mappoly(mrks.chi.filt)
-        } else {
-          seq.init <- make_seq_mappoly(dat, "all")
-        }
-        
-        # Estimate two-point recombination fraction
-        tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores})
-        mat <- rf_list_to_matrix(input.twopt = tpt)
-
-        # Filter markers by recombination fraction values
-        seq.filt <- rf_snp_filter(input.twopt = tpt, diagnostic.plot = FALSE, probs = c(0.05, 0.95))
-        mat2 <- make_mat_mappoly(mat, seq.filt)
-
-        # Sequence with genomic order
-        geno_order <- get_genomic_order(seq.filt)
-        seq_geno_order <- make_seq_mappoly(geno_order)
-
-        init.map.list <- framework_map(input.seq = seq_geno_order,
-                                       twopt = tpt,
-                                       start.set = 5,
-                                       inflation.lim.p1 = 100,
-                                       inflation.lim.p2 = 100,
-                                       verbose = FALSE)
-
-        res <- update_framework_map(input.map.list = init.map.list,
-                                    input.seq = seq_geno_order,
-                                    twopt = tpt,
-                                    thres.twopt = 5,
-                                    init.LOD = 100,
-                                    max.rounds = 3,
-                                    size.rem.cluster = 3,
-                                    gap.threshold = 30,
-                                    verbose = FALSE)
-
-        # Get last interaction
-        iter <- length(res[[2]][[1]])
-
-        global_errors <- unlist(strsplit("~{sep="," global_errors}", ","))
-        map_error <- list()
-        for(i in 1:length(global_errors)){
-             map_error[[i]] <- est_full_hmm_with_global_error(res[[2]][[1]][[iter]], error = as.numeric(global_errors[i]), verbose = FALSE)
-             saveRDS(map_error, file= paste0("~{SNPCall_program}_~{GenotypeCall_program}",global_errors[i], "_~{CountsFrom}_map.rds"))
-        }
-
-        if(!is.null(dat[["geno"]])){
-         map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE)
-         saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds")
-        }
-        
-        saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
-        saveRDS(mat2, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
-
-        system("mkdir results")
-        system("mv *.rds  results")
-
-        system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results"))
-
-    RSCRIPT
-
-  >>>
-
-  runtime {
-    docker:"cristaniguti/reads2map:0.1.0"
-    singularity: "docker://cristaniguti/reads2map:0.1.0"
-    cpu: max_cores
-    # Cloud
-    memory:"~{memory_size} MiB"
-    disks:"local-disk " + disk_size + " HDD"
-    # Slurm
-    job_name: "MappolyReport"
-    mem:"~{memory_size}G"
-    time: 24
-  }
-
-  meta {
-        author: "Cristiane Taniguti"
-        email: "chtaniguti@tamu.edu"
-        description: "Build linkage map using MAPpoly"
-  }
-
-  output {
-    File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz"
-  }
+version 1.0
+
+task MappolyReport {
+  input {
+    File vcf_file
+    String SNPCall_program
+    String GenotypeCall_program
+    String CountsFrom
+    String parent1
+    String parent2
+    Float prob_thres = 0.8
+    Int? repetitions = 100
+    Int? sample_size = 30
+    Int max_cores
+    Int ploidy
+    String filt_segr = "TRUE"
+    Array[String] global_errors = ["0.05"]
+  }
+
+  Int disk_size = ceil(size(vcf_file, "GiB") * 2)
+  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
+
+  command <<<
+    R --vanilla --no-save <<RSCRIPT
+      library(mappoly)
+      library(parallel)
+      
+      run_tests <- function(seed, s_all, sample_size, dat){
+        set.seed(seed[1])
+        library(mappoly)
+        res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL
+        map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed))
+        for(i in 1:length(seed)){
+          tryCatch({
+            mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size)
+            s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id))
+            
+            ## Parent 1
+            s1<-make_seq_mappoly(s, info.parent = "p1")
+            tpt <- est_pairwise_rf(s1)
+            map <- est_rf_hmm_sequential(input.seq = s1,
+                                         start.set = 5,
+                                         thres.twopt = 10,
+                                         thres.hmm = 10,
+                                         extend.tail = 30,
+                                         info.tail = TRUE,
+                                         twopt = tpt,
+                                         phase.number.limit = 10,
+                                         reestimate.single.ph.configuration = TRUE,
+                                         tol = 10e-2,
+                                         tol.final = 10e-4, 
+                                         verbose = FALSE)
+            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
+            
+            # Global error
+            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+            map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+            x<-summary_maps(list(map.err.p1[[i]]))
+            res.p1 <-rbind(res.p1,x[1,])
+            
+            # Prob error
+            if(!is.null(dat[["geno"]])){
+              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
+              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+              map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p1[[i]]))
+              res.prob.p1 <-rbind(res.prob.p1,x[1,])
+            }
+            
+            # Parent 2
+            s2<-make_seq_mappoly(s, info.parent = "p2")
+            tpt <- est_pairwise_rf(s2)
+            map <- est_rf_hmm_sequential(input.seq = s2,
+                                         start.set = 5,
+                                         thres.twopt = 10,
+                                         thres.hmm = 10,
+                                         extend.tail = 30,
+                                         info.tail = TRUE,
+                                         twopt = tpt,
+                                         phase.number.limit = 10,
+                                         reestimate.single.ph.configuration = TRUE,
+                                         tol = 10e-2,
+                                         tol.final = 10e-4, 
+                                         verbose = FALSE)
+            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
+            
+            # Global error
+            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+            map.err.p2[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+            x<-summary_maps(list(map.err.p2[[i]]))
+            res.p2<-rbind(res.p2,x[1,])
+            
+            # Prob error
+            if(!is.null(dat[["geno"]])){
+              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
+              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+              map.prob.p2[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p2[[i]]))
+              res.prob.p2 <-rbind(res.prob.p2,x[1,])
+            }
+          }, error=function(e){})
+        }
+        return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, 
+                    res.p1, res.p2, res.prob.p1, res.prob.p2))
+      }
+      
+      if("~{GenotypeCall_program}" == "supermassa") 
+        prob.thres = ~{prob_thres} - ~{prob_thres}*0.3 else prob.thres = ~{prob_thres}
+      
+      dat <- read_vcf(file = "~{vcf_file}", 
+                      parent.1 = "~{parent1}", 
+                      parent.2 = "~{parent2}", 
+                      verbose = FALSE, 
+                      read.geno.prob = TRUE, 
+                      prob.thres = prob.thres, 
+                      ploidy = ~{ploidy})
+      
+      info <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                         step = "raw", 
+                         n.markers = dat[["n.mrk"]], 
+                         mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
+                         n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                         n.ind = dat[["n.ind"]])
+      
+      dat <- filter_missing(input.data = dat, type = "marker", 
+                            filter.thres = 0.25, inter = FALSE)
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "miss filtered", 
+                              n.markers = dat[["n.mrk"]], 
+                              mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      
+      info <- rbind(info, info_temp)
+      
+      
+      if("TRUE"){
+        pval.bonf <- 0.05/dat[["n.mrk"]]
+        mrks.chi.filt <- filter_segregation(dat, 
+                                            chisq.pval.thres =  pval.bonf, 
+                                            inter = FALSE)
+        
+        seq.init <- make_seq_mappoly(mrks.chi.filt)
+      } else {
+        seq.init <- make_seq_mappoly(dat, "all")
+      }
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "segr filtered", 
+                              n.markers = length(seq.init[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% seq.init[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(seq.init[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      
+      info <- rbind(info, info_temp)
+      
+      sp1<-make_seq_mappoly(seq.init, info.parent = "p1")
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "p1", 
+                              n.markers = length(sp1[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp1[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp1[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      info <- rbind(info, info_temp)
+      
+      sp2<-make_seq_mappoly(seq.init, info.parent = "p2")
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "p2", 
+                              n.markers = length(sp2[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp2[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp2[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      info <- rbind(info, info_temp)
+      
+      # Estimate two-point recombination fraction
+      tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores})
+      mat <- rf_list_to_matrix(input.twopt = tpt)
+      
+      sample_size <- if(length(seq.init[["seq.mrk.names"]]) <= ~{sample_size}) length(seq.init[["seq.mrk.names"]]) else ~{sample_size}
+      seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores}))
+      
+      clust <- makeCluster(~{max_cores})
+      clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests"))
+      results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, 
+                                                               sample_size = sample_size, dat = dat))
+      stopCluster(clust)
+      
+      map.err.p1 <- sapply(results, "[[", 1)
+      idx <- sapply(map.err.p1, is.null)
+      map.err.p1 <- map.err.p1[which(!idx)]
+      
+      map.err.p2 <- sapply(results, "[[", 2)
+      idx <- sapply(map.err.p2, is.null)
+      map.err.p2 <- map.err.p2[which(!idx)]
+      
+      if(!is.null(dat[["geno"]])){
+        map.prob.p1 <- sapply(results, "[[", 3)
+        idx <- sapply(map.prob.p1, is.null)
+        map.prob.p1 <- map.prob.p1[which(!idx)]
+        
+        map.prob.p2 <- sapply(results, "[[", 4)
+        idx <- sapply(map.prob.p2, is.null)
+        map.prob.p2 <- map.prob.p2[which(!idx)]
+        
+        maps <- list(map.err.p1 = map.err.p1, 
+                     map.err.p2 = map.err.p2, 
+                     map.prob.p1 = map.prob.p1, 
+                     map.prob.p2 = map.prob.p2)
+      } else {
+        maps <- list(map.err.p1 = map.err.p1, 
+                     map.err.p2 = map.err.p2) 
+      }
+      
+      res.p1 <- lapply(results, "[[", 5)
+      res.p1 <- do.call(rbind, res.p1)
+      res.p1[["map"]] <- "error.p1"
+      
+      res.p2 <- lapply(results, "[[", 6)
+      res.p2 <- do.call(rbind, res.p2)
+      res.p2[["map"]] <- "error.p2"
+      
+      if(!is.null(dat[["geno"]])){
+        res.prob.p1 <- lapply(results, "[[", 7)
+        res.prob.p1 <- do.call(rbind, res.prob.p1)
+        res.prob.p1[["map"]] <- "prob.p1"
+        
+        res.prob.p2 <- lapply(results, "[[", 8)
+        res.prob.p2 <- do.call(rbind, res.prob.p2)
+        res.prob.p2[["map"]] <- "prob.p2"
+        summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2)
+      } else summaries <- rbind(res.p1, res.p2)
+      
+      summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}"
+      
+      saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds")
+      saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds")
+      saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
+      saveRDS(mat, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
+      saveRDS(maps, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_maps.rds")
+      
+      system("mkdir results")
+      system("mv *.rds  results")
+      
+      system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results"))
+
+    RSCRIPT
+  >>>
+
+  runtime {
+    docker:"cristaniguti/reads2map:0.1.0"
+    singularity: "docker://cristaniguti/reads2map:0.1.0"
+    cpu: max_cores
+    # Cloud
+    memory:"~{memory_size} MiB"
+    disks:"local-disk " + disk_size + " HDD"
+    # Slurm
+    job_name: "MappolyReport"
+    mem:"~{memory_size}G"
+    time: 24
+  }
+
+  meta {
+        author: "Cristiane Taniguti"
+        email: "chtaniguti@tamu.edu"
+        description: "Build linkage map using MAPpoly"
+  }
+
+  output {
+    File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz"
+  }
 }
\ No newline at end of file

From 5b360b4ad64f38a2a91e0a23cb46141961e466d0 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Sat, 16 Dec 2023 13:14:33 -0600
Subject: [PATCH 04/10] sample size variable

---
 pipelines/EmpiricalMaps/EmpiricalMaps.wdl |  14 +-
 subworkflows/mappoly_maps_empirical.wdl   |   6 +-
 tasks/mappoly.wdl                         | 550 +++++++++++-----------
 3 files changed, 291 insertions(+), 279 deletions(-)

diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
index 087602c..384c1ef 100644
--- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
+++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl
@@ -36,6 +36,8 @@ workflow Maps {
         Array[String] global_errors = ["0.05"]
         Boolean genoprob_error = true
         Array[String] genoprob_global_errors = ["0.05"]
+        Int? repetitions 
+        Int? sample_size 
     }
 
     if (defined(filters)) {
@@ -190,7 +192,9 @@ workflow Maps {
                         ploidy = ploidy,
                         prob_thres = prob_thres,
                         filt_segr = filt_segr,
-                        global_errors = global_errors
+                        global_errors = global_errors,
+                        repetitions = repetitions,
+                        sample_size = sample_size
                 }
             }
 
@@ -208,7 +212,9 @@ workflow Maps {
                         ploidy = ploidy,
                         prob_thres = prob_thres,
                         filt_segr = filt_segr,
-                        global_errors = global_errors
+                        global_errors = global_errors,
+                        repetitions = repetitions,
+                        sample_size = sample_size
                 }
             }
             
@@ -226,7 +232,9 @@ workflow Maps {
                         ploidy = ploidy,
                         prob_thres = prob_thres,
                         filt_segr = filt_segr,
-                        global_errors = global_errors
+                        global_errors = global_errors,
+                        repetitions = repetitions,
+                        sample_size = sample_size
                 }
             }
 
diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl
index a01f26f..44f1f31 100644
--- a/subworkflows/mappoly_maps_empirical.wdl
+++ b/subworkflows/mappoly_maps_empirical.wdl
@@ -14,6 +14,8 @@ workflow MappolyMapsEmp {
     String parent1
     String parent2
     Float? prob_thres
+    Int? repetitions 
+    Int? sample_size 
     Int max_cores
     Int ploidy
     String? filt_segr
@@ -45,7 +47,9 @@ workflow MappolyMapsEmp {
       ploidy = ploidy,
       prob_thres = prob_thres,
       filt_segr = filt_segr,
-      global_errors = global_errors
+      global_errors = global_errors,
+      repetitions = repetitions,
+      sample_size = sample_size
   }
 
    output {
diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index 881c749..658966e 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -1,276 +1,276 @@
-version 1.0
-
-task MappolyReport {
-  input {
-    File vcf_file
-    String SNPCall_program
-    String GenotypeCall_program
-    String CountsFrom
-    String parent1
-    String parent2
-    Float prob_thres = 0.8
-    Int? repetitions = 100
-    Int? sample_size = 30
-    Int max_cores
-    Int ploidy
-    String filt_segr = "TRUE"
-    Array[String] global_errors = ["0.05"]
-  }
-
-  Int disk_size = ceil(size(vcf_file, "GiB") * 2)
-  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
-
-  command <<<
-    R --vanilla --no-save <<RSCRIPT
-      library(mappoly)
-      library(parallel)
-      
-      run_tests <- function(seed, s_all, sample_size, dat){
-        set.seed(seed[1])
-        library(mappoly)
-        res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL
-        map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed))
-        for(i in 1:length(seed)){
-          tryCatch({
-            mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size)
-            s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id))
-            
-            ## Parent 1
-            s1<-make_seq_mappoly(s, info.parent = "p1")
-            tpt <- est_pairwise_rf(s1)
-            map <- est_rf_hmm_sequential(input.seq = s1,
-                                         start.set = 5,
-                                         thres.twopt = 10,
-                                         thres.hmm = 10,
-                                         extend.tail = 30,
-                                         info.tail = TRUE,
-                                         twopt = tpt,
-                                         phase.number.limit = 10,
-                                         reestimate.single.ph.configuration = TRUE,
-                                         tol = 10e-2,
-                                         tol.final = 10e-4, 
-                                         verbose = FALSE)
-            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
-            
-            # Global error
-            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
-            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
-            map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
-            x<-summary_maps(list(map.err.p1[[i]]))
-            res.p1 <-rbind(res.p1,x[1,])
-            
-            # Prob error
-            if(!is.null(dat[["geno"]])){
-              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
-              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
-              map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
-              x<-summary_maps(list(map.prob.p1[[i]]))
-              res.prob.p1 <-rbind(res.prob.p1,x[1,])
-            }
-            
-            # Parent 2
-            s2<-make_seq_mappoly(s, info.parent = "p2")
-            tpt <- est_pairwise_rf(s2)
-            map <- est_rf_hmm_sequential(input.seq = s2,
-                                         start.set = 5,
-                                         thres.twopt = 10,
-                                         thres.hmm = 10,
-                                         extend.tail = 30,
-                                         info.tail = TRUE,
-                                         twopt = tpt,
-                                         phase.number.limit = 10,
-                                         reestimate.single.ph.configuration = TRUE,
-                                         tol = 10e-2,
-                                         tol.final = 10e-4, 
-                                         verbose = FALSE)
-            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
-            
-            # Global error
-            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
-            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-            map.err.p2[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
-            x<-summary_maps(list(map.err.p2[[i]]))
-            res.p2<-rbind(res.p2,x[1,])
-            
-            # Prob error
-            if(!is.null(dat[["geno"]])){
-              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
-              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-              map.prob.p2[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
-              x<-summary_maps(list(map.prob.p2[[i]]))
-              res.prob.p2 <-rbind(res.prob.p2,x[1,])
-            }
-          }, error=function(e){})
-        }
-        return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, 
-                    res.p1, res.p2, res.prob.p1, res.prob.p2))
-      }
-      
-      if("~{GenotypeCall_program}" == "supermassa") 
-        prob.thres = ~{prob_thres} - ~{prob_thres}*0.3 else prob.thres = ~{prob_thres}
-      
-      dat <- read_vcf(file = "~{vcf_file}", 
-                      parent.1 = "~{parent1}", 
-                      parent.2 = "~{parent2}", 
-                      verbose = FALSE, 
-                      read.geno.prob = TRUE, 
-                      prob.thres = prob.thres, 
-                      ploidy = ~{ploidy})
-      
-      info <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
-                         step = "raw", 
-                         n.markers = dat[["n.mrk"]], 
-                         mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
-                         n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
-                         n.ind = dat[["n.ind"]])
-      
-      dat <- filter_missing(input.data = dat, type = "marker", 
-                            filter.thres = 0.25, inter = FALSE)
-      
-      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
-                              step = "miss filtered", 
-                              n.markers = dat[["n.mrk"]], 
-                              mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
-                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
-                              n.ind = dat[["n.ind"]])
-      
-      info <- rbind(info, info_temp)
-      
-      
-      if("TRUE"){
-        pval.bonf <- 0.05/dat[["n.mrk"]]
-        mrks.chi.filt <- filter_segregation(dat, 
-                                            chisq.pval.thres =  pval.bonf, 
-                                            inter = FALSE)
-        
-        seq.init <- make_seq_mappoly(mrks.chi.filt)
-      } else {
-        seq.init <- make_seq_mappoly(dat, "all")
-      }
-      
-      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
-                              step = "segr filtered", 
-                              n.markers = length(seq.init[["seq.mrk.names"]]), 
-                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% seq.init[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(seq.init[["seq.mrk.names"]])))*100,2), 
-                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
-                              n.ind = dat[["n.ind"]])
-      
-      info <- rbind(info, info_temp)
-      
-      sp1<-make_seq_mappoly(seq.init, info.parent = "p1")
-      
-      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
-                              step = "p1", 
-                              n.markers = length(sp1[["seq.mrk.names"]]), 
-                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp1[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp1[["seq.mrk.names"]])))*100,2), 
-                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
-                              n.ind = dat[["n.ind"]])
-      info <- rbind(info, info_temp)
-      
-      sp2<-make_seq_mappoly(seq.init, info.parent = "p2")
-      
-      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
-                              step = "p2", 
-                              n.markers = length(sp2[["seq.mrk.names"]]), 
-                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp2[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp2[["seq.mrk.names"]])))*100,2), 
-                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
-                              n.ind = dat[["n.ind"]])
-      info <- rbind(info, info_temp)
-      
-      # Estimate two-point recombination fraction
-      tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores})
-      mat <- rf_list_to_matrix(input.twopt = tpt)
-      
-      sample_size <- if(length(seq.init[["seq.mrk.names"]]) <= ~{sample_size}) length(seq.init[["seq.mrk.names"]]) else ~{sample_size}
-      seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores}))
-      
-      clust <- makeCluster(~{max_cores})
-      clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests"))
-      results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, 
-                                                               sample_size = sample_size, dat = dat))
-      stopCluster(clust)
-      
-      map.err.p1 <- sapply(results, "[[", 1)
-      idx <- sapply(map.err.p1, is.null)
-      map.err.p1 <- map.err.p1[which(!idx)]
-      
-      map.err.p2 <- sapply(results, "[[", 2)
-      idx <- sapply(map.err.p2, is.null)
-      map.err.p2 <- map.err.p2[which(!idx)]
-      
-      if(!is.null(dat[["geno"]])){
-        map.prob.p1 <- sapply(results, "[[", 3)
-        idx <- sapply(map.prob.p1, is.null)
-        map.prob.p1 <- map.prob.p1[which(!idx)]
-        
-        map.prob.p2 <- sapply(results, "[[", 4)
-        idx <- sapply(map.prob.p2, is.null)
-        map.prob.p2 <- map.prob.p2[which(!idx)]
-        
-        maps <- list(map.err.p1 = map.err.p1, 
-                     map.err.p2 = map.err.p2, 
-                     map.prob.p1 = map.prob.p1, 
-                     map.prob.p2 = map.prob.p2)
-      } else {
-        maps <- list(map.err.p1 = map.err.p1, 
-                     map.err.p2 = map.err.p2) 
-      }
-      
-      res.p1 <- lapply(results, "[[", 5)
-      res.p1 <- do.call(rbind, res.p1)
-      res.p1[["map"]] <- "error.p1"
-      
-      res.p2 <- lapply(results, "[[", 6)
-      res.p2 <- do.call(rbind, res.p2)
-      res.p2[["map"]] <- "error.p2"
-      
-      if(!is.null(dat[["geno"]])){
-        res.prob.p1 <- lapply(results, "[[", 7)
-        res.prob.p1 <- do.call(rbind, res.prob.p1)
-        res.prob.p1[["map"]] <- "prob.p1"
-        
-        res.prob.p2 <- lapply(results, "[[", 8)
-        res.prob.p2 <- do.call(rbind, res.prob.p2)
-        res.prob.p2[["map"]] <- "prob.p2"
-        summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2)
-      } else summaries <- rbind(res.p1, res.p2)
-      
-      summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}"
-      
-      saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds")
-      saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds")
-      saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
-      saveRDS(mat, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
-      saveRDS(maps, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_maps.rds")
-      
-      system("mkdir results")
-      system("mv *.rds  results")
-      
-      system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results"))
-
-    RSCRIPT
-  >>>
-
-  runtime {
-    docker:"cristaniguti/reads2map:0.1.0"
-    singularity: "docker://cristaniguti/reads2map:0.1.0"
-    cpu: max_cores
-    # Cloud
-    memory:"~{memory_size} MiB"
-    disks:"local-disk " + disk_size + " HDD"
-    # Slurm
-    job_name: "MappolyReport"
-    mem:"~{memory_size}G"
-    time: 24
-  }
-
-  meta {
-        author: "Cristiane Taniguti"
-        email: "chtaniguti@tamu.edu"
-        description: "Build linkage map using MAPpoly"
-  }
-
-  output {
-    File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz"
-  }
+version 1.0
+
+task MappolyReport {
+  input {
+    File vcf_file
+    String SNPCall_program
+    String GenotypeCall_program
+    String CountsFrom
+    String parent1
+    String parent2
+    Float prob_thres = 0.8
+    Int repetitions = 100
+    Int sample_size = 30
+    Int max_cores
+    Int ploidy
+    String filt_segr = "TRUE"
+    Array[String] global_errors = ["0.05"]
+  }
+
+  Int disk_size = ceil(size(vcf_file, "GiB") * 2)
+  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
+
+  command <<<
+    R --vanilla --no-save <<RSCRIPT
+      library(mappoly)
+      library(parallel)
+      
+      run_tests <- function(seed, s_all, sample_size, dat){
+        set.seed(seed[1])
+        library(mappoly)
+        res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL
+        map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed))
+        for(i in 1:length(seed)){
+          tryCatch({
+            mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size)
+            s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id))
+            
+            ## Parent 1
+            s1<-make_seq_mappoly(s, info.parent = "p1")
+            tpt <- est_pairwise_rf(s1)
+            map <- est_rf_hmm_sequential(input.seq = s1,
+                                         start.set = 5,
+                                         thres.twopt = 10,
+                                         thres.hmm = 10,
+                                         extend.tail = 30,
+                                         info.tail = TRUE,
+                                         twopt = tpt,
+                                         phase.number.limit = 10,
+                                         reestimate.single.ph.configuration = TRUE,
+                                         tol = 10e-2,
+                                         tol.final = 10e-4, 
+                                         verbose = FALSE)
+            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
+            
+            # Global error
+            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+            map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+            x<-summary_maps(list(map.err.p1[[i]]))
+            res.p1 <-rbind(res.p1,x[1,])
+            
+            # Prob error
+            if(!is.null(dat[["geno"]])){
+              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
+              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+              map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p1[[i]]))
+              res.prob.p1 <-rbind(res.prob.p1,x[1,])
+            }
+            
+            # Parent 2
+            s2<-make_seq_mappoly(s, info.parent = "p2")
+            tpt <- est_pairwise_rf(s2)
+            map <- est_rf_hmm_sequential(input.seq = s2,
+                                         start.set = 5,
+                                         thres.twopt = 10,
+                                         thres.hmm = 10,
+                                         extend.tail = 30,
+                                         info.tail = TRUE,
+                                         twopt = tpt,
+                                         phase.number.limit = 10,
+                                         reestimate.single.ph.configuration = TRUE,
+                                         tol = 10e-2,
+                                         tol.final = 10e-4, 
+                                         verbose = FALSE)
+            map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
+            
+            # Global error
+            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+            map.err.p2[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+            x<-summary_maps(list(map.err.p2[[i]]))
+            res.p2<-rbind(res.p2,x[1,])
+            
+            # Prob error
+            if(!is.null(dat[["geno"]])){
+              map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
+              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+              map.prob.p2[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p2[[i]]))
+              res.prob.p2 <-rbind(res.prob.p2,x[1,])
+            }
+          }, error=function(e){})
+        }
+        return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, 
+                    res.p1, res.p2, res.prob.p1, res.prob.p2))
+      }
+      
+      if("~{GenotypeCall_program}" == "supermassa") 
+        prob.thres = ~{prob_thres} - ~{prob_thres}*0.3 else prob.thres = ~{prob_thres}
+      
+      dat <- read_vcf(file = "~{vcf_file}", 
+                      parent.1 = "~{parent1}", 
+                      parent.2 = "~{parent2}", 
+                      verbose = FALSE, 
+                      read.geno.prob = TRUE, 
+                      prob.thres = prob.thres, 
+                      ploidy = ~{ploidy})
+      
+      info <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                         step = "raw", 
+                         n.markers = dat[["n.mrk"]], 
+                         mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
+                         n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                         n.ind = dat[["n.ind"]])
+      
+      dat <- filter_missing(input.data = dat, type = "marker", 
+                            filter.thres = 0.25, inter = FALSE)
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "miss filtered", 
+                              n.markers = dat[["n.mrk"]], 
+                              mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2),
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      
+      info <- rbind(info, info_temp)
+      
+      
+      if("TRUE"){
+        pval.bonf <- 0.05/dat[["n.mrk"]]
+        mrks.chi.filt <- filter_segregation(dat, 
+                                            chisq.pval.thres =  pval.bonf, 
+                                            inter = FALSE)
+        
+        seq.init <- make_seq_mappoly(mrks.chi.filt)
+      } else {
+        seq.init <- make_seq_mappoly(dat, "all")
+      }
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "segr filtered", 
+                              n.markers = length(seq.init[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% seq.init[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(seq.init[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      
+      info <- rbind(info, info_temp)
+      
+      sp1<-make_seq_mappoly(seq.init, info.parent = "p1")
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "p1", 
+                              n.markers = length(sp1[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp1[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp1[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      info <- rbind(info, info_temp)
+      
+      sp2<-make_seq_mappoly(seq.init, info.parent = "p2")
+      
+      info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), 
+                              step = "p2", 
+                              n.markers = length(sp2[["seq.mrk.names"]]), 
+                              mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp2[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp2[["seq.mrk.names"]])))*100,2), 
+                              n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), 
+                              n.ind = dat[["n.ind"]])
+      info <- rbind(info, info_temp)
+      
+      # Estimate two-point recombination fraction
+      tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores})
+      mat <- rf_list_to_matrix(input.twopt = tpt)
+      
+      sample_size <- if(length(seq.init[["seq.mrk.names"]]) <= ~{sample_size}) length(seq.init[["seq.mrk.names"]]) else ~{sample_size}
+      seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores}))
+      
+      clust <- makeCluster(~{max_cores})
+      clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests"))
+      results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, 
+                                                               sample_size = sample_size, dat = dat))
+      stopCluster(clust)
+      
+      map.err.p1 <- sapply(results, "[[", 1)
+      idx <- sapply(map.err.p1, is.null)
+      map.err.p1 <- map.err.p1[which(!idx)]
+      
+      map.err.p2 <- sapply(results, "[[", 2)
+      idx <- sapply(map.err.p2, is.null)
+      map.err.p2 <- map.err.p2[which(!idx)]
+      
+      if(!is.null(dat[["geno"]])){
+        map.prob.p1 <- sapply(results, "[[", 3)
+        idx <- sapply(map.prob.p1, is.null)
+        map.prob.p1 <- map.prob.p1[which(!idx)]
+        
+        map.prob.p2 <- sapply(results, "[[", 4)
+        idx <- sapply(map.prob.p2, is.null)
+        map.prob.p2 <- map.prob.p2[which(!idx)]
+        
+        maps <- list(map.err.p1 = map.err.p1, 
+                     map.err.p2 = map.err.p2, 
+                     map.prob.p1 = map.prob.p1, 
+                     map.prob.p2 = map.prob.p2)
+      } else {
+        maps <- list(map.err.p1 = map.err.p1, 
+                     map.err.p2 = map.err.p2) 
+      }
+      
+      res.p1 <- lapply(results, "[[", 5)
+      res.p1 <- do.call(rbind, res.p1)
+      res.p1[["map"]] <- "error.p1"
+      
+      res.p2 <- lapply(results, "[[", 6)
+      res.p2 <- do.call(rbind, res.p2)
+      res.p2[["map"]] <- "error.p2"
+      
+      if(!is.null(dat[["geno"]])){
+        res.prob.p1 <- lapply(results, "[[", 7)
+        res.prob.p1 <- do.call(rbind, res.prob.p1)
+        res.prob.p1[["map"]] <- "prob.p1"
+        
+        res.prob.p2 <- lapply(results, "[[", 8)
+        res.prob.p2 <- do.call(rbind, res.prob.p2)
+        res.prob.p2[["map"]] <- "prob.p2"
+        summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2)
+      } else summaries <- rbind(res.p1, res.p2)
+      
+      summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}"
+      
+      saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds")
+      saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds")
+      saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")
+      saveRDS(mat, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds")
+      saveRDS(maps, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_maps.rds")
+      
+      system("mkdir results")
+      system("mv *.rds  results")
+      
+      system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results"))
+
+    RSCRIPT
+  >>>
+
+  runtime {
+    docker:"cristaniguti/reads2map:0.1.0"
+    singularity: "docker://cristaniguti/reads2map:0.1.0"
+    cpu: max_cores
+    # Cloud
+    memory:"~{memory_size} MiB"
+    disks:"local-disk " + disk_size + " HDD"
+    # Slurm
+    job_name: "MappolyReport"
+    mem:"~{memory_size}G"
+    time: 24
+  }
+
+  meta {
+        author: "Cristiane Taniguti"
+        email: "chtaniguti@tamu.edu"
+        description: "Build linkage map using MAPpoly"
+  }
+
+  output {
+    File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz"
+  }
 }
\ No newline at end of file

From fa17ecac1f7583a97ab80975cfd29c31a885b52a Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Tue, 19 Dec 2023 15:07:01 -0600
Subject: [PATCH 05/10] replace for by while

---
 tasks/mappoly.wdl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index 658966e..d631fdf 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -28,9 +28,10 @@ task MappolyReport {
       run_tests <- function(seed, s_all, sample_size, dat){
         set.seed(seed[1])
         library(mappoly)
+        i <- 1
         res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL
         map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed))
-        for(i in 1:length(seed)){
+        while(i < length(seed)){
           tryCatch({
             mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size)
             s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id))
@@ -54,7 +55,7 @@ task MappolyReport {
             
             # Global error
             map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
-            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
             map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
             x<-summary_maps(list(map.err.p1[[i]]))
             res.p1 <-rbind(res.p1,x[1,])
@@ -62,7 +63,7 @@ task MappolyReport {
             # Prob error
             if(!is.null(dat[["geno"]])){
               map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
-              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt)
+              map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
               map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
               x<-summary_maps(list(map.prob.p1[[i]]))
               res.prob.p1 <-rbind(res.prob.p1,x[1,])
@@ -100,6 +101,7 @@ task MappolyReport {
               x<-summary_maps(list(map.prob.p2[[i]]))
               res.prob.p2 <-rbind(res.prob.p2,x[1,])
             }
+            i <- i + 1
           }, error=function(e){})
         }
         return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, 

From c122cfc459c5d29858dcf48fab82a1e057c70ab0 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Tue, 26 Dec 2023 19:48:38 -0600
Subject: [PATCH 06/10] adapt ram

---
 tasks/freebayes.wdl |  2 +-
 tasks/gatk.wdl      |  2 +-
 tasks/mappoly.wdl   | 49 ++++++++++++++++++++++++++-------------------
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/tasks/freebayes.wdl b/tasks/freebayes.wdl
index 4e92e74..500c4c6 100644
--- a/tasks/freebayes.wdl
+++ b/tasks/freebayes.wdl
@@ -12,7 +12,7 @@ task RunFreebayes {
   }
 
   Int disk_size = ceil(size(reference, "GiB") + size(bam, "GiB") +  50)
-  Int memory_size = ceil(size(bam, "MiB") * 4 * max_cores + 10000)
+  Int memory_size = 300000
 
   command <<<
 
diff --git a/tasks/gatk.wdl b/tasks/gatk.wdl
index 4c8fe8c..5c879aa 100644
--- a/tasks/gatk.wdl
+++ b/tasks/gatk.wdl
@@ -299,7 +299,7 @@ task VariantFiltration {
             -filter "QUAL < 30.0" --filter-name "QUAL30" \
             -filter "SOR > 3.0" --filter-name "SOR3" \
             -filter "FS > 60.0" --filter-name "FS60" \
-            -filter "MQ < 40.0" --filter-name "MQ40" \
+            -filter "MQ < 40.0" --filter-name "MQ50" \
             -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
             -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
             -O gatk_filters.vcf.gz
diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index d631fdf..0327010 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -28,10 +28,10 @@ task MappolyReport {
       run_tests <- function(seed, s_all, sample_size, dat){
         set.seed(seed[1])
         library(mappoly)
-        i <- 1
+        map.err.p1.idx <- map.err.p2.idx <- map.prob.p1.idx <- map.prob.p2.idx <- 1
         res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL
         map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed))
-        while(i < length(seed)){
+        while(map.err.p1.idx <= length(seed) | map.err.p2.idx <= length(seed) | map.prob.p1.idx <= length(seed) | map.prob.p2.idx <= length(seed)){  
           tryCatch({
             mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size)
             s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id))
@@ -54,20 +54,24 @@ task MappolyReport {
             map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
             
             # Global error
-            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
-            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-            map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
-            x<-summary_maps(list(map.err.p1[[i]]))
-            res.p1 <-rbind(res.p1,x[1,])
+            if(map.err.p1.idx <= length(seed)){
+               map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+               map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+               map.err.p1[[map.err.p1.idx]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+               x<-summary_maps(list(map.err.p1[[map.err.p1.idx]]))
+               res.p1 <-rbind(res.p1,x[1,])
+               map.err.p1.idx <- map.err.p1.idx + 1
+            }
             
             # Prob error
-            if(!is.null(dat[["geno"]])){
+            if(!is.null(dat[["geno"]]) & map.prob.p1.idx <= length(seed)){
               map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
               map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-              map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
-              x<-summary_maps(list(map.prob.p1[[i]]))
+              map.prob.p1[[map.prob.p1.idx]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p1[[map.prob.p1.idx]]))
               res.prob.p1 <-rbind(res.prob.p1,x[1,])
-            }
+              map.prob.p1.idx <- map.prob.p1.idx + 1          
+            } else if (is.null(dat[["geno"]])) map.prob.p1.idx <- map.prob.p1.idx + 1
             
             # Parent 2
             s2<-make_seq_mappoly(s, info.parent = "p2")
@@ -87,21 +91,24 @@ task MappolyReport {
             map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001)
             
             # Global error
-            map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
-            map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-            map.err.p2[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
-            x<-summary_maps(list(map.err.p2[[i]]))
-            res.p2<-rbind(res.p2,x[1,])
+            if(map.err.p2.idx <= length(seed)){
+               map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3)
+               map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
+               map.err.p2[[map.err.p2.idx]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4)
+               x<-summary_maps(list(map.err.p2[[map.err.p2.idx]]))
+               res.p2<-rbind(res.p2,x[1,])
+               map.err.p2.idx <- map.err.p2.idx + 1            
+            }
             
             # Prob error
-            if(!is.null(dat[["geno"]])){
+            if(!is.null(dat[["geno"]]) & map.prob.p2.idx <= length(seed)){
               map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3)
               map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt)
-              map.prob.p2[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
-              x<-summary_maps(list(map.prob.p2[[i]]))
+              map.prob.p2[[map.prob.p2.idx]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4)
+              x<-summary_maps(list(map.prob.p2[[map.prob.p2.idx]]))
               res.prob.p2 <-rbind(res.prob.p2,x[1,])
-            }
-            i <- i + 1
+              map.prob.p2.idx <- map.prob.p2.idx + 1          
+            } else if (is.null(dat[["geno"]])) map.prob.p2.idx <- map.prob.p2.idx + 1
           }, error=function(e){})
         }
         return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, 

From a6d3e7ce464634ecda8d2eac345221945ec87f18 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Mon, 1 Jan 2024 16:20:48 -0600
Subject: [PATCH 07/10] adjust resources

---
 tasks/freebayes.wdl |  2 +-
 tasks/mappoly.wdl   |  2 +-
 tasks/stacks.wdl    |  2 +-
 tasks/tassel.wdl    | 23 +++--------------------
 4 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/tasks/freebayes.wdl b/tasks/freebayes.wdl
index 500c4c6..98213aa 100644
--- a/tasks/freebayes.wdl
+++ b/tasks/freebayes.wdl
@@ -12,7 +12,7 @@ task RunFreebayes {
   }
 
   Int disk_size = ceil(size(reference, "GiB") + size(bam, "GiB") +  50)
-  Int memory_size = 300000
+  Int memory_size = 350000
 
   command <<<
 
diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index 0327010..e9eaac9 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -18,7 +18,7 @@ task MappolyReport {
   }
 
   Int disk_size = ceil(size(vcf_file, "GiB") * 2)
-  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores)
+  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 12000*max_cores)
 
   command <<<
     R --vanilla --no-save <<RSCRIPT
diff --git a/tasks/stacks.wdl b/tasks/stacks.wdl
index ff2b54b..cb681ea 100644
--- a/tasks/stacks.wdl
+++ b/tasks/stacks.wdl
@@ -92,7 +92,7 @@ task RefMap {
   }
 
     Int disk_size = ceil(size(bams, "GiB") * 2)
-    Int memory_size = 3000 + ceil(size(bams, "MiB") * 2)
+    Int memory_size = 2000*max_cores + ceil(size(bams, "MiB") * 2)
 
   command <<<
 
diff --git a/tasks/tassel.wdl b/tasks/tassel.wdl
index 6329335..389bc21 100644
--- a/tasks/tassel.wdl
+++ b/tasks/tassel.wdl
@@ -51,7 +51,7 @@ task BarcodeFaker {
       # Marlee's functions
       #this function recursively makes a vector of unique fake barcodes to ensure no non-unique barcodes are generated
       #this step does not protect against enzyme cut site being found in the barcode
-      barcode_inventor <- function(fake_barcodes, r, fastqs){
+      barcode_inventor <- function(r, fastqs){
         fake_barcodes <- c()
         
         # make some barcodes of the length needed to ensure unique barcodes for all files are possible
@@ -78,25 +78,8 @@ task BarcodeFaker {
           
           #get the fastq file paths
           fastqs <- list.files(path = fastq_dir, full.names = TRUE)
-          fastqs <- fastqs[which(grepl(".fastq", fastqs) | grepl(".fq", fastqs))]
           fastq.check <- list.files(path = fastq_dir, full.names = FALSE)
-          fastq.check <- fastq.check[which(grepl(".fastq", fastq.check) | grepl(".fq", fastq.check))]
-          if(length(fastqs) == 0) stop("Files not fount.")
-          
-          #check that all files in dir are fastqs         #bug fixed 28 July 2020- reported by Dr. Smit Dhakal
-          long <- length(grep(".fastq", fastq.check))
-          short <- length(grep(".fq", fastq.check))
-          if(long != 0){
-            if(length(fastq.check) != long){
-              stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.")
-            }
-          }
-          if(short != 0){
-            if(length(fastq.check) != short){
-              stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.")
-            }
-          }
-          
+          if(length(fastqs) == 0) stop("Files not fount.")          
           
           # new method: given a number of input files, determine the min barcode length needed to ensure enough unique barcodes are generated
           # the old way failed if a lot of files had short barcodes of the same length- reported 18 Aug 2022 by Meghan Brady
@@ -111,7 +94,7 @@ task BarcodeFaker {
           r <- ceiling(Re(r)[Re(r) > 0]) # subset the real and positive roots, and round up
           if(r < 4){r <- 4} # keep a minimum barcode length of 4, probably unnecessary
 
-          fake_barcodes <- barcode_inventor(fake_barcodes, r, fastqs)
+          fake_barcodes <- barcode_inventor(r, fastqs)
           
           #make the fake perfect quality scores matching the barcode length
           fake_quals <- rep(paste(rep("E", times = r), collapse = ""), times = length(fastqs))

From cf7a86d370feefc64e91fb75327cdb386785b680 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Thu, 4 Jan 2024 21:55:27 -0600
Subject: [PATCH 08/10] add exception

---
 tasks/mappoly.wdl | 103 +++++++++++++++++++++++++++-------------------
 1 file changed, 61 insertions(+), 42 deletions(-)

diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl
index e9eaac9..03433fd 100644
--- a/tasks/mappoly.wdl
+++ b/tasks/mappoly.wdl
@@ -18,7 +18,7 @@ task MappolyReport {
   }
 
   Int disk_size = ceil(size(vcf_file, "GiB") * 2)
-  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 12000*max_cores)
+  Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 18000*max_cores)
 
   command <<<
     R --vanilla --no-save <<RSCRIPT
@@ -117,8 +117,19 @@ task MappolyReport {
       
       if("~{GenotypeCall_program}" == "supermassa") 
         prob.thres = ~{prob_thres} - ~{prob_thres}*0.3 else prob.thres = ~{prob_thres}
+
+      library(vcfR)
+      bugfix <- read.vcfR("~{vcf_file}")
+      idx <- which(bugfix@fix[,1] == "NA")
+
+      if(length(idx) > 0){
+       bugfix@fix <- bugfix@fix[-idx,]
+       bugfix@gt <- bugfix@gt[-idx,]
+      }
       
-      dat <- read_vcf(file = "~{vcf_file}", 
+      write.vcf(bugfix, file = "bugfix.vcf")
+      
+      dat <- read_vcf(file = "bugfix.vcf", 
                       parent.1 = "~{parent1}", 
                       parent.2 = "~{parent2}", 
                       verbose = FALSE, 
@@ -190,62 +201,70 @@ task MappolyReport {
       tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores})
       mat <- rf_list_to_matrix(input.twopt = tpt)
       
-      sample_size <- if(length(seq.init[["seq.mrk.names"]]) <= ~{sample_size}) length(seq.init[["seq.mrk.names"]]) else ~{sample_size}
-      seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores}))
+      if(length(seq.init[["seq.mrk.names"]]) > ~{sample_size}) {
+       sample_size <- ~{sample_size}
+       seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores}))
       
-      clust <- makeCluster(~{max_cores})
-      clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests"))
-      results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, 
+       clust <- makeCluster(~{max_cores})
+       clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests"))
+       results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, 
                                                                sample_size = sample_size, dat = dat))
-      stopCluster(clust)
+       stopCluster(clust)
       
-      map.err.p1 <- sapply(results, "[[", 1)
-      idx <- sapply(map.err.p1, is.null)
-      map.err.p1 <- map.err.p1[which(!idx)]
+       map.err.p1 <- sapply(results, "[[", 1)
+       idx <- sapply(map.err.p1, is.null)
+       map.err.p1 <- map.err.p1[which(!idx)]
       
-      map.err.p2 <- sapply(results, "[[", 2)
-      idx <- sapply(map.err.p2, is.null)
-      map.err.p2 <- map.err.p2[which(!idx)]
+       map.err.p2 <- sapply(results, "[[", 2)
+       idx <- sapply(map.err.p2, is.null)
+       map.err.p2 <- map.err.p2[which(!idx)]
       
-      if(!is.null(dat[["geno"]])){
-        map.prob.p1 <- sapply(results, "[[", 3)
-        idx <- sapply(map.prob.p1, is.null)
-        map.prob.p1 <- map.prob.p1[which(!idx)]
+       if(!is.null(dat[["geno"]])){
+         map.prob.p1 <- sapply(results, "[[", 3)
+         idx <- sapply(map.prob.p1, is.null)
+         map.prob.p1 <- map.prob.p1[which(!idx)]
         
-        map.prob.p2 <- sapply(results, "[[", 4)
-        idx <- sapply(map.prob.p2, is.null)
-        map.prob.p2 <- map.prob.p2[which(!idx)]
+         map.prob.p2 <- sapply(results, "[[", 4)
+         idx <- sapply(map.prob.p2, is.null)
+         map.prob.p2 <- map.prob.p2[which(!idx)]
         
-        maps <- list(map.err.p1 = map.err.p1, 
+         maps <- list(map.err.p1 = map.err.p1, 
                      map.err.p2 = map.err.p2, 
                      map.prob.p1 = map.prob.p1, 
                      map.prob.p2 = map.prob.p2)
-      } else {
-        maps <- list(map.err.p1 = map.err.p1, 
+       } else {
+         maps <- list(map.err.p1 = map.err.p1, 
                      map.err.p2 = map.err.p2) 
-      }
+       }
       
-      res.p1 <- lapply(results, "[[", 5)
-      res.p1 <- do.call(rbind, res.p1)
-      res.p1[["map"]] <- "error.p1"
+       res.p1 <- lapply(results, "[[", 5)
+       res.p1 <- do.call(rbind, res.p1)
+       res.p1[["map"]] <- "error.p1"
       
-      res.p2 <- lapply(results, "[[", 6)
-      res.p2 <- do.call(rbind, res.p2)
-      res.p2[["map"]] <- "error.p2"
+       res.p2 <- lapply(results, "[[", 6)
+       res.p2 <- do.call(rbind, res.p2)
+       res.p2[["map"]] <- "error.p2"
       
-      if(!is.null(dat[["geno"]])){
-        res.prob.p1 <- lapply(results, "[[", 7)
-        res.prob.p1 <- do.call(rbind, res.prob.p1)
-        res.prob.p1[["map"]] <- "prob.p1"
+       if(!is.null(dat[["geno"]])){
+         res.prob.p1 <- lapply(results, "[[", 7)
+         res.prob.p1 <- do.call(rbind, res.prob.p1)
+         res.prob.p1[["map"]] <- "prob.p1"
         
-        res.prob.p2 <- lapply(results, "[[", 8)
-        res.prob.p2 <- do.call(rbind, res.prob.p2)
-        res.prob.p2[["map"]] <- "prob.p2"
-        summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2)
-      } else summaries <- rbind(res.p1, res.p2)
-      
-      summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}"
+         res.prob.p2 <- lapply(results, "[[", 8)
+         res.prob.p2 <- do.call(rbind, res.prob.p2)
+         res.prob.p2[["map"]] <- "prob.p2"
+         summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2)
+       } else summaries <- rbind(res.p1, res.p2)
       
+       summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}"
+      } else {
+       summaries <- 1
+       info <- 1
+       dat <- 1
+       mat <- 1
+       maps <- 1
+      }
+
       saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds")
       saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds")
       saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds")

From 4cbd7b7123f9e7b35c23a4d572ece82baf0bafdf Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Thu, 4 Jan 2024 22:08:30 -0600
Subject: [PATCH 09/10] up changelog

---
 pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md        | 8 +++++++-
 .../EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md  | 8 +++++++-
 tasks/gatk.wdl                                            | 2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
index 1447e70..ffcc10c 100644
--- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
+++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md
@@ -1,3 +1,9 @@
+# 1.3.1
+
+* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2
+* Polyploid map quality diagnostic by resampling
+* Adapt regenotyping memory usage
+
 # 1.3.0
 
 * Add MAPpoly new functions framework_map and update_framework_map
@@ -7,7 +13,7 @@
 
 # 1.2.5
 
-* more flexibility to choose the probability to be used in the HMM: 
+* more flexibility to choose the probability to be used in the HMM:
 
 * new parameters:
 - global_errors: array with global errors to be tested
diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md
index cca2626..28c0d13 100644
--- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md
+++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md
@@ -1,3 +1,9 @@
+# 1.4.4
+
+* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq
+* Adapt BarcodeFaker required memory and time
+* Adapt STACKs required memory
+
 # 1.4.3
 
 * Adapt tassel and stacks tasks also for polyploids
@@ -32,7 +38,7 @@
 # 1.2.0
 
 * Run freebayes parallelizing in nodes according to chromosomes and cores splitting in genomic regions
-* Adjust runtimes 
+* Adjust runtimes
 * Add polyploid dataset for tests
 
 # 1.1.0
diff --git a/tasks/gatk.wdl b/tasks/gatk.wdl
index 5c879aa..4c8fe8c 100644
--- a/tasks/gatk.wdl
+++ b/tasks/gatk.wdl
@@ -299,7 +299,7 @@ task VariantFiltration {
             -filter "QUAL < 30.0" --filter-name "QUAL30" \
             -filter "SOR > 3.0" --filter-name "SOR3" \
             -filter "FS > 60.0" --filter-name "FS60" \
-            -filter "MQ < 40.0" --filter-name "MQ50" \
+            -filter "MQ < 40.0" --filter-name "MQ40" \
             -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
             -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
             -O gatk_filters.vcf.gz

From a307b6ebe03cced582354d8153f6a391900bc919 Mon Sep 17 00:00:00 2001
From: cristianetaniguti <chaytaniguti@gmail.com>
Date: Sun, 7 Jan 2024 16:33:27 -0600
Subject: [PATCH 10/10] up changelog

---
 .../EmpiricalReads2Map/EmpiricalReads2Map.changelog.md   | 9 +++++++++
 .../EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json    | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md
index b450238..6328875 100644
--- a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md
+++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md
@@ -1,3 +1,12 @@
+# 1.5.1
+
+* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2
+* Polyploid map quality diagnostic by resampling
+* Adapt regenotyping memory usage
+* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq
+* Adapt BarcodeFaker required memory and time
+* Adapt STACKs required memory
+
 # 1.5.0
 
 * Adapt tassel and stacks tasks also for polyploids
diff --git a/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json b/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json
index 4c82856..bb185ba 100644
--- a/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json
+++ b/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json
@@ -9,14 +9,14 @@
     "multiallelics": "false"
   },
   "Maps.max_cores": "2",
-  "Maps.run_supermassa":true,
+  "Maps.run_supermassa":false,
   "Maps.run_updog":true,
   "Maps.run_polyrad":true,
   "Maps.gatk_mchap": "FALSE",
   "Maps.vcfs_counts_source": ["vcf"],
   "Maps.vcfs_software": ["gatk"],
   "Maps.filter_noninfo": "true",
-  "Maps.vcfs": ["tests/data/polyploid/vcfs_norm/gatk_Chr04_filt_example.vcf.gz"],
+    "Maps.vcfs": ["/home/rose_lab/Reads2Map/tests/data/polyploid/vcfs_norm/gatk_Chr04_filt_example.vcf.gz"],
     "Maps.replaceADbyMissing": "TRUE",
     "Maps.global_errors":["0.05"]
 }