From 56135a9b085ea566e48d945a9f413bcfea95efed Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Sun, 10 Dec 2023 14:19:03 -0600 Subject: [PATCH 01/10] adjust ram --- tasks/mappoly.wdl | 20 +++++++------------- tasks/stacks.wdl | 4 ++++ tasks/utilsR.wdl | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 9fbbd83..8ea71ed 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -16,7 +16,7 @@ task MappolyReport { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") + 2000) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) command <<< R --vanilla --no-save <>> runtime { diff --git a/tasks/utilsR.wdl b/tasks/utilsR.wdl index 3105971..94ad505 100644 --- a/tasks/utilsR.wdl +++ b/tasks/utilsR.wdl @@ -511,7 +511,7 @@ task ReGenotyping { } Int disk_size = ceil((size(vcf_file, "GiB") * 4)) - Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 4000) + Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 5000*max_cores) command <<< R --vanilla --no-save < Date: Thu, 14 Dec 2023 09:15:08 -0600 Subject: [PATCH 02/10] adjust ram and rm mds --- tasks/mappoly.wdl | 19 +++++++++---------- tasks/tassel.wdl | 7 ++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 8ea71ed..ebaf581 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -36,10 +36,10 @@ task MappolyReport { dat <- filter_missing(input.data = dat, type = "marker", filter.thres = 0.25, inter = FALSE) - dat <- filter_missing(dat, type = 'individual', filter.thres = 0.1, inter = FALSE) + dat <- filter_missing(dat, type = 'individual', filter.thres = 0.25, inter = FALSE) if("~{filt_segr}"){ - pval.bonf <- 0.05/dat[[3]] + pval.bonf <- 0.05/dat[["n.mrk"]] mrks.chi.filt <- filter_segregation(dat, chisq.pval.thres = pval.bonf, inter = FALSE) @@ -64,8 +64,8 @@ task MappolyReport { init.map.list <- framework_map(input.seq = seq_geno_order, twopt = tpt, start.set = 5, - inflation.lim.p1 = 10, - inflation.lim.p2 = 10, + inflation.lim.p1 = 100, + inflation.lim.p2 = 100, verbose = FALSE) res <- update_framework_map(input.map.list = init.map.list, @@ -75,7 +75,7 @@ task MappolyReport { init.LOD = 100, max.rounds = 3, size.rem.cluster = 3, - gap.threshold = 20, + gap.threshold = 30, verbose = FALSE) # Get last interaction @@ -88,14 +88,13 @@ task MappolyReport { saveRDS(map_error, file= paste0("~{SNPCall_program}_~{GenotypeCall_program}",global_errors[i], "_~{CountsFrom}_map.rds")) } - if(!is.null(dat[["geno"]])){ - map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE) - saveRDS(map_prob, file= "freebayes_SNPCaller_vcf_map.rds") + if(!is.null(dat[["geno"]])){ + map_prob <- est_full_hmm_with_prior_prob(input.map = res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE) + saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds") } - + saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds") saveRDS(mat2, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds") - saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds") system("mkdir results") system("mv *.rds results") diff --git a/tasks/tassel.wdl b/tasks/tassel.wdl index baf4c0c..6329335 100644 --- a/tasks/tassel.wdl +++ b/tasks/tassel.wdl @@ -169,7 +169,8 @@ task BarcodeFaker { is_gz <- basename(file_names[1]) if(grepl(".gz", is_gz)) { for(i in 1:length(file_names)){ - system(paste("gunzip", file_names[i])) + cat(paste("Uncompressing",file_names[i])) + system(paste("gunzip -f", file_names[i])) } } dir_name <- dirname(file_names[1]) @@ -191,8 +192,8 @@ task BarcodeFaker { disks:"local-disk 3 GiB HDD" # Slurm job_name: "BarcodeFaker" - mem:"2G" - time: 24 + mem:"5G" + time: 50 } meta { From 0fd18ea1092917c79e3e8177ee01640d3f97a4a2 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Sat, 16 Dec 2023 12:51:25 -0600 Subject: [PATCH 03/10] resample mappoly --- .dockerfiles/reads2map/Dockerfile | 2 +- pipelines/EmpiricalMaps/EmpiricalMaps.wdl | 6 +- subworkflows/mappoly_maps_empirical.wdl | 4 +- tasks/mappoly.wdl | 404 +++++++++++++++------- 4 files changed, 282 insertions(+), 134 deletions(-) diff --git a/.dockerfiles/reads2map/Dockerfile b/.dockerfiles/reads2map/Dockerfile index 63ea5ca..1063957 100644 --- a/.dockerfiles/reads2map/Dockerfile +++ b/.dockerfiles/reads2map/Dockerfile @@ -60,4 +60,4 @@ RUN Rscript -e 'remotes::install_github("Cristianetaniguti/onemap")' # Still privates RUN Rscript -e 'remotes::install_github("Cristianetaniguti/simuscopR")' RUN Rscript -e 'remotes::install_github("Cristianetaniguti/Reads2MapTools")' -RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly")' +RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly" )' diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl index 590b780..087602c 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl @@ -226,7 +226,7 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors } } @@ -243,7 +243,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } } diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl index 6c531f6..a01f26f 100644 --- a/subworkflows/mappoly_maps_empirical.wdl +++ b/subworkflows/mappoly_maps_empirical.wdl @@ -1,10 +1,10 @@ version 1.0 + import "../tasks/utilsR.wdl" as utilsR import "../tasks/mappoly.wdl" as mappolyTasks workflow MappolyMapsEmp { - input { File vcf_file String SNPCall_program @@ -52,4 +52,4 @@ workflow MappolyMapsEmp { File tar_gz_report = MappolyReport.results File regeno_vcf = ReGenotyping.regeno_vcf } -} +} \ No newline at end of file diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index ebaf581..881c749 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -1,130 +1,276 @@ -version 1.0 - -task MappolyReport { - input { - File vcf_file - String SNPCall_program - String GenotypeCall_program - String CountsFrom - String parent1 - String parent2 - Float prob_thres = 0.8 - Int max_cores - Int ploidy - String filt_segr = "TRUE" - Array[String] global_errors = ["0.05"] - } - - Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) - - command <<< - R --vanilla --no-save <>> - - runtime { - docker:"cristaniguti/reads2map:0.1.0" - singularity: "docker://cristaniguti/reads2map:0.1.0" - cpu: max_cores - # Cloud - memory:"~{memory_size} MiB" - disks:"local-disk " + disk_size + " HDD" - # Slurm - job_name: "MappolyReport" - mem:"~{memory_size}G" - time: 24 - } - - meta { - author: "Cristiane Taniguti" - email: "chtaniguti@tamu.edu" - description: "Build linkage map using MAPpoly" - } - - output { - File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz" - } +version 1.0 + +task MappolyReport { + input { + File vcf_file + String SNPCall_program + String GenotypeCall_program + String CountsFrom + String parent1 + String parent2 + Float prob_thres = 0.8 + Int? repetitions = 100 + Int? sample_size = 30 + Int max_cores + Int ploidy + String filt_segr = "TRUE" + Array[String] global_errors = ["0.05"] + } + + Int disk_size = ceil(size(vcf_file, "GiB") * 2) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) + + command <<< + R --vanilla --no-save <>> + + runtime { + docker:"cristaniguti/reads2map:0.1.0" + singularity: "docker://cristaniguti/reads2map:0.1.0" + cpu: max_cores + # Cloud + memory:"~{memory_size} MiB" + disks:"local-disk " + disk_size + " HDD" + # Slurm + job_name: "MappolyReport" + mem:"~{memory_size}G" + time: 24 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Build linkage map using MAPpoly" + } + + output { + File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz" + } } \ No newline at end of file From 5b360b4ad64f38a2a91e0a23cb46141961e466d0 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Sat, 16 Dec 2023 13:14:33 -0600 Subject: [PATCH 04/10] sample size variable --- pipelines/EmpiricalMaps/EmpiricalMaps.wdl | 14 +- subworkflows/mappoly_maps_empirical.wdl | 6 +- tasks/mappoly.wdl | 550 +++++++++++----------- 3 files changed, 291 insertions(+), 279 deletions(-) diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl index 087602c..384c1ef 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl @@ -36,6 +36,8 @@ workflow Maps { Array[String] global_errors = ["0.05"] Boolean genoprob_error = true Array[String] genoprob_global_errors = ["0.05"] + Int? repetitions + Int? sample_size } if (defined(filters)) { @@ -190,7 +192,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } @@ -208,7 +212,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } @@ -226,7 +232,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl index a01f26f..44f1f31 100644 --- a/subworkflows/mappoly_maps_empirical.wdl +++ b/subworkflows/mappoly_maps_empirical.wdl @@ -14,6 +14,8 @@ workflow MappolyMapsEmp { String parent1 String parent2 Float? prob_thres + Int? repetitions + Int? sample_size Int max_cores Int ploidy String? filt_segr @@ -45,7 +47,9 @@ workflow MappolyMapsEmp { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } output { diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 881c749..658966e 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -1,276 +1,276 @@ -version 1.0 - -task MappolyReport { - input { - File vcf_file - String SNPCall_program - String GenotypeCall_program - String CountsFrom - String parent1 - String parent2 - Float prob_thres = 0.8 - Int? repetitions = 100 - Int? sample_size = 30 - Int max_cores - Int ploidy - String filt_segr = "TRUE" - Array[String] global_errors = ["0.05"] - } - - Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) - - command <<< - R --vanilla --no-save <>> - - runtime { - docker:"cristaniguti/reads2map:0.1.0" - singularity: "docker://cristaniguti/reads2map:0.1.0" - cpu: max_cores - # Cloud - memory:"~{memory_size} MiB" - disks:"local-disk " + disk_size + " HDD" - # Slurm - job_name: "MappolyReport" - mem:"~{memory_size}G" - time: 24 - } - - meta { - author: "Cristiane Taniguti" - email: "chtaniguti@tamu.edu" - description: "Build linkage map using MAPpoly" - } - - output { - File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz" - } +version 1.0 + +task MappolyReport { + input { + File vcf_file + String SNPCall_program + String GenotypeCall_program + String CountsFrom + String parent1 + String parent2 + Float prob_thres = 0.8 + Int repetitions = 100 + Int sample_size = 30 + Int max_cores + Int ploidy + String filt_segr = "TRUE" + Array[String] global_errors = ["0.05"] + } + + Int disk_size = ceil(size(vcf_file, "GiB") * 2) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) + + command <<< + R --vanilla --no-save <>> + + runtime { + docker:"cristaniguti/reads2map:0.1.0" + singularity: "docker://cristaniguti/reads2map:0.1.0" + cpu: max_cores + # Cloud + memory:"~{memory_size} MiB" + disks:"local-disk " + disk_size + " HDD" + # Slurm + job_name: "MappolyReport" + mem:"~{memory_size}G" + time: 24 + } + + meta { + author: "Cristiane Taniguti" + email: "chtaniguti@tamu.edu" + description: "Build linkage map using MAPpoly" + } + + output { + File results = "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_poly_results.tar.gz" + } } \ No newline at end of file From fa17ecac1f7583a97ab80975cfd29c31a885b52a Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 19 Dec 2023 15:07:01 -0600 Subject: [PATCH 05/10] replace for by while --- tasks/mappoly.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 658966e..d631fdf 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -28,9 +28,10 @@ task MappolyReport { run_tests <- function(seed, s_all, sample_size, dat){ set.seed(seed[1]) library(mappoly) + i <- 1 res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed)) - for(i in 1:length(seed)){ + while(i < length(seed)){ tryCatch({ mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size) s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id)) @@ -54,7 +55,7 @@ task MappolyReport { # Global error map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3) - map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt) + map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4) x<-summary_maps(list(map.err.p1[[i]])) res.p1 <-rbind(res.p1,x[1,]) @@ -62,7 +63,7 @@ task MappolyReport { # Prob error if(!is.null(dat[["geno"]])){ map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3) - map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 2, twopt = tpt) + map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4) x<-summary_maps(list(map.prob.p1[[i]])) res.prob.p1 <-rbind(res.prob.p1,x[1,]) @@ -100,6 +101,7 @@ task MappolyReport { x<-summary_maps(list(map.prob.p2[[i]])) res.prob.p2 <-rbind(res.prob.p2,x[1,]) } + i <- i + 1 }, error=function(e){}) } return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, From c122cfc459c5d29858dcf48fab82a1e057c70ab0 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 26 Dec 2023 19:48:38 -0600 Subject: [PATCH 06/10] adapt ram --- tasks/freebayes.wdl | 2 +- tasks/gatk.wdl | 2 +- tasks/mappoly.wdl | 49 ++++++++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/tasks/freebayes.wdl b/tasks/freebayes.wdl index 4e92e74..500c4c6 100644 --- a/tasks/freebayes.wdl +++ b/tasks/freebayes.wdl @@ -12,7 +12,7 @@ task RunFreebayes { } Int disk_size = ceil(size(reference, "GiB") + size(bam, "GiB") + 50) - Int memory_size = ceil(size(bam, "MiB") * 4 * max_cores + 10000) + Int memory_size = 300000 command <<< diff --git a/tasks/gatk.wdl b/tasks/gatk.wdl index 4c8fe8c..5c879aa 100644 --- a/tasks/gatk.wdl +++ b/tasks/gatk.wdl @@ -299,7 +299,7 @@ task VariantFiltration { -filter "QUAL < 30.0" --filter-name "QUAL30" \ -filter "SOR > 3.0" --filter-name "SOR3" \ -filter "FS > 60.0" --filter-name "FS60" \ - -filter "MQ < 40.0" --filter-name "MQ40" \ + -filter "MQ < 40.0" --filter-name "MQ50" \ -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \ -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \ -O gatk_filters.vcf.gz diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index d631fdf..0327010 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -28,10 +28,10 @@ task MappolyReport { run_tests <- function(seed, s_all, sample_size, dat){ set.seed(seed[1]) library(mappoly) - i <- 1 + map.err.p1.idx <- map.err.p2.idx <- map.prob.p1.idx <- map.prob.p2.idx <- 1 res.p1 <- res.p2 <- res.prob.p1 <- res.prob.p2 <- NULL map.err.p1 <- map.err.p2 <- map.prob.p1 <- map.prob.p2 <- vector("list", length(seed)) - while(i < length(seed)){ + while(map.err.p1.idx <= length(seed) | map.err.p2.idx <= length(seed) | map.prob.p1.idx <= length(seed) | map.prob.p2.idx <= length(seed)){ tryCatch({ mrk.id<-sample(s_all[["seq.mrk.names"]], sample_size) s<-get_genomic_order(make_seq_mappoly(s_all,mrk.id)) @@ -54,20 +54,24 @@ task MappolyReport { map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001) # Global error - map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3) - map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) - map.err.p1[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4) - x<-summary_maps(list(map.err.p1[[i]])) - res.p1 <-rbind(res.p1,x[1,]) + if(map.err.p1.idx <= length(seed)){ + map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3) + map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) + map.err.p1[[map.err.p1.idx]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4) + x<-summary_maps(list(map.err.p1[[map.err.p1.idx]])) + res.p1 <-rbind(res.p1,x[1,]) + map.err.p1.idx <- map.err.p1.idx + 1 + } # Prob error - if(!is.null(dat[["geno"]])){ + if(!is.null(dat[["geno"]]) & map.prob.p1.idx <= length(seed)){ map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3) map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) - map.prob.p1[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4) - x<-summary_maps(list(map.prob.p1[[i]])) + map.prob.p1[[map.prob.p1.idx]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4) + x<-summary_maps(list(map.prob.p1[[map.prob.p1.idx]])) res.prob.p1 <-rbind(res.prob.p1,x[1,]) - } + map.prob.p1.idx <- map.prob.p1.idx + 1 + } else if (is.null(dat[["geno"]])) map.prob.p1.idx <- map.prob.p1.idx + 1 # Parent 2 s2<-make_seq_mappoly(s, info.parent = "p2") @@ -87,21 +91,24 @@ task MappolyReport { map <- filter_map_at_hmm_thres(map, thres.hmm = 0.0001) # Global error - map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3) - map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) - map.err.p2[[i]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4) - x<-summary_maps(list(map.err.p2[[i]])) - res.p2<-rbind(res.p2,x[1,]) + if(map.err.p2.idx <= length(seed)){ + map2 <- est_full_hmm_with_global_error(map, error = 0.05, tol = 10e-3) + map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) + map.err.p2[[map.err.p2.idx]] <- est_full_hmm_with_global_error(map3, error = 0.05, tol = 10e-4) + x<-summary_maps(list(map.err.p2[[map.err.p2.idx]])) + res.p2<-rbind(res.p2,x[1,]) + map.err.p2.idx <- map.err.p2.idx + 1 + } # Prob error - if(!is.null(dat[["geno"]])){ + if(!is.null(dat[["geno"]]) & map.prob.p2.idx <= length(seed)){ map2 <- est_full_hmm_with_prior_prob(map, tol = 10e-3) map3 <- split_and_rephase(map2, gap.threshold = 20, size.rem.cluster = 3, twopt = tpt) - map.prob.p2[[i]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4) - x<-summary_maps(list(map.prob.p2[[i]])) + map.prob.p2[[map.prob.p2.idx]] <- est_full_hmm_with_prior_prob(map3, tol = 10e-4) + x<-summary_maps(list(map.prob.p2[[map.prob.p2.idx]])) res.prob.p2 <-rbind(res.prob.p2,x[1,]) - } - i <- i + 1 + map.prob.p2.idx <- map.prob.p2.idx + 1 + } else if (is.null(dat[["geno"]])) map.prob.p2.idx <- map.prob.p2.idx + 1 }, error=function(e){}) } return(list(map.err.p1, map.err.p2, map.prob.p1, map.prob.p2, From a6d3e7ce464634ecda8d2eac345221945ec87f18 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Mon, 1 Jan 2024 16:20:48 -0600 Subject: [PATCH 07/10] adjust resources --- tasks/freebayes.wdl | 2 +- tasks/mappoly.wdl | 2 +- tasks/stacks.wdl | 2 +- tasks/tassel.wdl | 23 +++-------------------- 4 files changed, 6 insertions(+), 23 deletions(-) diff --git a/tasks/freebayes.wdl b/tasks/freebayes.wdl index 500c4c6..98213aa 100644 --- a/tasks/freebayes.wdl +++ b/tasks/freebayes.wdl @@ -12,7 +12,7 @@ task RunFreebayes { } Int disk_size = ceil(size(reference, "GiB") + size(bam, "GiB") + 50) - Int memory_size = 300000 + Int memory_size = 350000 command <<< diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 0327010..e9eaac9 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -18,7 +18,7 @@ task MappolyReport { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 5000*max_cores) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 12000*max_cores) command <<< R --vanilla --no-save < 0]) # subset the real and positive roots, and round up if(r < 4){r <- 4} # keep a minimum barcode length of 4, probably unnecessary - fake_barcodes <- barcode_inventor(fake_barcodes, r, fastqs) + fake_barcodes <- barcode_inventor(r, fastqs) #make the fake perfect quality scores matching the barcode length fake_quals <- rep(paste(rep("E", times = r), collapse = ""), times = length(fastqs)) From cf7a86d370feefc64e91fb75327cdb386785b680 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Thu, 4 Jan 2024 21:55:27 -0600 Subject: [PATCH 08/10] add exception --- tasks/mappoly.wdl | 103 +++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 42 deletions(-) diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index e9eaac9..03433fd 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -18,7 +18,7 @@ task MappolyReport { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 12000*max_cores) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 18000*max_cores) command <<< R --vanilla --no-save < 0){ + bugfix@fix <- bugfix@fix[-idx,] + bugfix@gt <- bugfix@gt[-idx,] + } - dat <- read_vcf(file = "~{vcf_file}", + write.vcf(bugfix, file = "bugfix.vcf") + + dat <- read_vcf(file = "bugfix.vcf", parent.1 = "~{parent1}", parent.2 = "~{parent2}", verbose = FALSE, @@ -190,62 +201,70 @@ task MappolyReport { tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores}) mat <- rf_list_to_matrix(input.twopt = tpt) - sample_size <- if(length(seq.init[["seq.mrk.names"]]) <= ~{sample_size}) length(seq.init[["seq.mrk.names"]]) else ~{sample_size} - seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores})) + if(length(seq.init[["seq.mrk.names"]]) > ~{sample_size}) { + sample_size <- ~{sample_size} + seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores})) - clust <- makeCluster(~{max_cores}) - clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests")) - results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, + clust <- makeCluster(~{max_cores}) + clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests")) + results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, sample_size = sample_size, dat = dat)) - stopCluster(clust) + stopCluster(clust) - map.err.p1 <- sapply(results, "[[", 1) - idx <- sapply(map.err.p1, is.null) - map.err.p1 <- map.err.p1[which(!idx)] + map.err.p1 <- sapply(results, "[[", 1) + idx <- sapply(map.err.p1, is.null) + map.err.p1 <- map.err.p1[which(!idx)] - map.err.p2 <- sapply(results, "[[", 2) - idx <- sapply(map.err.p2, is.null) - map.err.p2 <- map.err.p2[which(!idx)] + map.err.p2 <- sapply(results, "[[", 2) + idx <- sapply(map.err.p2, is.null) + map.err.p2 <- map.err.p2[which(!idx)] - if(!is.null(dat[["geno"]])){ - map.prob.p1 <- sapply(results, "[[", 3) - idx <- sapply(map.prob.p1, is.null) - map.prob.p1 <- map.prob.p1[which(!idx)] + if(!is.null(dat[["geno"]])){ + map.prob.p1 <- sapply(results, "[[", 3) + idx <- sapply(map.prob.p1, is.null) + map.prob.p1 <- map.prob.p1[which(!idx)] - map.prob.p2 <- sapply(results, "[[", 4) - idx <- sapply(map.prob.p2, is.null) - map.prob.p2 <- map.prob.p2[which(!idx)] + map.prob.p2 <- sapply(results, "[[", 4) + idx <- sapply(map.prob.p2, is.null) + map.prob.p2 <- map.prob.p2[which(!idx)] - maps <- list(map.err.p1 = map.err.p1, + maps <- list(map.err.p1 = map.err.p1, map.err.p2 = map.err.p2, map.prob.p1 = map.prob.p1, map.prob.p2 = map.prob.p2) - } else { - maps <- list(map.err.p1 = map.err.p1, + } else { + maps <- list(map.err.p1 = map.err.p1, map.err.p2 = map.err.p2) - } + } - res.p1 <- lapply(results, "[[", 5) - res.p1 <- do.call(rbind, res.p1) - res.p1[["map"]] <- "error.p1" + res.p1 <- lapply(results, "[[", 5) + res.p1 <- do.call(rbind, res.p1) + res.p1[["map"]] <- "error.p1" - res.p2 <- lapply(results, "[[", 6) - res.p2 <- do.call(rbind, res.p2) - res.p2[["map"]] <- "error.p2" + res.p2 <- lapply(results, "[[", 6) + res.p2 <- do.call(rbind, res.p2) + res.p2[["map"]] <- "error.p2" - if(!is.null(dat[["geno"]])){ - res.prob.p1 <- lapply(results, "[[", 7) - res.prob.p1 <- do.call(rbind, res.prob.p1) - res.prob.p1[["map"]] <- "prob.p1" + if(!is.null(dat[["geno"]])){ + res.prob.p1 <- lapply(results, "[[", 7) + res.prob.p1 <- do.call(rbind, res.prob.p1) + res.prob.p1[["map"]] <- "prob.p1" - res.prob.p2 <- lapply(results, "[[", 8) - res.prob.p2 <- do.call(rbind, res.prob.p2) - res.prob.p2[["map"]] <- "prob.p2" - summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2) - } else summaries <- rbind(res.p1, res.p2) - - summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}" + res.prob.p2 <- lapply(results, "[[", 8) + res.prob.p2 <- do.call(rbind, res.prob.p2) + res.prob.p2[["map"]] <- "prob.p2" + summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2) + } else summaries <- rbind(res.p1, res.p2) + summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}" + } else { + summaries <- 1 + info <- 1 + dat <- 1 + mat <- 1 + maps <- 1 + } + saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds") saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds") saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds") From 4cbd7b7123f9e7b35c23a4d572ece82baf0bafdf Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Thu, 4 Jan 2024 22:08:30 -0600 Subject: [PATCH 09/10] up changelog --- pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md | 8 +++++++- .../EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md | 8 +++++++- tasks/gatk.wdl | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md index 1447e70..ffcc10c 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md @@ -1,3 +1,9 @@ +# 1.3.1 + +* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2 +* Polyploid map quality diagnostic by resampling +* Adapt regenotyping memory usage + # 1.3.0 * Add MAPpoly new functions framework_map and update_framework_map @@ -7,7 +13,7 @@ # 1.2.5 -* more flexibility to choose the probability to be used in the HMM: +* more flexibility to choose the probability to be used in the HMM: * new parameters: - global_errors: array with global errors to be tested diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md index cca2626..28c0d13 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md @@ -1,3 +1,9 @@ +# 1.4.4 + +* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq +* Adapt BarcodeFaker required memory and time +* Adapt STACKs required memory + # 1.4.3 * Adapt tassel and stacks tasks also for polyploids @@ -32,7 +38,7 @@ # 1.2.0 * Run freebayes parallelizing in nodes according to chromosomes and cores splitting in genomic regions -* Adjust runtimes +* Adjust runtimes * Add polyploid dataset for tests # 1.1.0 diff --git a/tasks/gatk.wdl b/tasks/gatk.wdl index 5c879aa..4c8fe8c 100644 --- a/tasks/gatk.wdl +++ b/tasks/gatk.wdl @@ -299,7 +299,7 @@ task VariantFiltration { -filter "QUAL < 30.0" --filter-name "QUAL30" \ -filter "SOR > 3.0" --filter-name "SOR3" \ -filter "FS > 60.0" --filter-name "FS60" \ - -filter "MQ < 40.0" --filter-name "MQ50" \ + -filter "MQ < 40.0" --filter-name "MQ40" \ -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \ -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \ -O gatk_filters.vcf.gz From a307b6ebe03cced582354d8153f6a391900bc919 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Sun, 7 Jan 2024 16:33:27 -0600 Subject: [PATCH 10/10] up changelog --- .../EmpiricalReads2Map/EmpiricalReads2Map.changelog.md | 9 +++++++++ .../EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md index b450238..6328875 100644 --- a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md +++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md @@ -1,3 +1,12 @@ +# 1.5.1 + +* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2 +* Polyploid map quality diagnostic by resampling +* Adapt regenotyping memory usage +* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq +* Adapt BarcodeFaker required memory and time +* Adapt STACKs required memory + # 1.5.0 * Adapt tassel and stacks tasks also for polyploids diff --git a/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json b/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json index 4c82856..bb185ba 100644 --- a/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json +++ b/tests/pipelines/EmpiricalMaps/polyploid/EmpiricalMaps_inputs.json @@ -9,14 +9,14 @@ "multiallelics": "false" }, "Maps.max_cores": "2", - "Maps.run_supermassa":true, + "Maps.run_supermassa":false, "Maps.run_updog":true, "Maps.run_polyrad":true, "Maps.gatk_mchap": "FALSE", "Maps.vcfs_counts_source": ["vcf"], "Maps.vcfs_software": ["gatk"], "Maps.filter_noninfo": "true", - "Maps.vcfs": ["tests/data/polyploid/vcfs_norm/gatk_Chr04_filt_example.vcf.gz"], + "Maps.vcfs": ["/home/rose_lab/Reads2Map/tests/data/polyploid/vcfs_norm/gatk_Chr04_filt_example.vcf.gz"], "Maps.replaceADbyMissing": "TRUE", "Maps.global_errors":["0.05"] }