diff --git a/.dockerfiles/reads2map/Dockerfile b/.dockerfiles/reads2map/Dockerfile index 63ea5ca..1063957 100644 --- a/.dockerfiles/reads2map/Dockerfile +++ b/.dockerfiles/reads2map/Dockerfile @@ -60,4 +60,4 @@ RUN Rscript -e 'remotes::install_github("Cristianetaniguti/onemap")' # Still privates RUN Rscript -e 'remotes::install_github("Cristianetaniguti/simuscopR")' RUN Rscript -e 'remotes::install_github("Cristianetaniguti/Reads2MapTools")' -RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly")' +RUN Rscript -e 'remotes::install_github("Cristianetaniguti/MAPpoly" )' diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md index 1447e70..ffcc10c 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md @@ -1,3 +1,9 @@ +# 1.3.1 + +* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2 +* Polyploid map quality diagnostic by resampling +* Adapt regenotyping memory usage + # 1.3.0 * Add MAPpoly new functions framework_map and update_framework_map @@ -7,7 +13,7 @@ # 1.2.5 -* more flexibility to choose the probability to be used in the HMM: +* more flexibility to choose the probability to be used in the HMM: * new parameters: - global_errors: array with global errors to be tested diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl index 590b780..384c1ef 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.wdl +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.wdl @@ -36,6 +36,8 @@ workflow Maps { Array[String] global_errors = ["0.05"] Boolean genoprob_error = true Array[String] genoprob_global_errors = ["0.05"] + Int? repetitions + Int? sample_size } if (defined(filters)) { @@ -190,7 +192,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } @@ -208,7 +212,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } @@ -226,7 +232,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } @@ -243,7 +251,9 @@ workflow Maps { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } } } diff --git a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md index b450238..6328875 100644 --- a/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md +++ b/pipelines/EmpiricalReads2Map/EmpiricalReads2Map.changelog.md @@ -1,3 +1,12 @@ +# 1.5.1 + +* Make EmpiricalMaps for polyploids compatible with Reads2MapApp v0.0.2 +* Polyploid map quality diagnostic by resampling +* Adapt regenotyping memory usage +* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq +* Adapt BarcodeFaker required memory and time +* Adapt STACKs required memory + # 1.5.0 * Adapt tassel and stacks tasks also for polyploids diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md index cca2626..28c0d13 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md @@ -1,3 +1,9 @@ +# 1.4.4 + +* Make BarcodeFaker task work for input files that do not finish with .fasta or .fq +* Adapt BarcodeFaker required memory and time +* Adapt STACKs required memory + # 1.4.3 * Adapt tassel and stacks tasks also for polyploids @@ -32,7 +38,7 @@ # 1.2.0 * Run freebayes parallelizing in nodes according to chromosomes and cores splitting in genomic regions -* Adjust runtimes +* Adjust runtimes * Add polyploid dataset for tests # 1.1.0 diff --git a/subworkflows/mappoly_maps_empirical.wdl b/subworkflows/mappoly_maps_empirical.wdl index 6c531f6..44f1f31 100644 --- a/subworkflows/mappoly_maps_empirical.wdl +++ b/subworkflows/mappoly_maps_empirical.wdl @@ -1,10 +1,10 @@ version 1.0 + import "../tasks/utilsR.wdl" as utilsR import "../tasks/mappoly.wdl" as mappolyTasks workflow MappolyMapsEmp { - input { File vcf_file String SNPCall_program @@ -14,6 +14,8 @@ workflow MappolyMapsEmp { String parent1 String parent2 Float? prob_thres + Int? repetitions + Int? sample_size Int max_cores Int ploidy String? filt_segr @@ -45,11 +47,13 @@ workflow MappolyMapsEmp { ploidy = ploidy, prob_thres = prob_thres, filt_segr = filt_segr, - global_errors = global_errors + global_errors = global_errors, + repetitions = repetitions, + sample_size = sample_size } output { File tar_gz_report = MappolyReport.results File regeno_vcf = ReGenotyping.regeno_vcf } -} +} \ No newline at end of file diff --git a/tasks/freebayes.wdl b/tasks/freebayes.wdl index 4e92e74..98213aa 100644 --- a/tasks/freebayes.wdl +++ b/tasks/freebayes.wdl @@ -12,7 +12,7 @@ task RunFreebayes { } Int disk_size = ceil(size(reference, "GiB") + size(bam, "GiB") + 50) - Int memory_size = ceil(size(bam, "MiB") * 4 * max_cores + 10000) + Int memory_size = 350000 command <<< diff --git a/tasks/mappoly.wdl b/tasks/mappoly.wdl index 9fbbd83..03433fd 100644 --- a/tasks/mappoly.wdl +++ b/tasks/mappoly.wdl @@ -8,7 +8,9 @@ task MappolyReport { String CountsFrom String parent1 String parent2 - Float prob_thres = 0.8 + Float prob_thres = 0.8 + Int repetitions = 100 + Int sample_size = 30 Int max_cores Int ploidy String filt_segr = "TRUE" @@ -16,100 +18,265 @@ task MappolyReport { } Int disk_size = ceil(size(vcf_file, "GiB") * 2) - Int memory_size = ceil(size(vcf_file, "MiB") + 2000) + Int memory_size = ceil(size(vcf_file, "MiB") * max_cores + 18000*max_cores) command <<< R --vanilla --no-save < 0){ + bugfix@fix <- bugfix@fix[-idx,] + bugfix@gt <- bugfix@gt[-idx,] + } + + write.vcf(bugfix, file = "bugfix.vcf") + + dat <- read_vcf(file = "bugfix.vcf", + parent.1 = "~{parent1}", + parent.2 = "~{parent2}", + verbose = FALSE, + read.geno.prob = TRUE, + prob.thres = prob.thres, + ploidy = ~{ploidy}) + + info <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), + step = "raw", + n.markers = dat[["n.mrk"]], + mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2), + n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), + n.ind = dat[["n.ind"]]) + + dat <- filter_missing(input.data = dat, type = "marker", filter.thres = 0.25, inter = FALSE) - - dat <- filter_missing(dat, type = 'individual', filter.thres = 0.1, inter = FALSE) - - if("~{filt_segr}"){ - pval.bonf <- 0.05/dat[[3]] - mrks.chi.filt <- filter_segregation(dat, - chisq.pval.thres = pval.bonf, - inter = FALSE) - - seq.init <- make_seq_mappoly(mrks.chi.filt) - } else { - seq.init <- make_seq_mappoly(dat, "all") - } + + info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), + step = "miss filtered", + n.markers = dat[["n.mrk"]], + mis.perc = round((sum(dat[["geno.dose"]] == dat[["ploidy"]]+1)/(nrow(dat[["geno.dose"]])*ncol(dat[["geno.dose"]])))*100,2), + n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), + n.ind = dat[["n.ind"]]) + + info <- rbind(info, info_temp) + + + if("TRUE"){ + pval.bonf <- 0.05/dat[["n.mrk"]] + mrks.chi.filt <- filter_segregation(dat, + chisq.pval.thres = pval.bonf, + inter = FALSE) - # Estimate two-point recombination fraction - tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores}) - mat <- rf_list_to_matrix(input.twopt = tpt) - - # Filter markers by recombination fraction values - seq.filt <- rf_snp_filter(input.twopt = tpt, diagnostic.plot = FALSE, probs = c(0.05, 0.95)) - mat2 <- make_mat_mappoly(mat, seq.filt) - - # Run MDS for ordering according to recombination fractions - seq_test_mds <- mds_mappoly(mat2) - seq_mds <- make_seq_mappoly(seq_test_mds) - - # Sequence with genomic order - geno_order <- get_genomic_order(seq_mds) - seq_geno_order <- make_seq_mappoly(geno_order) - - init.map.list <- framework_map(input.seq = seq_geno_order, - twopt = tpt, - start.set = 5, - inflation.lim.p1 = 10, - inflation.lim.p2 = 10, - verbose = FALSE) - - res <- update_framework_map(input.map.list = init.map.list, - input.seq = seq_geno_order, - twopt = tpt, - thres.twopt = 5, - init.LOD = 100, - max.rounds = 3, - size.rem.cluster = 3, - gap.threshold = 20, - verbose = FALSE) - - # Get last interaction - iter <- length(res[[2]][[1]]) - - global_errors <- unlist(strsplit("~{sep="," global_errors}", ",")) - map_error <- list() - for(i in 1:length(global_errors)){ - map_error[[i]] <- est_full_hmm_with_global_error(res[[2]][[1]][[iter]], error = as.numeric(global_errors[i]), verbose = FALSE) - saveRDS(map_error, file= paste0("~{SNPCall_program}_~{GenotypeCall_program}",global_errors[i], "_~{CountsFrom}_map.rds")) - } - map_prob <- est_full_hmm_with_prior_prob(res[[2]][[1]][[iter]], dat.prob = dat, verbose = FALSE) - - # Diagnostic graphics - overall from plot(dat) - # Heatmap of mds with plot(mat2, ord=seq_mds) - # Heatmap of genomic order order plot(mat2, ord = map_error$info$mk.names) - # Relation between mds and genome plot(seq_mds$genome.pos) - - saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds") - saveRDS(mat2, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds") - saveRDS(seq_mds, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_seq_mds.rds") - saveRDS(map_prob, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_map.rds") - - system("mkdir results") - system("mv *.rds results") + seq.init <- make_seq_mappoly(mrks.chi.filt) + } else { + seq.init <- make_seq_mappoly(dat, "all") + } + + info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), + step = "segr filtered", + n.markers = length(seq.init[["seq.mrk.names"]]), + mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% seq.init[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(seq.init[["seq.mrk.names"]])))*100,2), + n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), + n.ind = dat[["n.ind"]]) + + info <- rbind(info, info_temp) + + sp1<-make_seq_mappoly(seq.init, info.parent = "p1") + + info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), + step = "p1", + n.markers = length(sp1[["seq.mrk.names"]]), + mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp1[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp1[["seq.mrk.names"]])))*100,2), + n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), + n.ind = dat[["n.ind"]]) + info <- rbind(info, info_temp) + + sp2<-make_seq_mappoly(seq.init, info.parent = "p2") + + info_temp <- data.frame(dat = paste0("~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}"), + step = "p2", + n.markers = length(sp2[["seq.mrk.names"]]), + mis.perc = round((sum(dat[["geno.dose"]][which(rownames(dat[["geno.dose"]]) %in% sp2[["seq.mrk.names"]]),] == dat[["ploidy"]]+1)/(ncol(dat[["geno.dose"]])*length(sp2[["seq.mrk.names"]])))*100,2), + n.redundant = round(100*(nrow(dat[["elim.correspondence"]])/(length(dat[["kept"]])+nrow(dat[["elim.correspondence"]]))),2), + n.ind = dat[["n.ind"]]) + info <- rbind(info, info_temp) + + # Estimate two-point recombination fraction + tpt <- est_pairwise_rf(input.seq = seq.init, ncpus = ~{max_cores}) + mat <- rf_list_to_matrix(input.twopt = tpt) + + if(length(seq.init[["seq.mrk.names"]]) > ~{sample_size}) { + sample_size <- ~{sample_size} + seeds <- split(1:~{repetitions}, rep(1:~{max_cores}, each=~{repetitions}/~{max_cores})) + + clust <- makeCluster(~{max_cores}) + clusterExport(clust, c("seq.init", "sample_size", "dat", "run_tests")) + results <- parLapply(clust, seeds, function(x) run_tests(seed = x, s_all = seq.init, + sample_size = sample_size, dat = dat)) + stopCluster(clust) + + map.err.p1 <- sapply(results, "[[", 1) + idx <- sapply(map.err.p1, is.null) + map.err.p1 <- map.err.p1[which(!idx)] + + map.err.p2 <- sapply(results, "[[", 2) + idx <- sapply(map.err.p2, is.null) + map.err.p2 <- map.err.p2[which(!idx)] + + if(!is.null(dat[["geno"]])){ + map.prob.p1 <- sapply(results, "[[", 3) + idx <- sapply(map.prob.p1, is.null) + map.prob.p1 <- map.prob.p1[which(!idx)] + + map.prob.p2 <- sapply(results, "[[", 4) + idx <- sapply(map.prob.p2, is.null) + map.prob.p2 <- map.prob.p2[which(!idx)] + + maps <- list(map.err.p1 = map.err.p1, + map.err.p2 = map.err.p2, + map.prob.p1 = map.prob.p1, + map.prob.p2 = map.prob.p2) + } else { + maps <- list(map.err.p1 = map.err.p1, + map.err.p2 = map.err.p2) + } + + res.p1 <- lapply(results, "[[", 5) + res.p1 <- do.call(rbind, res.p1) + res.p1[["map"]] <- "error.p1" + + res.p2 <- lapply(results, "[[", 6) + res.p2 <- do.call(rbind, res.p2) + res.p2[["map"]] <- "error.p2" + + if(!is.null(dat[["geno"]])){ + res.prob.p1 <- lapply(results, "[[", 7) + res.prob.p1 <- do.call(rbind, res.prob.p1) + res.prob.p1[["map"]] <- "prob.p1" + + res.prob.p2 <- lapply(results, "[[", 8) + res.prob.p2 <- do.call(rbind, res.prob.p2) + res.prob.p2[["map"]] <- "prob.p2" + summaries <- rbind(res.p1, res.p2, res.prob.p1, res.prob.p2) + } else summaries <- rbind(res.p1, res.p2) + + summaries[["data"]] <- "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}" + } else { + summaries <- 1 + info <- 1 + dat <- 1 + mat <- 1 + maps <- 1 + } - system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results")) + saveRDS(summaries, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_summaries.rds") + saveRDS(info, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_info.rds") + saveRDS(dat, file= "~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_dat.rds") + saveRDS(mat, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_mat2.rds") + saveRDS(maps, file="~{SNPCall_program}_~{GenotypeCall_program}_~{CountsFrom}_maps.rds") + + system("mkdir results") + system("mv *.rds results") + + system(paste0("tar -czvf ", "~{SNPCall_program}", "_", "~{GenotypeCall_program}", "_", "~{CountsFrom}","_poly_results.tar.gz results")) RSCRIPT - >>> runtime { diff --git a/tasks/stacks.wdl b/tasks/stacks.wdl index 9244c80..cb681ea 100644 --- a/tasks/stacks.wdl +++ b/tasks/stacks.wdl @@ -92,7 +92,7 @@ task RefMap { } Int disk_size = ceil(size(bams, "GiB") * 2) - Int memory_size = 3000 + ceil(size(bams, "MiB") * 2) + Int memory_size = 2000*max_cores + ceil(size(bams, "MiB") * 2) command <<< @@ -103,6 +103,10 @@ task RefMap { populations -P stacks/ -M ~{pop_map} --vcf -t ~{max_cores} + # Fix sample names + sed -i -e 's/.sorted.merged.filtered//g' stacks/populations.snps.vcf + sed -i -e 's/.sorted.merged.filtered//g' stacks/populations.haps.vcf + >>> runtime { diff --git a/tasks/tassel.wdl b/tasks/tassel.wdl index baf4c0c..389bc21 100644 --- a/tasks/tassel.wdl +++ b/tasks/tassel.wdl @@ -51,7 +51,7 @@ task BarcodeFaker { # Marlee's functions #this function recursively makes a vector of unique fake barcodes to ensure no non-unique barcodes are generated #this step does not protect against enzyme cut site being found in the barcode - barcode_inventor <- function(fake_barcodes, r, fastqs){ + barcode_inventor <- function(r, fastqs){ fake_barcodes <- c() # make some barcodes of the length needed to ensure unique barcodes for all files are possible @@ -78,25 +78,8 @@ task BarcodeFaker { #get the fastq file paths fastqs <- list.files(path = fastq_dir, full.names = TRUE) - fastqs <- fastqs[which(grepl(".fastq", fastqs) | grepl(".fq", fastqs))] fastq.check <- list.files(path = fastq_dir, full.names = FALSE) - fastq.check <- fastq.check[which(grepl(".fastq", fastq.check) | grepl(".fq", fastq.check))] - if(length(fastqs) == 0) stop("Files not fount.") - - #check that all files in dir are fastqs #bug fixed 28 July 2020- reported by Dr. Smit Dhakal - long <- length(grep(".fastq", fastq.check)) - short <- length(grep(".fq", fastq.check)) - if(long != 0){ - if(length(fastq.check) != long){ - stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.") - } - } - if(short != 0){ - if(length(fastq.check) != short){ - stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.") - } - } - + if(length(fastqs) == 0) stop("Files not fount.") # new method: given a number of input files, determine the min barcode length needed to ensure enough unique barcodes are generated # the old way failed if a lot of files had short barcodes of the same length- reported 18 Aug 2022 by Meghan Brady @@ -111,7 +94,7 @@ task BarcodeFaker { r <- ceiling(Re(r)[Re(r) > 0]) # subset the real and positive roots, and round up if(r < 4){r <- 4} # keep a minimum barcode length of 4, probably unnecessary - fake_barcodes <- barcode_inventor(fake_barcodes, r, fastqs) + fake_barcodes <- barcode_inventor(r, fastqs) #make the fake perfect quality scores matching the barcode length fake_quals <- rep(paste(rep("E", times = r), collapse = ""), times = length(fastqs)) @@ -169,7 +152,8 @@ task BarcodeFaker { is_gz <- basename(file_names[1]) if(grepl(".gz", is_gz)) { for(i in 1:length(file_names)){ - system(paste("gunzip", file_names[i])) + cat(paste("Uncompressing",file_names[i])) + system(paste("gunzip -f", file_names[i])) } } dir_name <- dirname(file_names[1]) @@ -191,8 +175,8 @@ task BarcodeFaker { disks:"local-disk 3 GiB HDD" # Slurm job_name: "BarcodeFaker" - mem:"2G" - time: 24 + mem:"5G" + time: 50 } meta { diff --git a/tasks/utilsR.wdl b/tasks/utilsR.wdl index 3105971..94ad505 100644 --- a/tasks/utilsR.wdl +++ b/tasks/utilsR.wdl @@ -511,7 +511,7 @@ task ReGenotyping { } Int disk_size = ceil((size(vcf_file, "GiB") * 4)) - Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 4000) + Int memory_size = ceil(size(vcf_file, "MiB") * 5 + 5000*max_cores) command <<< R --vanilla --no-save <