From abcb0ffcf2581d01abf01c69b0100d47c2c26b62 Mon Sep 17 00:00:00 2001 From: Lucas Taniguti Date: Mon, 21 Nov 2022 06:09:50 -0300 Subject: [PATCH 01/10] build: correct env var for prod env --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2367748..b8d5467 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -59,7 +59,7 @@ workflows: - develop - release-to-github: name: release-production-version - env: "dev" + env: "prod" filters: branches: only: From b1a22abb969ff4c917f6b7dec878e800647922bf Mon Sep 17 00:00:00 2001 From: Lucas Taniguti Date: Mon, 21 Nov 2022 06:14:59 -0300 Subject: [PATCH 02/10] refactor: remove not used struct --- structs/read_simulation_structs.wdl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/structs/read_simulation_structs.wdl b/structs/read_simulation_structs.wdl index abd665d..5d1d26d 100644 --- a/structs/read_simulation_structs.wdl +++ b/structs/read_simulation_structs.wdl @@ -29,11 +29,3 @@ struct Sequencing { String rm_dupli Int mapsize } - -struct OptionalFilters { # TODO: Check if it is used. If not: remove it - String? Filter1 - String? Filter2 - String? Filter3 - String? Filter4 - String? Filter5 -} From be782f65c595b8f72524e9bbd6c4546b8645a329 Mon Sep 17 00:00:00 2001 From: Lucas Taniguti Date: Tue, 22 Nov 2022 06:02:34 -0300 Subject: [PATCH 03/10] test: remove unused old tests --- .../freebayes_genotyping/test_data.json | 54 ------ .../freebayes_genotyping/test_freebayes.py | 33 ---- test_legacy/gatk_genotyping/test_data.json | 35 ---- test_legacy/gatk_genotyping/test_gatk.py | 31 ---- test_legacy/genotyping_r/test_data.json | 10 -- test_legacy/genotyping_r/test_genotyping_r.py | 23 --- .../workflows/TestGenotypingR.wdl | 161 ------------------ test_legacy/read_simulation/test_data.json | 33 ---- .../read_simulation/test_read_simulation.py | 49 ------ 9 files changed, 429 deletions(-) delete mode 100755 test_legacy/freebayes_genotyping/test_data.json delete mode 100644 test_legacy/freebayes_genotyping/test_freebayes.py delete mode 100644 test_legacy/gatk_genotyping/test_data.json delete mode 100644 test_legacy/gatk_genotyping/test_gatk.py delete mode 100755 test_legacy/genotyping_r/test_data.json delete mode 100644 test_legacy/genotyping_r/test_genotyping_r.py delete mode 100644 test_legacy/genotyping_r/workflows/TestGenotypingR.wdl delete mode 100755 test_legacy/read_simulation/test_data.json delete mode 100755 test_legacy/read_simulation/test_read_simulation.py diff --git a/test_legacy/freebayes_genotyping/test_data.json b/test_legacy/freebayes_genotyping/test_data.json deleted file mode 100755 index 37565ff..0000000 --- a/test_legacy/freebayes_genotyping/test_data.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "ref_fasta": { - "name": "sub2M.fa" - }, - "ref_dict": { - "name": "sub2M.dict" - }, - "ref_ann": { - "name": "sub2M.fa.ann" - }, - "ref_sa": { - "name": "sub2M.fa.sa" - }, - "ref_amb": { - "name": "sub2M.fa.amb" - }, - "ref_bwt": { - "name": "sub2M.fa.bwt" - }, - "ref_fasta_index": { - "name": "sub2M.fa.fai" - }, - "ref_pac": { - "name": "sub2M.fa.pac" - }, - "P1_bam": { - "name": "P1.sorted.bam" - }, - "P2_bam": { - "name": "P2.sorted.bam" - }, - "F1_01_bam": { - "name": "F1_01.sorted.bam" - }, - "F1_02_bam": { - "name": "F1_02.sorted.bam" - }, - "P1_bai": { - "name": "P1.sorted.bam.bai" - }, - "P2_bai": { - "name": "P2.sorted.bam.bai" - }, - "F1_01_bai": { - "name": "F1_01.sorted.bam.bai" - }, - "F1_02_bai": { - "name": 
"F1_02.sorted.bam.bai" - }, - "vcf_bi_freebayes": { - "name": "freebayes_bi.recode.vcf.gz", - "type": "vcf" - } -} diff --git a/test_legacy/freebayes_genotyping/test_freebayes.py b/test_legacy/freebayes_genotyping/test_freebayes.py deleted file mode 100644 index 1c9cd9c..0000000 --- a/test_legacy/freebayes_genotyping/test_freebayes.py +++ /dev/null @@ -1,33 +0,0 @@ -def test_freebayes(workflow_data, workflow_runner): - inputs = { - "parent1": "P1", - "parent2": "P2", - "sample_names": ["P1", "P2", "F1_01", "F1_02"], - "program": "freebayes", - "references": { - "ref_fasta": workflow_data["ref_fasta"], - "ref_dict": workflow_data["ref_dict"], - "ref_ann": workflow_data["ref_ann"], - "ref_sa": workflow_data["ref_sa"], - "ref_amb": workflow_data["ref_amb"], - "ref_bwt": workflow_data["ref_bwt"], - "ref_fasta_index": workflow_data["ref_fasta_index"], - "ref_pac": workflow_data["ref_pac"], - }, - "max_cores": 4, - "bams": [ - workflow_data["F1_01_bam"], - workflow_data["F1_02_bam"], - workflow_data["P1_bam"], - workflow_data["P2_bam"], - ], - "bais": [ - workflow_data["F1_01_bai"], - workflow_data["F1_02_bai"], - workflow_data["P1_bai"], - workflow_data["P2_bai"], - ], - } - - expected = {"vcf_biallelics": workflow_data["vcf_bi_freebayes"]} - workflow_runner("tasks/freebayes_genotyping.wdl", inputs, expected) diff --git a/test_legacy/gatk_genotyping/test_data.json b/test_legacy/gatk_genotyping/test_data.json deleted file mode 100644 index 03797d7..0000000 --- a/test_legacy/gatk_genotyping/test_data.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "ref_fasta": { - "name": "sub2M.fa" - }, - "ref_dict": { - "name": "sub2M.dict" - }, - "ref_ann": { - "name": "sub2M.fa.ann" - }, - "ref_sa": { - "name": "sub2M.fa.sa" - }, - "ref_amb": { - "name": "sub2M.fa.amb" - }, - "ref_bwt": { - "name": "sub2M.fa.bwt" - }, - "ref_fasta_index": { - "name": "sub2M.fa.fai" - }, - "ref_pac": { - "name": "sub2M.fa.pac" - }, - "P1_bam": {"name": "P1.sorted.bam"}, - "P2_bam": {"name": "P2.sorted.bam"}, - "F1_01_bam": {"name": "F1_01.sorted.bam"}, - "F1_02_bam": {"name": "F1_02.sorted.bam"}, - "P1_bai": {"name": "P1.sorted.bam.bai"}, - "P2_bai": {"name": "P2.sorted.bam.bai"}, - "F1_01_bai": {"name": "F1_01.sorted.bam.bai"}, - "F1_02_bai": {"name": "F1_02.sorted.bam.bai"}, - "smallest_vcf_bi": {"name": "smallest_gatk_bi.recode.vcf.gz", "type": "vcf"} -} diff --git a/test_legacy/gatk_genotyping/test_gatk.py b/test_legacy/gatk_genotyping/test_gatk.py deleted file mode 100644 index ef8f322..0000000 --- a/test_legacy/gatk_genotyping/test_gatk.py +++ /dev/null @@ -1,31 +0,0 @@ -def test_read_simu(workflow_data, workflow_runner): - inputs = { - "parent1": "P1", - "parent2": "P2", - "program": "gatk", - "references": { - "ref_fasta": workflow_data["ref_fasta"], - "ref_dict": workflow_data["ref_dict"], - "ref_ann": workflow_data["ref_ann"], - "ref_sa": workflow_data["ref_sa"], - "ref_amb": workflow_data["ref_amb"], - "ref_bwt": workflow_data["ref_bwt"], - "ref_fasta_index": workflow_data["ref_fasta_index"], - "ref_pac": workflow_data["ref_pac"], - }, - "bams": [ - workflow_data["F1_01_bam"], - workflow_data["F1_02_bam"], - workflow_data["P1_bam"], - workflow_data["P2_bam"], - ], - "bais": [ - workflow_data["F1_01_bai"], - workflow_data["F1_02_bai"], - workflow_data["P1_bai"], - workflow_data["P2_bai"], - ], - } - - expected = {"vcf_biallelics": workflow_data["smallest_vcf_bi"]} - workflow_runner("tasks/gatk_genotyping.wdl", inputs, expected) diff --git a/test_legacy/genotyping_r/test_data.json b/test_legacy/genotyping_r/test_data.json 
deleted file mode 100755 index 1e7b5bd..0000000 --- a/test_legacy/genotyping_r/test_data.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "simulated_phases": {"name": "simulated_phases.txt"}, - "ref_alt_alleles": {"name": "ref_alt_alleles.txt"}, - "true_vcf": {"name": "22_10_simu.vcf", "type": "vcf"}, - "vcf_bi": {"name": "gatk_bi.recode.vcf.gz", "type": "vcf"}, - "vcf_bi_bam_counts": {"name": "gatk_bam_vcf.vcf", "type": "vcf"}, - "vcf_multi": {"name": "gatk_multi.recode.vcf.gz", "type": "vcf"}, - "gusmap_rdata_1": {"name": "map_gatk_vcf_gusmap_1.RData"}, - "gusmap_rdata_2": {"name": "map_gatk_vcf_gusmap_2.RData"} -} diff --git a/test_legacy/genotyping_r/test_genotyping_r.py b/test_legacy/genotyping_r/test_genotyping_r.py deleted file mode 100644 index 51d411a..0000000 --- a/test_legacy/genotyping_r/test_genotyping_r.py +++ /dev/null @@ -1,23 +0,0 @@ -def test_genotyping_with_r(workflow_data, workflow_runner): - inputs = { - "analysis_bam": workflow_data["vcf_bi_bam_counts"], - "analysis_vcf": workflow_data["vcf_bi"], - "analysis_multi_vcf": workflow_data["vcf_multi"], - "true_vcf": workflow_data["true_vcf"], - "ref_alt_alleles": workflow_data["ref_alt_alleles"], - "simulated_phases": workflow_data["simulated_phases"], - "method": "gatk", - "parent1": "P1", - "parent2": "P2", - "cross": "F1", - "seed": 22, - "depth": 10, - "max_cores": 4, - } - - expected = { - "gusmap_out": [workflow_data["gusmap_rdata_1"], workflow_data["gusmap_rdata_2"]] - } - workflow_runner( - "tests/genotyping_r/workflows/TestGenotypingR.wdl", inputs, expected - ) diff --git a/test_legacy/genotyping_r/workflows/TestGenotypingR.wdl b/test_legacy/genotyping_r/workflows/TestGenotypingR.wdl deleted file mode 100644 index 619ee34..0000000 --- a/test_legacy/genotyping_r/workflows/TestGenotypingR.wdl +++ /dev/null @@ -1,161 +0,0 @@ -version 1.0 - -import "utilsR.wdl" as utilsR -import "default_maps.wdl" as default # TODO: Could not find this .wdl in the repository -import "snpcaller_maps.wdl" as snpcaller -import "genotyping_simulated.wdl" as genotyping -import "gusmap_maps.wdl" as gusmap - - -workflow TestGenotypingR { - - input { - File analysis_bam - File analysis_vcf - File analysis_multi_vcf - File true_vcf - File ref_alt_alleles - File simulated_phases - String method - String parent1 - String parent2 - String cross - Int seed - Int depth - Int max_cores - } - - call utilsR.vcf2onemap as truth_vcf { - input: - vcf_file = true_vcf, - cross = cross, - SNPCall_program = "simu", - parent1 = "P1", - parent2 = "P2" - } - - call utilsR.vcf2onemap { - input: - vcf_file = analysis_vcf, - cross = cross, - SNPCall_program = method, - parent1 = parent1, - parent2 = parent2 - } - - call utilsR.MultiVcf2onemap { - input: - multi = analysis_multi_vcf, - cross = cross, - SNPCall_program = method, - parent1 = parent1, - parent2 = parent2, - seed = seed, - depth = depth - } - - call default.DefaultMaps { - input: - onemap_obj = vcf2onemap.onemap_obj, - simu_onemap_obj = truth_vcf.onemap_obj, - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases, - SNPCall_program = method, - CountsFrom = "vcf", - multi_obj = MultiVcf2onemap.onemap_obj, - simu_vcfR = truth_vcf.vcfR_obj, - vcfR_obj = vcf2onemap.vcfR_obj, - seed = seed, - depth = depth - } - - call snpcaller.SNPCallerMaps { - input: - simu_onemap_obj = truth_vcf.onemap_obj, - onemap_obj = vcf2onemap.onemap_obj, - vcf_file = analysis_vcf, - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases, - cross = cross, - SNPCall_program = method, - 
GenotypeCall_program = "SNPCaller", - CountsFrom = "vcf", - multi_obj = MultiVcf2onemap.onemap_obj, - simu_vcfR = truth_vcf.vcfR_obj, - seed = seed, - depth = depth - } - - Map[String, File] vcfs = {"vcf": analysis_vcf, "bam": analysis_bam} - - scatter (origin in ["vcf", "bam"]){ - call genotyping.SnpBasedGenotypingSimulatedMaps as UpdogMaps { - input: - simu_onemap_obj = truth_vcf.onemap_obj, - onemap_obj = vcf2onemap.onemap_obj, - vcf_file = vcfs[origin], - genotyping_program = "updog", - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases, - SNPCall_program = method, - CountsFrom = origin, - cross = cross, - multi_obj = MultiVcf2onemap.onemap_obj, - max_cores = max_cores, - simu_vcfR = truth_vcf.vcfR_obj, - seed = seed, - depth = depth - } - - call genotyping.SnpBasedGenotypingSimulatedMaps as SupermassaMaps { - input: - simu_onemap_obj = truth_vcf.onemap_obj, - onemap_obj = vcf2onemap.onemap_obj, - vcf_file = vcfs[origin], - genotyping_program = "supermassa", - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases, - SNPCall_program = method, - CountsFrom = origin, - cross = cross, - multi_obj = MultiVcf2onemap.onemap_obj, - max_cores = max_cores, - simu_vcfR = truth_vcf.vcfR_obj, - seed = seed, - depth = depth - } - - call genotyping.SnpBasedGenotypingSimulatedMaps as PolyradMaps { - input: - simu_onemap_obj = truth_vcf.onemap_obj, - onemap_obj = vcf2onemap.onemap_obj, - vcf_file = vcfs[origin], - genotyping_program = "polyrad", - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases, - SNPCall_program = method, - CountsFrom = origin, - cross = cross, - multi_obj = MultiVcf2onemap.onemap_obj, - max_cores = max_cores, - simu_vcfR = truth_vcf.vcfR_obj, - seed = seed, - depth = depth - } - } - - call gusmap.GusmapMaps { - input: - simu_onemap_obj = truth_vcf.onemap_obj, - vcf_file = analysis_vcf, - new_vcf_file = analysis_bam, - SNPCall_program = method, - GenotypeCall_program = "gusmap", - ref_alt_alleles = ref_alt_alleles, - simulated_phases = simulated_phases - } - - output { - Array[File] gusmap_out = GusmapMaps.RDatas - } -} diff --git a/test_legacy/read_simulation/test_data.json b/test_legacy/read_simulation/test_data.json deleted file mode 100755 index c27aa34..0000000 --- a/test_legacy/read_simulation/test_data.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "ref_fasta": { - "name": "sub2M.fa" - }, - "ref_dict": { - "name": "sub2M.dict" - }, - "ref_ann": { - "name": "sub2M.fa.ann" - }, - "ref_sa": { - "name": "sub2M.fa.sa" - }, - "ref_amb": { - "name": "sub2M.fa.amb" - }, - "ref_bwt": { - "name": "sub2M.fa.bwt" - }, - "ref_fasta_index": { - "name": "sub2M.fa.fai" - }, - "ref_pac": { - "name": "sub2M.fa.pac" - }, - "emp_vcf": {"name": "ref.variants.vcf"}, - "emp_vcf_no_indels": {"name": "ref.variants.noindel.recode.vcf"}, - "ref_map": {"name": "ref.map.csv"}, - "simu_haplo": {"name": "22_10_haplo_simu.rds"}, - "simulated_phases": {"name": "simulated_phases.txt"}, - "true_vcf": {"name": "22_10_simu.vcf"}, - "ref_alt_alleles": {"name": "ref_alt_alleles.txt"} -} diff --git a/test_legacy/read_simulation/test_read_simulation.py b/test_legacy/read_simulation/test_read_simulation.py deleted file mode 100755 index e5ca87c..0000000 --- a/test_legacy/read_simulation/test_read_simulation.py +++ /dev/null @@ -1,49 +0,0 @@ -def test_read_simulation_workflow(workflow_data, workflow_runner): - inputs = { - "family": { - "cmBymb": None, - "cross": "F1", - "doses": None, - "ploidy": 2, - "popsize": 20, - "seed": 22, - }, - "references": { - 
"ref_fasta": workflow_data["ref_fasta"], - "ref_dict": workflow_data["ref_dict"], - "ref_ann": workflow_data["ref_ann"], - "ref_sa": workflow_data["ref_sa"], - "ref_amb": workflow_data["ref_amb"], - "ref_bwt": workflow_data["ref_bwt"], - "ref_fasta_index": workflow_data["ref_fasta_index"], - "ref_pac": workflow_data["ref_pac"], - }, - "sequencing": { - "emp_vcf": workflow_data["emp_vcf_no_indels"], - "enzyme1": "HinDIII", - "enzyme2": "NlaIII", - "library_type": "ddRAD", - "chromosome": "Chr10", - "pcr_cycles": 0, - "read_length": 300, - "insert_size": 500, - "insert_size_dev": 30, - "depth": 10, - "depth_parents": 10, - "ref_map": workflow_data["ref_map"], - "multiallelics": "TRUE", - "vcf_parent1": "PT_F", - "vcf_parent2": "PT_M", - }, - "max_cores": 4, - } - - expected = { - "simu_haplo": workflow_data["simu_haplo"], - "simulated_phases": workflow_data["simulated_phases"], - "true_vcf": workflow_data["true_vcf"], - "ref_alt_alleles": workflow_data["ref_alt_alleles"], - } - workflow_runner( - "tasks/create_alignment_from_read_simulations.wdl", inputs, expected - ) From a3d13f56fb5d14abb7951e49f912cc0e8a85ab0c Mon Sep 17 00:00:00 2001 From: Lucas Taniguti Date: Tue, 22 Nov 2022 06:07:16 -0300 Subject: [PATCH 04/10] refactor: move test files to tests module --- data/populus/SRRs.txt | 138 ------------------ .../data/inputs}/EmpiricalMaps.inputs.json | 1 - .../inputs}/EmpiricalSNPCalling.inputs.json | 0 .../EmpiricalSNPCalling_gatk.inputs.json | 1 - .../data/inputs}/SimulatedReads.inputs.json | 21 ++- tests/data/populus/SRRs.txt | 138 ++++++++++++++++++ {data => tests/data}/populus/download_SRRs.sh | 0 {data => tests/data}/populus/download_sub.sh | 0 {data => tests/data}/populus/sample_info | 0 {data => tests/data}/populus/sample_info_sub | 0 .../data}/toy_simulations/ref.map.csv | 0 .../ref.variants.noindel.recode.vcf | 0 .../data}/toy_simulations/ref.variants.vcf | 0 13 files changed, 148 insertions(+), 151 deletions(-) delete mode 100644 data/populus/SRRs.txt rename {inputs => tests/data/inputs}/EmpiricalMaps.inputs.json (99%) rename {inputs => tests/data/inputs}/EmpiricalSNPCalling.inputs.json (100%) rename {inputs => tests/data/inputs}/EmpiricalSNPCalling_gatk.inputs.json (99%) rename {inputs => tests/data/inputs}/SimulatedReads.inputs.json (55%) create mode 100644 tests/data/populus/SRRs.txt rename {data => tests/data}/populus/download_SRRs.sh (100%) rename {data => tests/data}/populus/download_sub.sh (100%) rename {data => tests/data}/populus/sample_info (100%) rename {data => tests/data}/populus/sample_info_sub (100%) rename {data => tests/data}/toy_simulations/ref.map.csv (100%) rename {data => tests/data}/toy_simulations/ref.variants.noindel.recode.vcf (100%) rename {data => tests/data}/toy_simulations/ref.variants.vcf (100%) diff --git a/data/populus/SRRs.txt b/data/populus/SRRs.txt deleted file mode 100644 index 385b321..0000000 --- a/data/populus/SRRs.txt +++ /dev/null @@ -1,138 +0,0 @@ -SRR6249768 -SRR6249769 -SRR6249770 -SRR6249771 -SRR6249772 -SRR6249773 -SRR6249774 -SRR6249775 -SRR6249776 -SRR6249778 -SRR6249779 -SRR6249780 -SRR6249781 -SRR6249782 -SRR6249783 -SRR6249784 -SRR6249785 -SRR6249786 -SRR6249787 -SRR6249788 -SRR6249789 -SRR6249790 -SRR6249791 -SRR6249792 -SRR6249793 -SRR6249794 -SRR6249795 -SRR6249796 -SRR6249797 -SRR6249798 -SRR6249799 -SRR6249800 -SRR6249801 -SRR6249802 -SRR6249803 -SRR6249804 -SRR6249805 -SRR6249806 -SRR6249807 -SRR6249808 -SRR6249809 -SRR6249810 -SRR6249811 -SRR6249812 -SRR6249813 -SRR6249814 -SRR6249815 -SRR6249816 -SRR6249817 -SRR6249818 
-SRR6249819 -SRR6249820 -SRR6249821 -SRR6249822 -SRR6249823 -SRR6249824 -SRR6249825 -SRR6249826 -SRR6249827 -SRR6249828 -SRR6249829 -SRR6249830 -SRR6249831 -SRR6249832 -SRR6249833 -SRR6249834 -SRR6249835 -SRR6249836 -SRR6249837 -SRR6249838 -SRR6249839 -SRR6249840 -SRR6249841 -SRR6249842 -SRR6249843 -SRR6249844 -SRR6249845 -SRR6249847 -SRR6249848 -SRR6249849 -SRR6249850 -SRR6249851 -SRR6249852 -SRR6249853 -SRR6249854 -SRR6249855 -SRR6249856 -SRR6249857 -SRR6249858 -SRR6249859 -SRR6249860 -SRR6249861 -SRR6249862 -SRR6249863 -SRR6249864 -SRR6249865 -SRR6249866 -SRR6249867 -SRR6249868 -SRR6249869 -SRR6249870 -SRR6249871 -SRR6249872 -SRR6249873 -SRR6249874 -SRR6249875 -SRR6249876 -SRR6249877 -SRR6249878 -SRR6249879 -SRR6249880 -SRR6249881 -SRR6249882 -SRR6249883 -SRR6249884 -SRR6249885 -SRR6249886 -SRR6249887 -SRR6249888 -SRR6249889 -SRR6249890 -SRR6249891 -SRR6249892 -SRR6249893 -SRR6249894 -SRR6249895 -SRR6249896 -SRR6249897 -SRR6249898 -SRR6249899 -SRR6249900 -SRR6249901 -SRR6249902 -SRR6249903 -SRR6249904 -SRR6249905 -SRR6249777 -SRR6249846 \ No newline at end of file diff --git a/inputs/EmpiricalMaps.inputs.json b/tests/data/inputs/EmpiricalMaps.inputs.json similarity index 99% rename from inputs/EmpiricalMaps.inputs.json rename to tests/data/inputs/EmpiricalMaps.inputs.json index 3cb92ea..7dc8c50 100644 --- a/inputs/EmpiricalMaps.inputs.json +++ b/tests/data/inputs/EmpiricalMaps.inputs.json @@ -16,4 +16,3 @@ "Maps.gatk_vcf": "File", "Maps.freebayes_vcf_bam_counts": "File" } - diff --git a/inputs/EmpiricalSNPCalling.inputs.json b/tests/data/inputs/EmpiricalSNPCalling.inputs.json similarity index 100% rename from inputs/EmpiricalSNPCalling.inputs.json rename to tests/data/inputs/EmpiricalSNPCalling.inputs.json diff --git a/inputs/EmpiricalSNPCalling_gatk.inputs.json b/tests/data/inputs/EmpiricalSNPCalling_gatk.inputs.json similarity index 99% rename from inputs/EmpiricalSNPCalling_gatk.inputs.json rename to tests/data/inputs/EmpiricalSNPCalling_gatk.inputs.json index 0aef863..a2965d0 100644 --- a/inputs/EmpiricalSNPCalling_gatk.inputs.json +++ b/tests/data/inputs/EmpiricalSNPCalling_gatk.inputs.json @@ -22,4 +22,3 @@ "SNPCalling_gatk.GatkGenotyping.depth": "Int? 
(optional)", "SNPCalling_gatk.samples_info": "File" } - diff --git a/inputs/SimulatedReads.inputs.json b/tests/data/inputs/SimulatedReads.inputs.json similarity index 55% rename from inputs/SimulatedReads.inputs.json rename to tests/data/inputs/SimulatedReads.inputs.json index e2d6a60..cf62b09 100644 --- a/inputs/SimulatedReads.inputs.json +++ b/tests/data/inputs/SimulatedReads.inputs.json @@ -9,19 +9,19 @@ "cross": "F1" }, "SimulatedReads.references": { - "ref_fasta": "data/toy_genome/Chr10.2M.fa", - "ref_dict": "data/toy_genome/Chr10.2M.dict", - "ref_ann": "data/toy_genome/Chr10.2M.fa.ann", - "ref_sa": "data/toy_genome/Chr10.2M.fa.sa", - "ref_amb": "data/toy_genome/Chr10.2M.fa.amb", - "ref_bwt": "data/toy_genome/Chr10.2M.fa.bwt", - "ref_fasta_index": "data/toy_genome/Chr10.2M.fa.fai", - "ref_pac": "data/toy_genome/Chr10.2M.fa.pac" + "ref_fasta": "tests/data/toy_genome/Chr10.2M.fa", + "ref_dict": "tests/data/toy_genome/Chr10.2M.dict", + "ref_ann": "tests/data/toy_genome/Chr10.2M.fa.ann", + "ref_sa": "tests/data/toy_genome/Chr10.2M.fa.sa", + "ref_amb": "tests/data/toy_genome/Chr10.2M.fa.amb", + "ref_bwt": "tests/data/toy_genome/Chr10.2M.fa.bwt", + "ref_fasta_index": "tests/data/toy_genome/Chr10.2M.fa.fai", + "ref_pac": "tests/data/toy_genome/Chr10.2M.fa.pac" }, "SimulatedReads.global_seed": "8080", "SimulatedReads.sequencing": { "vcf_parent2": "PT_F", - "emp_vcf": "data/toy_simulations/ref.variants.vcf", + "emp_vcf": "tests/data/toy_simulations/ref.variants.vcf", "enzyme1": "HinDIII", "pcr_cycles": "5", "vcf_parent1": "PT_M", @@ -30,11 +30,10 @@ "chromosome": "Chr10", "rm_dupli": "FALSE", "depth": "20", - "ref_map": "data/toy_simulations/ref.map.csv", + "ref_map": "tests/data/toy_simulations/ref.map.csv", "enzyme2": "NlaIII", "multiallelics": "TRUE" }, "SimulatedReads.chunk_size": "1", "SimulatedReads.max_cores": "3" } - diff --git a/tests/data/populus/SRRs.txt b/tests/data/populus/SRRs.txt new file mode 100644 index 0000000..ac65c06 --- /dev/null +++ b/tests/data/populus/SRRs.txt @@ -0,0 +1,138 @@ +SRR6249768 +SRR6249769 +SRR6249770 +SRR6249771 +SRR6249772 +SRR6249773 +SRR6249774 +SRR6249775 +SRR6249776 +SRR6249778 +SRR6249779 +SRR6249780 +SRR6249781 +SRR6249782 +SRR6249783 +SRR6249784 +SRR6249785 +SRR6249786 +SRR6249787 +SRR6249788 +SRR6249789 +SRR6249790 +SRR6249791 +SRR6249792 +SRR6249793 +SRR6249794 +SRR6249795 +SRR6249796 +SRR6249797 +SRR6249798 +SRR6249799 +SRR6249800 +SRR6249801 +SRR6249802 +SRR6249803 +SRR6249804 +SRR6249805 +SRR6249806 +SRR6249807 +SRR6249808 +SRR6249809 +SRR6249810 +SRR6249811 +SRR6249812 +SRR6249813 +SRR6249814 +SRR6249815 +SRR6249816 +SRR6249817 +SRR6249818 +SRR6249819 +SRR6249820 +SRR6249821 +SRR6249822 +SRR6249823 +SRR6249824 +SRR6249825 +SRR6249826 +SRR6249827 +SRR6249828 +SRR6249829 +SRR6249830 +SRR6249831 +SRR6249832 +SRR6249833 +SRR6249834 +SRR6249835 +SRR6249836 +SRR6249837 +SRR6249838 +SRR6249839 +SRR6249840 +SRR6249841 +SRR6249842 +SRR6249843 +SRR6249844 +SRR6249845 +SRR6249847 +SRR6249848 +SRR6249849 +SRR6249850 +SRR6249851 +SRR6249852 +SRR6249853 +SRR6249854 +SRR6249855 +SRR6249856 +SRR6249857 +SRR6249858 +SRR6249859 +SRR6249860 +SRR6249861 +SRR6249862 +SRR6249863 +SRR6249864 +SRR6249865 +SRR6249866 +SRR6249867 +SRR6249868 +SRR6249869 +SRR6249870 +SRR6249871 +SRR6249872 +SRR6249873 +SRR6249874 +SRR6249875 +SRR6249876 +SRR6249877 +SRR6249878 +SRR6249879 +SRR6249880 +SRR6249881 +SRR6249882 +SRR6249883 +SRR6249884 +SRR6249885 +SRR6249886 +SRR6249887 +SRR6249888 +SRR6249889 +SRR6249890 +SRR6249891 +SRR6249892 +SRR6249893 +SRR6249894 +SRR6249895 
+SRR6249896 +SRR6249897 +SRR6249898 +SRR6249899 +SRR6249900 +SRR6249901 +SRR6249902 +SRR6249903 +SRR6249904 +SRR6249905 +SRR6249777 +SRR6249846 diff --git a/data/populus/download_SRRs.sh b/tests/data/populus/download_SRRs.sh similarity index 100% rename from data/populus/download_SRRs.sh rename to tests/data/populus/download_SRRs.sh diff --git a/data/populus/download_sub.sh b/tests/data/populus/download_sub.sh similarity index 100% rename from data/populus/download_sub.sh rename to tests/data/populus/download_sub.sh diff --git a/data/populus/sample_info b/tests/data/populus/sample_info similarity index 100% rename from data/populus/sample_info rename to tests/data/populus/sample_info diff --git a/data/populus/sample_info_sub b/tests/data/populus/sample_info_sub similarity index 100% rename from data/populus/sample_info_sub rename to tests/data/populus/sample_info_sub diff --git a/data/toy_simulations/ref.map.csv b/tests/data/toy_simulations/ref.map.csv similarity index 100% rename from data/toy_simulations/ref.map.csv rename to tests/data/toy_simulations/ref.map.csv diff --git a/data/toy_simulations/ref.variants.noindel.recode.vcf b/tests/data/toy_simulations/ref.variants.noindel.recode.vcf similarity index 100% rename from data/toy_simulations/ref.variants.noindel.recode.vcf rename to tests/data/toy_simulations/ref.variants.noindel.recode.vcf diff --git a/data/toy_simulations/ref.variants.vcf b/tests/data/toy_simulations/ref.variants.vcf similarity index 100% rename from data/toy_simulations/ref.variants.vcf rename to tests/data/toy_simulations/ref.variants.vcf From b37457835b35025db342bc716a6e583e77ec6b63 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 22 Nov 2022 15:03:32 -0600 Subject: [PATCH 05/10] update Readme --- README.md | 240 ++++-------------------------------------------------- 1 file changed, 18 insertions(+), 222 deletions(-) diff --git a/README.md b/README.md index 4241191..e92e0fc 100644 --- a/README.md +++ b/README.md @@ -1,243 +1,39 @@ -## Reads2Map workflows +## Reads2Map -Reads2Map presents [WDL workflows](https://openwdl.org/) to build linkage maps for diploid outcrossing species from sequencing reads. It compares performances of SNP calling, genotype calling, and genetic map builders software. By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes), [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [superMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906), [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) are included. +Reads2Map presents [WDL workflows](https://openwdl.org/) a collection of pipelines to build linkage maps from sequencing reads. Each pipeline release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases). -The main workflows are the `SimulatedReads.wdl`, the `EmpiricalSNPCalling.wdl`, and the `EmpiricalMaps.wdl`. The `SimulatedReads.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. `EmpiricalSNPCalling.wdl` performs the SNP calling and `EmpiricalMaps.wdl` performs the genotype calling and map building in empirical reads. -## Requisites +The main workflows are the `EmpiricalSNPCalling.wdl`, the `EmpiricalMaps.wdl`, and the `SimulatedReads.wdl`. 
`EmpiricalSNPCalling.wdl` performs the SNP calling and `EmpiricalMaps.wdl` performs the genotype calling and map building in empirical reads. The `SimulatedReads.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. -The only software that you will need to download and install to run these workflows are [Java](https://www.java.com/en/), [Cromwell](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/), and [Docker](https://docs.docker.com/install/) or [Singularity](https://sylabs.io/guides/2.6/user-guide/index.html). +By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build. -Clone or download this repository: -``` -git clone https://github.com/Cristianetaniguti/Reads2Map.git # Or download the directory. It is important to download the entire directory, once the main workflows request the tasks and structs. -``` +![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png) -## Building maps from empirical reads +## How to use -The `EmpiricalSNPcalling` requires demultiplexed and cleaned FASTQ files. We made available a suggestion for preprocessing reads in `PreprocessingReads.wdl`. +Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines). -* Adapt the path of the inputs in `inputs/EmpiricalSNPCalling.inputs.json` +To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you which to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies). -**samples_info**: tsv file with the first column with the path to FASTQ files, a second column with sample names, and the third column with sample names and lane specifications. Example: - -``` -data/populus_sub/SRR6249785.sub.fastq I_3_58 I_3_58.Lib1_C11_TTCCACG -data/populus_sub/SRR6249786.sub.fastq I_3_56 I_3_56.Lib1_C10_CCTGCAC -data/populus_sub/SRR6249787.sub.fastq I_3_55 I_3_55.Lib1_C09_AGAAGTC -data/populus_sub/SRR6249788.sub.fastq I_3_66 I_3_66.Lib1_D06_GCCAACT -data/populus_sub/SRR6249795.sub.fastq PT_F PT_F.Lib1_E09_TGAACAT -data/populus_sub/SRR6249808.sub.fastq PT_M PT_M.Lib2_E06_CGATGCG -``` - -**rm_dupli**: if workflow should (TRUE) or not (FALSE) remove the duplicated sequences from the alignment file before the SNP calling analysis; - -**chunk_size**: how many samples to be evaluated by GATK in a single same node; - -**max_cores**: maximum number of cores to be used by alignment and Freebayes tasks; - -**empirical.references** -- ref_fasta: chromosome sequence in fasta format (only one chromosome at a time); -- ref_fasta_index: index made by samtools faidx; -- ref_dict: index made by picard dict; -- ref_sa: index made by bwa index; -- ref_amb: index made by bwa index; -- ref_bwt: index made by bwa index; -- ref_ann: index made by bwa index; -- ref_pac: index made by bwa index. 
- -### Run EmpiricalSNPCalling workflow with the test data set - -In the `Reads2Map/data` directory you can find files to run examples. The `toy_genome` presents two FASTA files one with a sub set of 2 million base pares of chromosome 10 and other with the same subset for chromosome 10 and 11. For the empirical data we will use the second: `Chr10.11.2M.fa`. We must provide also the index files generated by BWA, samtools and picard. You can use the follow containers to create these indexes: - -``` -docker run -v $(pwd):/data/ us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z samtools faidx /data/toy_genome/Chr10.11.2M.fa -docker run -v $(pwd):/data/ us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z /usr/gitc/./bwa index /data/toy_genome/Chr10.11/2M.fa -docker run -v $(pwd):/data/ us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z java -jar /usr/gitc/picard.jar CreateSequenceDictionary R=/data/toy_genome/Chr10.11.2M.fa O=/data/toy_genome/Chr10.11.2M.dict -``` - -or with singularity: - -``` -singularity run --bind $(pwd):/data/ us.gcr.io_broad-gotc-prod_genomes-in-the-cloud_2.5.7-2021-06-09_16-47-48Z.sif samtools faidx /data/Chr10.11.2M.fa -singularity run --bind $(pwd):/data/ us.gcr.io_broad-gotc-prod_genomes-in-the-cloud_2.5.7-2021-06-09_16-47-48Z.sif /usr/gitc/./bwa index /data/Chr10.11.2M.fa -singularity run --bind $(pwd):/data/ us.gcr.io_broad-gotc-prod_genomes-in-the-cloud_2.5.7-2021-06-09_16-47-48Z.sif java -jar /usr/gitc/picard.jar CreateSequenceDictionary R=/data/Chr10.populus.fa O=/data/Chr10.11.2M.dict -``` - -Once we you have all input files, you can use the [Cromwell](https://cromwell.readthedocs.io/en/stable/) engine to run the workflows. Cromwell offers many different ways of doing it. You can check all option available in [its documentation](https://cromwell.readthedocs.io/en/stable/). The most simple way would be: - -``` -Execute the workflow -java -jar /path/to/cromwell.jar run -i inputs/EmpiricalSNPCalling.inputs.json EmpiricalSNPCalling.wdl -``` - -**Warning**: This analysis demand high computer capacity to run. You will be able to run the example dataset on a computer with 4G of RAM, but we suggest setting personalized configurations according to your system. Check some examples of configurations in `.configurations` directory. Here are two examples of how to use them: - -* For storing the metadata and cache in a MySQL database (see also [cromwell-cli](https://github.com/lmtani/cromwell-cli) to easy access to metadata): - -``` -# Open mySQL cointainer -docker run -d -v banco_cromwell:/var/lib/mysql --rm --name mysql-cromwell -p 3307:3306 -e MYSQL_ROOT_PASSWORD=1234 -e MYSQL_DATABASE=cromwell mysql:5.7 - -# Execute the workflow -java -jar -Dconfig.file=.configurations/cromwell_cache.conf -jar cromwell.jar run -i EmpiricalSNPCalling.inputs.json EmpiricalSNPCalling.wdl - -# Or execute the workflow through the server interface -java -jar -Dconfig.file=.configurations/cromwell_cache.conf -jar cromwell.jar server -``` - -In case you use the server interface, you must open it in your browser in the pointed local adress and submit the input (.json), workflow (.wdl) and the directories struct and tasks compressed with `zip -r -u tasks.zip tasks/ structs/` . 
- -* If you are using a High-Performance Computing (HPC) with slurm managment system: - -BATCH file named `slurm_main.sh`: - -``` -#!/bin/bash - -#SBATCH --export=NONE -#SBATCH -J cromwell_Reads2Map -#SBATCH --nodes=1 -#SBATCH --mem=1G -#SBATCH --time=01:30:00 -#SBATCH -o /home/user/Reads2Map.log -#SBATCH -e /home/user/Reads2Map.err - -# Maybe it will be required to import java and singularity modules here. Check the specifications of the HPC. -#module load singularity -#module load java - -java -jar -Dconfig.file=/home/user/Reads2Map/.configurations/cromwell_slurm_sing.conf \ - -jar /home/user/Reads2Map/cromwell-65.jar \ - run /home/user/Reads2Map/EmpiricalSNPCalling.wdl \ - -i /home/user/Reads2Map/inputs/EmpiricalSNPCalling.inputs.json -``` - -When the run is ended, the log description printed in the screen will point the path for the workflow output files. The files outputted by `EmpiricalSNPCalling.wdl` are inputs for `EmpiricalMaps.wdl`. However, we suggest to check the VCF markers quality parameters to apply proper filters before proceed to `EmpiricalMaps.wdl`. The `EmpiricalMaps.wdl` is limited to run only one chromosome, make sure you filter the VCF to retain only the selected one. - -* Adapt the path of the inputs in `inputs/EmpiricalMaps.inputs.json` - -**freebayes_vcf**: vcf file containing markers from freebayes snp calling; - -**gatk_vcf**: vcf file containing markers from gatk snp calling; - -**freebayes_vcf_bam_counts**: vcf file containing markers from freebayes snp calling with AD field replaced by BAM files read counts; - -**gatk_vcf_bam_counts**: vcf file containing markers from gatk snp calling with AD field replaced by BAM files read counts; - -**dataset** -- parent1: parent 1 ID; -- parent2: parent 2 ID; -- name: experiment ID; -- chromosome: chromosome being evaluated (only one allowed); -- cross: cross type (by now, only F1 available); -- multiallelics: consider or not the multiallelic markers. - -Running: - -``` -java -jar -Dconfig.file=.configurations/cromwell_cache.conf -jar cromwell.jar run -i EmpiricalMaps.inputs.json EmpiricalMaps.wdl -``` - -Here there are enough data to test the pipeline but not for having a good resolution genetic map. It contains the two parents and 4 progeny individuals. The original study has eight replicates for each parent and 122 progenies. - -You can download black cottonwood genome assembly (FASTA) and [RADseq reads from the BioProject PRJNA395596](https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA395596) for testing with the full data set: - -``` -bash /user/path/Reads2Map/data/populus/download_SRRs.sh -``` - -Check session [`Running large datasets`](#Running-large-datasets) before procede with the analysis with entire data set. 
- -## Run SimulatedReads workflow - -* Adapt the path of the inputs in `inputs/SimulatedReads.input.json` - -**number_of_families** : an integer defining the number of families with `popsize` individuals to be simulated; - -**global_seed**: This seed is used to generate the families seeds; - -**max_cores**: Maximum number of computer cores to be used; - -**filters**: filters in to be applied by VCFtools in the VCF file after SNP calling; - -**chunk_size**: how many samples are to be evaluated by GATK in a single same node - -**family**: -- seed: seed to reproduce the analysis after - warning: some steps are still random, as the reads simulation; -- popsize: number of individuals at the progeny population; -- ploidy: the ploidy of the species, by now only diploid (2) species are supported; -- cross: cross-type. By now, only "F1" option is available; -- doses: if you do not have a VCF file with variants to be simulated, you can define here the percentage of markers with doses 0, 1, and 2 (when the cross is F1); -- cmBymb: if you do not have a reference linkage map, you can simulate using a general recombination rate according to other genetic maps of the specie - -**sequencing**: -- library_type: the options RADseq, WGS, and Exome are available. -- multiallelics: Define with "TRUE" or "FALSE", if the analysis should try to include multiallelic markers in the linkage maps. -- emp_vcf: reference VCF file with the variants to be simulated. -- emp_bam: reference BAM file. It will be used to define the reads profile in WGS and Exome simulation. -- ref_map: reference linkage map, it is a text file with two columns, one named "cM" with values for centimorgan position of markers and the other named "bp" with the respective base pair position of each marker. The markers in your reference map do not need to be the same as the VCF file. Using splines, this map is used to train a model to define the position in centimorgan of the simulated variants in the genome. -- enzyme1: If RADseq, the enzyme used to reduce the genome representation. -- enzyme2: If RADseq, the second enzyme used to reduce the genome representation. -- vcf_parent1: parent 1 ID in the reference VCF. -- vcf_parent2: parent 2 ID in the reference VCF. -- chromosome: chromosome ID to be simulated. -- pcr_cycles: If RADseq, the number of PCR cycles used in the library preparation (default: 9). -- insert_size: If RADseq, define the insert size in bp (default: 350). -- read_length: If RADseq, define the read length in bp (default: 150). -- depth: sequencing depth (default: 20). -- insert_size_dev: If RADseq, define the insert size standard deviation in bp (default: 35). - -**references** -- ref_fasta: chromosome sequence in FASTA format (only one chromosome at a time, and no N are allowed) -- ref_fasta_index: index made by samtools faidx -- ref_dict: index made by picard dict -- ref_sa: index made by bwa index -- ref_amb: index made by bwa index -- ref_bwt: index made by bwa index -- ref_ann: index made by bwa index -- ref_pac: index made by bwa index - -### Run test dataset for simulations - -In the directory `data/toy_simulations` you will find input files required to simulate reads and maps based on a subset of *Populus trichocarpa* chromosome 10. These files are: 1) `ref.variants.noindel.recode.vcf` a reference VCF file only with SNPs (indels are not supported by now); 2) and a reference linkage map `ref.map.csv`. The path to the files must be defined in `inputs/SimulatedReads.inputs.json`. 
- -Run the workflow: - -``` -Execute the workflow -java -jar -Dconfig.file=.configurations/cromwell_cache.conf -jar cromwell.jar run -i SimulatedReads.inputs.json SimulatedReads.wdl -``` - -**Warning**: This analysis demand high computer capacity to run. You will be able to run the example dataset on a computer with 4G of RAM, but we suggest setting personalized configurations according to your system. Check some examples of configurations in `.configurations` directory. You can also check other option in the [Cromwell documentation](https://cromwell.readthedocs.io/en/stable/). - -## Running large datasets - -By default, the workflows are configurated for High-Performance Computing (HPC) services and we provide some configurations files examples in `.configurations` directory. - -If you don't have an HPC available, you may want to check cloud services. There are several services available, we would suggest starting your search by the [terra.bio](https://terra.bio/resources/analysis-tools/) platform. To adapt the workflows to cloud services it is required to change the `runtime` session of the workflow tasks according to the cloud format. - -The time and memory specifications in `runtime` session of each task also need to be adapted for different datasets. Consult the `Reads2MapApp` session `Workflows efficiency` to optimize the time and memory requirements for your data set. +## Documentation -## Visualize Reads2Map workflows output in Reads2MapApp +Check the description of the inputs for the pipelines: -You can search for all workflow's intermediary files in the `cromwell-executions` directory generated by the Cromwell. The `log` file will specify the workflow id and path for each executed task. The final output of `EmpiricalMaps.wdl` and `SimulatedReads.wdl` are compressed files called `EmpiricalReads_results.tar.gz` and `SimulatedReads_results.tar.gz`. These files contain tables for an overview of the entire procedure. They are inputs for the `Reads2MapApp`, a shiny app that provides a graphical view of the results. Check the [Reads2MapApp repository](https://github.com/Cristianetaniguti/Reads2MapApp) for further information. 
+* [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Read2Map/EmpiricalReads2Map.html) +* [SimulatedReads](https://cristianetaniguti.github.io/Tutorials/Reads2Map/SimulatedReads.html) -entry +Check how to evaluate the workflows results in Reads2MapApp Shiny: -## Documentation +* [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) -Here are some tutorials for further details about how to use the workflows: +Once you selected the best pipeline using a subset of your data, you can build a complete high-density linkage map: -* [Introduction](https://cristianetaniguti.github.io/Tutorials/onemap_workflows/docs/introduction.html) -* [Running SimulatedReads workflow](https://cristianetaniguti.github.io/Tutorials/onemap_workflows/docs/SimulatedReads.html) -* [Running EmpiricalReads2Map workflow](https://cristianetaniguti.github.io/Tutorials/onemap_workflows/docs/EmpiricalReads2Map.html) -* [High density maps](https://cristianetaniguti.github.io/Tutorials/onemap_workflows/docs/High_density_maps.html) +* [Quick Guide to build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/High_density_maps.html) -You can also have more details about the workflows and how they can be applied: +Check more information and example of usage in: -* [Papers]() +* [Paper in preparation]() ## Third-party software and images From 56c0573575ba2b5c53422b55489ec616dd8c779c Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 22 Nov 2022 17:09:13 -0600 Subject: [PATCH 06/10] add description --- pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md | 12 ++++++++++++ .../EmpiricalSNPCalling.changelog.md | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md index bf92f77..c9fdb15 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md @@ -1,3 +1,15 @@ # 1.0.0 Initial release + +This workflow receives as input VCF files from EmpiricalSNPCalling workflow and result in 34 linkage maps for a single chromosome running the combinations: + +* SNP calling: GATK and Freebayes +* Dosage/genotype calling: updog, polyRAD and SuperMASSA +* Linkage map build software: OneMap 3.0 and GUSMap +* Using genotype probabilities from GATK, Freebayes, updog, polyRAD and SuperMASSA, and a global error rate of 5% and 0.001% in the OneMap HMM. + +It also has the options to: + +* Include or not multiallelic (MNP) markers +* Apply filters using VCFtools diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md index bf92f77..34e16f2 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md @@ -1,3 +1,12 @@ # 1.0.0 Initial release + +This workflow performs the alignment of FASTQ to a reference genome, SNP calling with GATK tools (HaplotypeCaller, GenomicsDBImport, and GenotypeGVCFs) and Freebayes. The samples are splitted into chunks to be run in different nodes and optimize the analyses. Set the number of samples by chunk in the 'chunk_size' input. Use 'max_cores' to define number of cores to be used in each node. 
+ +The workflow also includes the options to: + +* Remove or not the read duplicates +* Perform the Hard Filtering in GATK results +* Replace the VCF AD format field by counts from BAM files +* Run MCHap software to build haplotypes based on GATK called markers \ No newline at end of file From 20f289a5f74840073db342bc716a6e583e77ec6b63 Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 22 Nov 2022 18:41:24 -0600 Subject: [PATCH 07/10] releases description --- .../EmpiricalMaps/EmpiricalMaps.changelog.md | 6 ++++++ .../EmpiricalSNPCalling.changelog.md | 8 +++++++- .../PreprocessingReads.changelog.md | 10 +++++----- .../SimulatedReads.changelog.md | 20 +++++++++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md index c9fdb15..1b3b99f 100644 --- a/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md +++ b/pipelines/EmpiricalMaps/EmpiricalMaps.changelog.md @@ -13,3 +13,9 @@ It also has the options to: * Include or not multiallelic (MNP) markers * Apply filters using VCFtools + +This workflow uses: + +* Diploid bi-parental F1 population +* Genomic positions for markers order +* A single chromosome from a reference genome \ No newline at end of file diff --git a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md index 34e16f2..73a8c7d 100644 --- a/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md +++ b/pipelines/EmpiricalSNPCalling/EmpiricalSNPCalling.changelog.md @@ -9,4 +9,10 @@ The workflow also includes the options to: * Remove or not the read duplicates * Perform the Hard Filtering in GATK results * Replace the VCF AD format field by counts from BAM files -* Run MCHap software to build haplotypes based on GATK called markers \ No newline at end of file +* Run MCHap software to build haplotypes based on GATK called markers + +This workflow requires: + +* Diploid or polyploid species +* Single-end reads +* A reference genome \ No newline at end of file diff --git a/pipelines/PreprocessingReads/PreprocessingReads.changelog.md b/pipelines/PreprocessingReads/PreprocessingReads.changelog.md index f575f41..cb35935 100644 --- a/pipelines/PreprocessingReads/PreprocessingReads.changelog.md +++ b/pipelines/PreprocessingReads/PreprocessingReads.changelog.md @@ -1,9 +1,9 @@ -# 1.0.1 +# 1.0.0 -- Automatic releases +Initial release -- Testing changelog +This workflow uses the STACKS process_radtags plugin to demultiplex GBS FASTQ files and to filter reads by the presence of the enzyme cut site and by sequence quality. The cutadapt software is also used to remove adapter sequences. -# 1.0.0 +This workflow requires: -Initial release +* Genotyping-by-sequencing data \ No newline at end of file diff --git a/pipelines/SimulatedReads/SimulatedReads.changelog.md b/pipelines/SimulatedReads/SimulatedReads.changelog.md index bf92f77..4f9ef67 100644 --- a/pipelines/SimulatedReads/SimulatedReads.changelog.md +++ b/pipelines/SimulatedReads/SimulatedReads.changelog.md @@ -1,3 +1,23 @@ # 1.0.0 Initial release + +This workflow performs simulations of one or more (defined by `number_of_families`) bi-parental outcrossing population haplotypes using the PedigreeSim software, based on a provided linkage map and SNP markers. It uses the RADinitio software, the simulated haplotypes, and a reference genome to also simulate genotyping-by-sequencing read sequences.
After, it performs the SNP and genotype calling and builds 68 linkage maps from the combinations: + +* SNP calling: GATK and Freebayes +* Dosage/genotype calling: updog, polyRAD and SuperMASSA +* Linkage map build software: OneMap 3.0 and GUSMap +* Using genotype probabilities from GATK, Freebayes, updog, polyRAD and SuperMASSA, and a global error rate of 5% and 0.001% in the OneMap HMM. + +It also has the options to: + +* Include or not multiallelic (MNP) markers +* Apply filters using VCFtools + +This workflow uses: + +* A reference linkage map +* A reference VCF file +* A single chromosome from a reference genome +* Diploid bi-parental F1 population +* Genomic positions for markers order From 07caf236289df308c7026dc8a8df6c942360b70b Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 22 Nov 2022 19:24:31 -0600 Subject: [PATCH 08/10] badget --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e92e0fc..f7e84b4 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,15 @@ +[![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) + +[![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map) + ## Reads2Map Reads2Map presents [WDL workflows](https://openwdl.org/) a collection of pipelines to build linkage maps from sequencing reads. Each pipeline release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases). - The main workflows are the `EmpiricalSNPCalling.wdl`, the `EmpiricalMaps.wdl`, and the `SimulatedReads.wdl`. `EmpiricalSNPCalling.wdl` performs the SNP calling and `EmpiricalMaps.wdl` performs the genotype calling and map building in empirical reads. The `SimulatedReads.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build. 
- ![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png) ## How to use From 7831bd4328a5f4b094c8a188c11ad488448b1f0a Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Tue, 22 Nov 2022 19:25:50 -0600 Subject: [PATCH 09/10] same line --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index f7e84b4..5b223f1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ [![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) - [![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map) ## Reads2Map From 8fdd1ed2294f0aea7a3af66c50248db6eee1542f Mon Sep 17 00:00:00 2001 From: cristianetaniguti Date: Fri, 25 Nov 2022 12:53:22 -0600 Subject: [PATCH 10/10] add preprint --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5b223f1..b0afc7d 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,9 @@ To run a pipeline, first navigate to [Reads2Map releases page](https://github.co Check the description of the inputs for the pipelines: -* [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Read2Map/EmpiricalReads2Map.html) -* [SimulatedReads](https://cristianetaniguti.github.io/Tutorials/Reads2Map/SimulatedReads.html) +* [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Reads2Map/EmpiricalReads.html) + +* [SimulatedReads](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html) Check how to evaluate the workflows results in Reads2MapApp Shiny: @@ -30,11 +31,11 @@ Check how to evaluate the workflows results in Reads2MapApp Shiny: Once you selected the best pipeline using a subset of your data, you can build a complete high-density linkage map: -* [Quick Guide to build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/High_density_maps.html) +* [Quick Guide to build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html) -Check more information and example of usage in: +Check more information and examples of usage in: -* [Paper in preparation]() +* [Taniguti, C. H., Taniguti, L. M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., & Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v1) ## Third-party software and images
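For readers following the "How to use" section introduced in the README changes above, a minimal sketch of launching a released pipeline with Cromwell (one of the execution engines the README names) is shown below. The asset filenames are placeholders, not the actual names shipped by any release; substitute whatever WDL workflow, inputs JSON, and dependency ZIP the chosen release tag provides.

```bash
# Sketch only: filenames below are placeholders for the assets attached to a release tag
# at https://github.com/Cristianetaniguti/Reads2Map/releases
WDL=EmpiricalSNPCalling.wdl              # main workflow downloaded from the release assets
INPUTS=EmpiricalSNPCalling.inputs.json   # inputs JSON, edited with your FASTQ paths and sample sheet
DEPS=EmpiricalSNPCalling_imports.zip     # ZIP with the task/struct WDLs the workflow imports

# Cromwell: -i points to the inputs JSON, -p (--imports) to the dependency ZIP
java -jar cromwell.jar run "$WDL" -i "$INPUTS" -p "$DEPS"

# Roughly equivalent miniWDL call, assuming the imported WDLs are unpacked next to the workflow:
# miniwdl run "$WDL" -i "$INPUTS"
```

This mirrors the `java -jar cromwell.jar run -i <inputs>.json <workflow>.wdl` invocations used in the pre-release README sections removed by PATCH 05, with the release dependency ZIP taking the place of the hand-zipped `tasks/` and `structs/` directories.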