diff --git a/conda_setup.yml b/conda_setup.yml index 5e596f7..c81b970 100644 --- a/conda_setup.yml +++ b/conda_setup.yml @@ -1,5 +1,6 @@ name: lepwrap channels: + - conda-forge/label/main - bioconda - conda-forge - defaults @@ -148,7 +149,7 @@ dependencies: - multidict=5.1.0=py39h3811e60_1 - nbformat=5.1.3=pyhd8ed1ab_0 - ncurses=6.2=h58526e2_4 - - networkx=2.3=py_0 + - networkx=2.5=py_0 - numpy=1.20.3=py39hdbf815f_0 - oauth2client=4.1.3=py_0 - oauthlib=3.0.1=py_0 @@ -206,6 +207,7 @@ dependencies: - r-cli=2.5.0=r40hc72bb7e_0 - r-clipr=0.7.1=r40h142f84f_0 - r-colorspace=2.0_1=r40hcfec24a_0 + - r-cowplot=1.1.1=r40hc72bb7e_0 - r-cpp11=0.2.7=r40hc72bb7e_0 - r-crayon=1.4.1=r40hc72bb7e_0 - r-curl=4.3.1=r40hcfec24a_0 @@ -223,9 +225,11 @@ dependencies: - r-farver=2.1.0=r40h03ef668_0 - r-forcats=0.5.1=r40hc72bb7e_0 - r-fs=1.5.0=r40h0357c0b_0 + - r-fuzzyjoin=0.1.6=r40_0 - r-gargle=1.1.0=r40hc72bb7e_0 - r-gdtools=0.2.2=r40h36050f4_1 - r-generics=0.1.0=r40hc72bb7e_0 + - r-geosphere=1.5_10=r40hcfec24a_2 - r-ggplot2=3.3.3=r40hc72bb7e_0 - r-glue=1.4.2=r40hcfec24a_0 - r-googledrive=1.0.1=r40h6115d3f_1 @@ -281,6 +285,8 @@ dependencies: - r-rvest=1.0.0=r40hc72bb7e_0 - r-scales=1.1.1=r40h6115d3f_0 - r-selectr=0.4_2=r40h6115d3f_1 + - r-sp=1.4_5=r40hcfec24a_0 + - r-stringdist=0.9.6.3=r40hcfec24a_0 - r-stringi=1.6.2=r40hcabe038_0 - r-stringr=1.4.0=r40h6115d3f_2 - r-svglite=2.0.0=r40h03ef668_0 @@ -354,4 +360,4 @@ dependencies: - yarl=1.6.3=py39h3811e60_1 - zipp=3.4.1=pyhd8ed1ab_0 - zlib=1.2.11=h516909a_1010 - - zstd=1.5.0=ha95c52a_0 \ No newline at end of file + - zstd=1.5.0=ha95c52a_0 diff --git a/config.yaml b/config.yaml index 77f26f5..d32e040 100644 --- a/config.yaml +++ b/config.yaml @@ -60,7 +60,7 @@ exp_lg: 24 iterations: 100 # Set threads_per to the number of threads you would like to use per linkage group for ordering -threads_per: 5 +threads_per: 2 # Choose your distance method by commenting the line you dont want dist_method: "useKosambi=1" @@ -76,14 +76,14 @@ phase_iterations: 2 # Edge trimming will examine the first and last X% of markers in a linkage group # and remove clusters that are N centimorgans away from the next marker # you can "skip" trimming by making edge_length really low (e.g. 1) -# and trim_cutoff really high (e.g. 40) +# and trim_cutoff really high (e.g. 50) # Set edge_length to the percent number of markers you would like to examine from either end of the linkage group # Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" -edge_length: 15 +edge_length: 1 # Set trim_cuttoff to the centiMorgan distance cutoff (10 is reasonable) -trim_cutoff: 10 +trim_cutoff: 40 @@ -91,7 +91,7 @@ trim_cutoff: 10 #### Lep-Anchor #### #################### # change this to true if you also want to run Lep-Anchor -run_lepanchor: false +run_lepanchor: true # the path to the genome assembly you are trying to anchor # ideally it *is not* gzipped and ends in .fa, but there are built-in workarounds if it is. @@ -114,3 +114,19 @@ proximity_file: "/dev/null" OS_info: "Ubuntu" #OS_info: "CentOS5" #OS_info: "CentOS6" + + +#-----------------# +# Edge Trimming # +#-----------------# +# Edge trimming will examine the first and last X% of markers in a linkage group +# and remove clusters that are N% centimorgans (of the total cM span) away from +# the next marker. You can "skip" trimming by making edge_length really low (e.g. 1) +# and trim_cutoff really high (e.g. 50) + +# Set edge_length to the percent number of markers you would like to examine from either end of the linkage group +# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" +LA_edge_length: 20 + +# Set trim_cuttoff to the centiMorgan distance cutoff (5 is reasonable) +LA_trim_cutoff: 3 \ No newline at end of file diff --git a/rules/LepAnchor.smk b/rules/LepAnchor.smk index 226c3d9..6a6c666 100644 --- a/rules/LepAnchor.smk +++ b/rules/LepAnchor.smk @@ -9,22 +9,32 @@ proximity = config["proximity_file"] lg = config["lg_count"] lg_range = list(range(1,lg+1)) os_name = config["OS_info"] +edgelen = config["LA_edge_length"] +trimdist = config["LA_trim_cutoff"] rule all: input: - fasta = "10_Anchoring/Anchored.contigs.fa.gz", - scaff = "10_Anchoring/Anchored.scaffolds.fa.gz", - mareydata = "11_MareyMaps/marey.data.gz", - mareymaps = "11_MareyMaps/LepAnchor.mareymaps.pdf" + fasta = "12_Fasta/Anchored.contigs.fa.gz", + scaff = "12_Fasta/Anchored.scaffolds.fa.gz", + fastaonly = "12_Fasta/Anchored.contigs.only.fa.gz", + scaffonly = "12_Fasta/Anchored.scaffolds.only.fa.gz", + mareydata = "13_MareyMaps/data.marey.gz", + mareymaps = "13_MareyMaps/LepAnchor.mareymaps.pdf", + trimmedmareymaps = "15_TrimNewIntervals/LepAnchor.mareymaps.pdf", + trimsummary = "15_TrimNewIntervals/LA.trim.summary.pdf" message: """ Lep-Anchor has finished. Good luck with the rest of your analyses! Output Files: ============= - contig fasta file | 10_Anchoring/Anchored.contigs.fa.gz - scaffold fasta file | 10_Anchoring/Anchored.scaffolds.fa.gz - converted linakge maps | 11_MareyMaps/marey.data.gz - summary MareyMaps | 11_MareyMaps/LepAnchor.mareymaps.pdf + contig fasta file | {input.fasta} + contigs-only fasta | {input.fastaonly} + scaffold fasta file | {input.scaff} + scaffolds-only fasta | {input.scaffonly} + converted linakge maps | {input.mareydata} + untrimmed marey maps | {input.mareymaps} + interval trimming summary | {input.trimsummary} + trimmed marey maps | {input.trimmedmareymaps} """ rule repeatmask: @@ -123,7 +133,7 @@ rule chain_2: rule extract_markers: input: "2_Filtering/data.filtered.lepmap3.gz" - output: "snps.txt" + output: report("snps.txt", category = "Data") message: "Extracting marker information from Lep-Map3 data file {input}" shell: "scripts/extract_markers.sh {input}" @@ -133,7 +143,7 @@ rule generate_intervals: markers = "snps.txt", intervals = expand("7_Intervals/ordered.{x}.intervals", x = range(1, lg + 1)) output: - intervals = "10_Anchoring/lepmap3_intervals.la" + intervals = report("10_Anchoring/lepmap3_intervals.la", category = "Data") message: "Combining {params} Lep-Map3 interval files into single LepAnchor input {output}" params: lg = lg @@ -146,13 +156,13 @@ rule generate_intervals: rule contiglengths: input: geno - output: "10_Anchoring/contigs.length" + output: report("10_Anchoring/contigs.length", category = "Data") message: "Getting contig lengths" shell: "gunzip -fc {input} | awk -f software/LepAnchor/scripts/contigLength.awk > {output}" rule find_haplotypes: input: "9_Chain/chainfile.gz" - output: "10_Anchoring/fullHaplotypes50.txt" + output: report("10_Anchoring/fullHaplotypes50.txt", category = "Logs") message: "Finding full haplotypes (potential chimeric contigs)" shell: """ @@ -166,8 +176,8 @@ rule liftover: intervals = "10_Anchoring/lepmap3_intervals.la", haplos = "10_Anchoring/fullHaplotypes50.txt" output: - lift = "10_Anchoring/liftover.la", - sortedlift = "10_Anchoring/liftover.sorted.la" + lift = report("10_Anchoring/liftover.la", category = "Lifted Intervals"), + sortedlift = report("10_Anchoring/liftover.sorted.la", category = "Lifted Intervals") message: "Running liftoverHaplotypes for the input maps" shell: """ @@ -178,7 +188,7 @@ rule liftover: rule cleanmap: input: "10_Anchoring/liftover.sorted.la" output: "10_Anchoring/map_all.clean" - log: "10_Anchoring/cleamap.log" + log: report("10_Anchoring/cleamap.log", category = "Logs") message: "Running CleanMap" shell: "java -cp software/LepAnchor CleanMap map={input} > {output} 2> {log}" @@ -187,7 +197,7 @@ rule map2bed: cleanmap = "10_Anchoring/map_all.clean", lengths = "10_Anchoring/contigs.length", output: "10_Anchoring/map.bed" - log: "10_Anchoring/map2bed.log" + log: report("10_Anchoring/map2bed.log", category = "Logs") message: "Running Map2Bed" shell: "java -cp software/LepAnchor Map2Bed map={input.cleanmap} contigLength={input.lengths} > {output} 2> {log}" @@ -218,7 +228,7 @@ rule place_orient: output: chrom = "10_Anchoring/orient_1/chr.{lg_range}.la" log: - chrom = "10_Anchoring/orient_1/logs/chr.{lg_range}.la.err" + chrom = report("10_Anchoring/orient_1/logs/chr.{lg_range}.la.err", category = "Anchoring I Logs") params: chrom = "{lg_range}" message: "Running PlaceAndOrientContigs for linkage group {params.chrom}" @@ -264,7 +274,7 @@ rule place_orient2: output: chrom = "10_Anchoring/orient_2/ichr.{lg_range}.la" log: - chrom = "10_Anchoring/orient_2/logs/ichr.{lg_range}.la.err" + chrom = report("10_Anchoring/orient_2/logs/ichr.{lg_range}.la.err", category = "Anchoring II Logs") params: chrom = "{lg_range}" message: "Running a second iteration of PlaceAndOrientContigs for linkage group {params.chrom}" @@ -278,8 +288,8 @@ rule prune: oriented = expand("10_Anchoring/orient_2/ichr.{lgs}.la", lgs = lg_range), bedfile = "10_Anchoring/map_propogated.bed" output: - pruned = "10_Anchoring/orient_2/pruned.la", - cleaned = "10_Anchoring/overlaps_rm.la" + pruned = report("10_Anchoring/orient_2/pruned.la", category = "Logs"), + cleaned = report("10_Anchoring/overlaps_rm.la", category = "Logs") message: "Pruning contig blocks without map support and removing overlaps" params: chrom = lg @@ -296,55 +306,93 @@ rule construct_agp: input: cleaned = "10_Anchoring/overlaps_rm.la" output: - agp = "10_Anchoring/agp/chr.{lg_range}.agp", - scaff_agp = "10_Anchoring/agp_scaffolds/chr.{lg_range}.scaffolds.agp" + agp = report("11_AGP/contigs/chr.{lg_range}.agp", category = "Contig AGP Files"), + scaff_agp = report("11_AGP/scaffolds/chr.{lg_range}.scaffolds.agp", category = "Scaffold AGP Files") message: "Creating AGP files for linkage group {params.chrom}" params: chrom = "{lg_range}" shell: """ - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > 10_Anchoring/agp/chr.{params.chrom}.agp - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > 10_Anchoring/agp_scaffolds/chr.{params.chrom}.scaffolds.agp + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > {output.agp} + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > {output.scaff_agp} """ rule unused: input: lengths = "10_Anchoring/contigs.length", haplos = "10_Anchoring/fullHaplotypes50.txt", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range), - scaff_agp = expand("10_Anchoring/agp_scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range) + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), output: - txt = "10_Anchoring/not_used_final.txt", - agp = "10_Anchoring/not_used.agp", - final_agp = "10_Anchoring/REF_LA.agp", - scaff_agp = "10_Anchoring/REF_LA_scaffolds.agp" + txt = "11_AGP/not_used_final.txt", + agp = "11_AGP/not_used.agp" message: "Finding unused contigs" shell: """ cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} - cat {input.agp} > {output.final_agp} + """ + +rule build_final_agp: + input: + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), + scaff_agp = expand("11_AGP/scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range), + unused = "11_AGP/not_used.agp", + output: + contig_agp = "11_AGP/lepanchor.contigs.only.agp", + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp", + contig_all_agp = "11_AGP/lepanchor.contigs.all.agp", + scaff_all_agp = "11_AGP/lepanchor.scaffolds.all.agp" + message: "Generating final AGP files" + shell: + """ + cat {input.agp} > {output.contig_agp} cat {input.scaff_agp} > {output.scaff_agp} + cat {input.agp} {input.unused} > {output.contig_all_agp} + cat {input.scaff_agp} {input.unused} > {output.scaff_all_agp} """ -rule build_scaffold_fasta: +rule build_scaffold_only_fasta: input: assembly = geno, - agp = "10_Anchoring/REF_LA.agp" + agp = "11_AGP/lepanchor.contigs.only.agp" output: - fasta = "10_Anchoring/Anchored.scaffolds.fa.gz", + fasta = "12_Fasta/Anchored.scaffolds.only.fa.gz", + message: "Constructing final scaffold-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} + """ + +rule build_scaffold_contig_fasta: + input: + assembly = geno, + agp = "11_AGP/lepanchor.contigs.all.agp" + output: + fasta = "12_Fasta/Anchored.scaffolds.fa.gz", message: "Constructing final scaffold fasta file {output.fasta}" shell: """ gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} """ +rule build_contig_only_fasta: + input: + assembly = geno, + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp" + output: + fasta = "12_Fasta/Anchored.contigs.only.fa.gz" + message: "Constructing final contig-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} + """ + rule build_contig_fasta: input: assembly = geno, - scaff_agp = "10_Anchoring/REF_LA_scaffolds.agp" + scaff_agp = "11_AGP/lepanchor.scaffolds.all.agp" output: - fasta = "10_Anchoring/Anchored.contigs.fa.gz" + fasta = "12_Fasta/Anchored.contigs.fa.gz" message: "Constructing final contig fasta file {output.fasta}" shell: """ @@ -354,11 +402,11 @@ rule build_contig_fasta: rule mareymap_data: input: lift = "10_Anchoring/liftover.la", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) output: - mareydata = "11_MareyMaps/marey.data.gz", - sexavg = "11_MareyMaps/marey.data.sexavg.gz" - log: "11_MareyMaps/missing_scaffolds.txt" + mareydata = "13_MareyMaps/data.marey.gz", + sexavg = "13_MareyMaps/data.marey.sexavg.gz" + log: report("13_MareyMaps/missing_scaffolds.txt", category = "Logs") message: """ Creating Marey map interval data @@ -371,30 +419,70 @@ rule mareymap_data: """ for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 10_Anchoring/agp/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip done > {output.mareydata} 2> {log} for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 10_Anchoring/agp/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip done > {output.sexavg} 2> /dev/null """ rule mareymaps: input: - data = "11_MareyMaps/marey.data.gz", - sexavg = "11_MareyMaps/marey.data.sexavg.gz", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) + data = "13_MareyMaps/data.marey.gz", + sexavg = "13_MareyMaps/data.marey.sexavg.gz", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) output: - indiv_plots = expand("11_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), - summary = "11_MareyMaps/LepAnchor.mareymaps.pdf", - sequential = "11_MareyMaps/LepAnchor.sequentialmaps.pdf", - SAsummary = "11_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", - SAsequential = "11_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf" + indiv_plots = report(expand("13_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), category = "Marey Maps"), + summary = report("13_MareyMaps/LepAnchor.mareymaps.pdf", category = "Marey Maps") , + sequential = report("13_MareyMaps/LepAnchor.sequentialmaps.pdf", category = "Sequential Maps"), + SAsummary = report("13_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", category = "Marey Maps Sex Avg"), + SAsequential = report("13_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf", category = "Sequential Maps Sex Avg") message: "Creating Marey Maps" shell: """ - Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 10_Anchoring/agp - Rscript scripts/LASummary.r {input.data} + Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 11_AGP/contigs + Rscript scripts/LASummary.r {input.data} true Rscript scripts/LASummarySexAvg.r {input.sexavg} - """ \ No newline at end of file + """ + +rule generate_updated_intervals: + input: "13_MareyMaps/data.marey.gz" + output: "14_NewIntervals/LA.intervals.{lg_range}" + message: "Splitting out LG {params.chrom} from {input}" + params: + chrom = "{lg_range}" + shell: + """ + zgrep "LG{params.chrom}\s" {input} > {output} + """ + +rule trim_newintervals: + input: "14_NewIntervals/LA.intervals.{lg_range}" + output: + outfile = "15_TrimNewIntervals/LA.intervals.{lg_range}.trimmed", + plot = "15_TrimNewIntervals/plots/LA.intervals.{lg_range}.trim.pdf" + message: "Trimming edge clusters for {input}" + params: + edge = edgelen, + dist = trimdist + shell: "Rscript scripts/LATrim.r {input} {params.dist} {params.edge} 15_TrimNewIntervals" + +rule merge_trimplots: + input: expand("15_TrimNewIntervals/plots/LA.intervals.{lg}.trim.pdf", lg = lg_range) + output: "15_TrimNewIntervals/LA.trim.summary.pdf" + message: "Merging trimming plots into {output}" + shell: "convert -density 200 {input} {output}" + + +rule merge_trimmedintervals: + input: expand("15_TrimNewIntervals/LA.intervals.{lg}.trimmed", lg = lg_range) + output: "15_TrimNewIntervals/data.marey.trimmed.gz" + message: "Concatenating trimmed intervals to {output}" + shell: "cat {input} | gzip -c > {output}" + +rule plot_trimmedintervals: + input: "15_TrimNewIntervals/data.marey.trimmed.gz" + output: report("15_TrimNewIntervals/LepAnchor.mareymaps.pdf", category = "Trimmed Marey Maps") + shell: "Rscript scripts/LASummary.r {input}" \ No newline at end of file diff --git a/scripts/LASummary.r b/scripts/LASummary.r index 34ecea7..5d7b440 100755 --- a/scripts/LASummary.r +++ b/scripts/LASummary.r @@ -41,28 +41,30 @@ savedims <- length(unique(allmaps$lg)) * 2.5 ggsave(outfile, width = 8.5, height = savedims/4, units = "in") -allmaps %>% - ggplot(aes(x = marker, y = cM)) + - geom_point(size = 0.6, alpha = 0.5) + - labs( - title = "Relative Marker Positions within Linkge Groups", - subtitle = "The distance of sequential markers from each other in the linkage maps", - x = "Marker Number", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) +if (length(args) > 1){ + allmaps %>% + ggplot(aes(x = marker, y = cM)) + + geom_point(size = 0.6, alpha = 0.5) + + labs( + title = "Relative Marker Positions within Linkage Groups", + subtitle = "The distance of sequential markers from each other in the linkage maps", + x = "Marker Number", + y = "Genetic Position (cM)" ) + - facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") -outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") -ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) \ No newline at end of file + outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") + ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) +} \ No newline at end of file diff --git a/scripts/LATrim.r b/scripts/LATrim.r new file mode 100755 index 0000000..d43110b --- /dev/null +++ b/scripts/LATrim.r @@ -0,0 +1,156 @@ +#! /usr/bin/env Rscript + +suppressMessages(if (!require("tidyverse")) install.packages("tidyverse")) +suppressMessages(library("tidyverse")) +suppressMessages(if (!require("cowplot")) install.packages("cowplot")) +library(cowplot) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] is the OrderMarkers2 output file +# args[2] is the centimorgan cutoff distance +# args[3] is the % of edge markers to scan +# args[4] is the name of the output folder + +lgfile <- read.delim( + args[1], + header = FALSE, + sep = "\t", + comment.char="#" +) %>% + mutate(Mpass = "PASS", Fpass = "PASS") + +## setup output file names ## +# split the filename by path +filename <- unlist(strsplit(args[1], "/")) +# pop out just the filename +filename <- filename[length(filename)] +lg <- (strsplit(lgfile$V1[1], "LG") %>% unlist())[2] %>% as.numeric() + +#========= output instantiation ========# + +dir.create(args[4], showWarnings = FALSE) +dir.create(paste0(args[4],"/plots"), showWarnings = FALSE) +dir.create(paste0(args[4],"/logs"), showWarnings = FALSE) +dir.create(paste0(args[4],"/QC_raw"), showWarnings = FALSE) +outfile_base <- paste(args[4], filename, sep = "/") +outfile_log_base <- paste(args[4], "logs", filename, sep = "/") +plotfile_base <- paste(args[4], "plots", filename, sep = "/") +plotfile <- paste(plotfile_base, "trim.pdf", sep = ".") +rawfile_base <- paste(args[4], "QC_raw", filename, sep = "/") + +##### Pruning the ends ##### +dist_thresh <- as.numeric(args[2]) +if(dist_thresh >= 1){ + dist_thresh <- dist_thresh * .01 +} +dist_thresh <- abs(max(lgfile$V5) - min(lgfile$V5)) * dist_thresh + +# if the percent threshold is given as an integer, convert it to a decimal +edge_length <- as.numeric(args[3]) +if(edge_length >= 1){ + edge_length <- edge_length * .01 +} + +n_markers <- length(lgfile$V1) + +front_edge <- round(n_markers * edge_length, digits = 0) +back_edge <- round(n_markers - front_edge, digits = 0) + +for (j in 5:6){ # iterate over male (5) and female (6) + # sort on column + if (j == 5){ + lgfile <- arrange(lgfile, V5) + } else { + lgfile <- arrange(lgfile, V6) + } + # trim beginning + for(a in front_edge:2){ #first n% of total markers starting from the front edge, going out + diff <- abs(lgfile[a,j]-lgfile[a-1,j]) # difference between two points + if( diff > dist_thresh ){ # is the difference between the two points > distance argument? + lgfile[(a-1):1, j+2] <- "FAIL" # mark that marker and all markers BEFORE it as FAIL + break() + } + } + # trim end + for(z in back_edge:(n_markers-1)){ #last n% total markers starting from the back edge going out + diff <- abs(lgfile[z+1,j]-lgfile[z,j]) # difference between two points + if( diff > dist_thresh ){ # is the difference between the two points > distance argument? + lgfile[(z+1):n_markers,j+2] <- "FAIL" # mark that marker and all markers AFTER it as FAIL + break() + } + } +} + +# create new table of markers passing QC +cleaned_markers <- (lgfile %>% filter(Mpass == "PASS" & Fpass == "PASS"))[,1:6] + +# isolate bad markers +removed_markers <- (lgfile %>% filter(Mpass == "FAIL" | Fpass == "FAIL"))[,1:6] + +# get simple counts +rm_male <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "PASS"))$V1 %>% length() +rm_female <- (lgfile %>% filter(Fpass == "FAIL" & Mpass == "PASS"))$V1 %>% length() +rm_both <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "FAIL"))$V1 %>% length() + +pdf(NULL) + +plot_male <- lgfile %>% arrange(V5) %>% + ggplot(aes(x=seq_along(V5), y = V5, color = Mpass)) + + geom_point() + + geom_vline(xintercept = front_edge, linetype = "dashed", size = 0.2) + + geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + + labs( + title = "", + subtitle = paste0("Male markers trimmed: ", rm_male), + caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold= ", dist_thresh), + x = "Marker Number", + y = "Position (cM)", + color = "Pass Filtering" + ) + +plot_female <- lgfile %>% arrange(V6) %>% + ggplot(aes(x=seq_along(V6), y = V6, color = Fpass)) + + geom_point() + + geom_vline(xintercept = front_edge, linetype = "dashed", size = 0.2) + + geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + + labs( + title = paste("Edge Cluster Trimming for LG:", lg), + subtitle = paste0("Female markers trimmed: ", rm_female), + caption = paste0("Markers failing both M+F: ", rm_both), + x = "Marker Number", + y = "Position (cM)", + color = "Pass Filtering", + legend.position = "none" + ) + +plot_grid(plot_female, plot_male, ncol = 2, nrow = 1) + +suppressMessages(ggsave(plotfile, width = 7, height = 3, units = "in")) + +write.table( + cleaned_markers, + file = paste(outfile_base, "trimmed", sep = "."), + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, +) + +write.table( + lgfile, + file = paste(rawfile_base, "filtered.raw", sep = "."), + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, +) + +write.table( + removed_markers, + file=paste(outfile_log_base, "removed", sep = "."), + append=FALSE, + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE +) \ No newline at end of file