diff --git a/.gitignore b/.gitignore index 04203ea..9da44c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -/.snakemake .vscode +/.snakemake /1_ParentCall /2_Filtering /3_SeparateChromosomes @@ -11,10 +11,21 @@ /7_Intervals /8_RepeatMask /9_Chain -/10_Anchoring -/11_MareyMaps +/10_PlaceAndOrientContigs +/11_AGP +/12_Fasta +/13_MareyMapsUntrimmed +/14_NewIntervals +/15_Trim +/16_MareyMapsTrimmed /JoinSingles2All_iter /RefineMap /pedigree.txt -map.master -/*.vcf \ No newline at end of file +LOD.master +snps.txt +/*.paf +/*.vcf +/*.fasta +/*.fa +/*.fa.gz +/*.fasta.gz \ No newline at end of file diff --git a/.misc/LepAnchor.rulegraph.png b/.misc/LepAnchor.rulegraph.png index dc52ddb..47746d9 100644 Binary files a/.misc/LepAnchor.rulegraph.png and b/.misc/LepAnchor.rulegraph.png differ diff --git a/LepWrap b/LepWrap index 8acd85f..e0cfd4c 100755 --- a/LepWrap +++ b/LepWrap @@ -4,24 +4,42 @@ if which snakemake &>/dev/null; then foo=1 else echo -e "ERROR:\nSnakemake installation is required to run LepWrap, but not found in the current environment." + echo -e "If [ana|mini]conda are installed, created a pre-configred environment with:\n" + printf "\033[01;32m" + printf "conda env create -f conda_setup.yml\n" + printf "\033[0m" + exit 1 +fi + +if [ ! -f config.yaml ]; then + echo -e "ERROR:\nThe file 'config.yaml' is required to run LepWrap, but not found in the current directory." exit 1 fi if [[ -z "$1" ]]; then - echo "Perform the modules of Lep-Map3. Make sure config.yaml is properly configured for how you intend to run things." + echo "Perform the modules of Lep-Map3 and optionally Lep-Anchor" + echo "Make sure config.yaml is properly configured for how you intend to run things." echo "" echo "[usage] LepWrap " echo "[example] LepWrap 16" exit 1 fi -echo "Running Lep-Map3" -sleep 2s -snakemake --cores $1 --snakefile ./rules/LepMap3.smk --directory . +lepmap(){ + echo "Running Lep-Map3" + sleep 2s + snakemake --cores $1 --snakefile ./rules/LepMap3/LepMap3.smk --directory . +} -LA=$(grep "run_lepanchor" config.yaml | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') -if [ $LA == "true" ]; then -echo -e "\nRunning Lep-Anchor" -sleep 2s -snakemake --cores $1 --snakefile ./rules/LepAnchor.smk --directory . -fi +lepanchor(){ + LA=$(grep "run_lepanchor" config.yaml | cut -d":" -f2 | xargs | tr '[:upper:]' '[:lower:]') + if [ $LA == "true" ]; then + echo -e "\nRunning Lep-Anchor" + sleep 2s + snakemake --cores $1 --snakefile ./rules/LepAnchor/LepAnchor.smk --directory . + else + exit 1 + fi +} + +lepmap $1 && lepanchor $1 \ No newline at end of file diff --git a/README.md b/README.md index cad906c..5e59b78 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ![logo](.misc/logo.png) + _It's Lep-Map3, but with snakes_ 🐍🐍 [![alt text](https://img.shields.io/badge/docs-wiki-75ae6c?style=for-the-badge&logo=Read%20The%20Docs)](https://github.com/pdimens/LepWrap/wiki) @@ -11,7 +12,7 @@ LepWrap is a reusable pipeline to use the linkage map software [Lep-Map3](https: ### How to install -You will need a `conda` installation (Anaconda or Miniconda), along with cloning this repository locally. +You will need a `conda` installation ([Anaconda](https://docs.anaconda.com/anaconda/install/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html)), along with cloning this repository locally. I recommend Miniconda. #### 1. Cloning LepWrap Download a zip of this repository using the "Code" button on the top-right and unzip it on your machine or: @@ -23,7 +24,9 @@ git clone https://github.com/pdimens/LepWrap.git Assuming you have `anaconda` or `miniconda` installed: ```bash cd LepWrap -conda create -f conda_setup.yml -n lepwrap + +conda env create -f conda_setup.yml + ``` This will create an environment called `lepwrap` that can be activated with: ```bash @@ -38,13 +41,13 @@ You will need to modify `config.yaml` to suit your needs, then you can simply ru where `` is an integer of the maximum number of cores/threads you want the pipeline to use. ### Something to keep in mind - -Lep-Map3/Anchor are **very** comprehensive software, and LepWrap doesn't incorporate all the features and nuances. Your study is unique, so you are encouraged to clone or fork this repo and adapt LepWrap to your needs! All of the code in LepWrap is written as human-readable as possible and aggressively annotated, so give it a shot and adapt it to your workflow. If using LepWrap in a publication, cite **Pasi Rastas** for his work and please include a link to it in your publication. If you like using it, star this repository and give me (Pavel) a shout out on Twitter [@pvdimens](https://twitter.com/PVDimens) [![alt text](http://i.imgur.com/wWzX9uB.png)](https://twitter.com/PVDimens) 😊 +LepWrap does things a certain way, employing the most common/reasonable way of using Lep-Map3 (and LepAnchor more or less). Version `3.2+` is **a lot** more flexible that its predecessors, but might still lack something you're looking for. Your study is unique, and I encourage youto clone/fork this repository and adapt LepWrap to your needs! All of the code in LepWrap is written in human-readable bash or aggressively annotated R, so give it a shot and adapt it to your workflow. PR's always welcome! ## Citation +If using LepWrap in a publication, cite **Pasi Rastas** for their work on Lep-Map3/Lep-Anchor and please include a link to this repository. If you like using it, give me (Pavel) a shout out on Twitter [@pvdimens](https://twitter.com/PVDimens) [![alt text](http://i.imgur.com/wWzX9uB.png)](https://twitter.com/PVDimens) =) -Pasi Rastas, Lep-MAP3: robust linkage mapping even for low-coverage whole genome sequencing data, Bioinformatics, Volume 33, Issue 23, 01 December 2017, Pages 3726–3732,https://doi.org/10.1093/bioinformatics/btx494 +> Pasi Rastas, Lep-MAP3: robust linkage mapping even for low-coverage whole genome sequencing data, Bioinformatics, Volume 33, Issue 23, 01 December 2017, Pages 3726–3732,https://doi.org/10.1093/bioinformatics/btx494 -Pasi Rastas, Lep-Anchor: automated construction of linkage map anchored haploid genomes, Bioinformatics, Volume 36, Issue 8, 15 April 2020, Pages 2359–2364, https://doi.org/10.1093/bioinformatics/btz978 +> Pasi Rastas, Lep-Anchor: automated construction of linkage map anchored haploid genomes, Bioinformatics, Volume 36, Issue 8, 15 April 2020, Pages 2359–2364, https://doi.org/10.1093/bioinformatics/btz978 \ No newline at end of file diff --git a/config.yaml b/config.yaml index d32e040..d74c0a5 100644 --- a/config.yaml +++ b/config.yaml @@ -8,33 +8,45 @@ # ParentCall2 # #---------------# # the filtered VCF file with your genotype likelihoods: -vcf: "out.14.missrecode.vcf" +vcf: "out.15.miss95recode.vcf" # Instructions to create pedigree file: https://sourceforge.net/p/lep-map3/wiki/software/LepMap3 Home/#parentcall2 # the pedigree file associated with your data pedigree: "pedigree.txt" +# If there are any additional parameters you would like to use for ParentCall2 (e.g. halfSibs=1), add them here +extra_params_ParentCall: "removeNonInformative=1" + + #---------------# # Filtering2 # #---------------# # data tolerance value-- set this to 0 if you want to skip the Filtering2 module -data_tol: 0.1 +data_tol: 0 + +# If there are any additional parameters you would like to use for Filtering2 (e.g. convert2Biallelic=1), add them here +extra_params_Filtering: "" + #------------------------# # SeperateChromosomes2 # #------------------------# -# LepMak3r will iteratively perform SeperateChromosomes2 for each +# LepWrap will iteratively perform SeperateChromosomes2 for each # LOD score in the range of lod_min to lod_max # The minimum LOD for SeperateChromosomes2 -lod_min: 5 +lod_min: 20 # The maximum LOD for SeperateChromosomes2 -lod_max: 30 +lod_max: 40 # Use only markers with informative father (1), mother(2), both parents(3) or neither parent(0) informative: "informativeMask=123" +# If there are any additional parameters you would like to use for SeparateChromosomes2 (e.g. distrotionLOD=1), add them here +extra_params_SeparateChromosomes: "sizeLimit=5 distortionLOD=1" + + #-------------------# # JoinSingles2ALL # #-------------------# @@ -48,6 +60,10 @@ lod_limit: "lodLimit=2" # start with lower values for lod_limit and increase as necessary lod_difference: "lodDifference=2" +# If there are any additional parameters you would like to use for JoinSingles2All (e.g. iterate=1), add them here +extra_params_JoinSingles: "iterate=1 distortionLOD=1" + + #-----------------# # OrderMarkers2 # #-----------------# @@ -59,32 +75,39 @@ exp_lg: 24 # Set iterations to the number of iterations you want per chromosome (more is better). Recommend 30 or more iterations: 100 -# Set threads_per to the number of threads you would like to use per linkage group for ordering -threads_per: 2 - -# Choose your distance method by commenting the line you dont want -dist_method: "useKosambi=1" -#dist_method: "useMorgan=1" +# If there are any additional parameters you would like to use for OrderMarkers2 (e.g. hyperPhaser=1), add them here +extra_params_OrderMarkers: "hyperPhaser=1 useKosambi=1 phasingIterations=2" -# number of iterations to use of OrderMarkers2 phasing. -# this number is typically 1-2 -phase_iterations: 2 #-----------------# # Edge Trimming # #-----------------# # Edge trimming will examine the first and last X% of markers in a linkage group -# and remove clusters that are N centimorgans away from the next marker -# you can "skip" trimming by making edge_length really low (e.g. 1) -# and trim_cutoff really high (e.g. 50) +# and remove clusters that are N% centimorgans (of the total cM span) away from +# the next marker. You can "skip" trimming by setting trim_cutoff really high (e.g. 80-100). # Set edge_length to the percent number of markers you would like to examine from either end of the linkage group -# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" -edge_length: 1 +# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" (10-20 is reasonable) +edge_length: 20 + +# Set trim_cuttoff to the centiMorgan distance cutoff (5 is reasonable) +trim_cutoff: 100 -# Set trim_cuttoff to the centiMorgan distance cutoff (10 is reasonable) -trim_cutoff: 40 +#--------------------# +# Re-OrderMarkers2 # +#--------------------# +# The second round of OrderMarkers will use the same basic parameters as the first round (but not the extra params) +# If there are additional parameters you would like to use, add them here: +extra_params_reOrderMarkers: "improveOrder=1 useKosambi=1" + + +#-----------------------# +# Calculate Distances # +#-----------------------# +# If you used useKosambi=1 or useMorgan=1 for Ordering/reOrdering, add that same +# parameter to distance_method: +distance_method: "" #################### @@ -104,7 +127,7 @@ lg_count: 24 PAF_file: "alnPB.paf" # if you have a proximity file add it here, otherwise leave the text as "/dev/null". -# This isn't yet implemented in the official Lep-Anchor wrapper, so don't change this. +# This isn't yet implemented in Lep-Anchor, so don't change this. proximity_file: "/dev/null" # Which system is appropriate for running the HaploMerger2 section? Comment out the one that doesn't apply. @@ -116,16 +139,46 @@ OS_info: "Ubuntu" #OS_info: "CentOS6" +#------------# +# CleanMap # +#------------# +# If there are any additional parameters you would like to use for CleanMap (e.g. chimericDistance=500), add them here +extra_params_CleanMap: "" + + +#-----------# +# Map2Bed # +#-----------# +# If there are any additional parameters you would like to use for Map2Bed (e.g. markerSupport=4), add them here +extra_params_Map2Bed: "" + + +#-------------------------# +# PlaceAndOrientContigs # +#-------------------------# +# choose which of the input types you want to generate by uncommenting the other +# LepAnchor uses intervals by default, but either works. +#lepanchor_input: "noIntervals=0" # uses intervals input +lepanchor_input: "noIntervals=1" # uses map position input + +# The size limit for detecting potential haplotype contigs (default: ~5000) +# Set this value really high (50000+) to ignore haplotype removal in between PlaceOrient iterations +haplotype_limit: 5000 + +# If there are any additional parameters you would like to use for PlaceAndOrientContigs (e.g. randomOrder=1), add them here + +extra_params_PlaceOrient: "keepEmptyIntervals=1 numRuns=10" + + #-----------------# # Edge Trimming # #-----------------# # Edge trimming will examine the first and last X% of markers in a linkage group # and remove clusters that are N% centimorgans (of the total cM span) away from -# the next marker. You can "skip" trimming by making edge_length really low (e.g. 1) -# and trim_cutoff really high (e.g. 50) +# the next marker. You can "skip" trimming by setting LA_trim_cutoff really high (e.g. 80-100) # Set edge_length to the percent number of markers you would like to examine from either end of the linkage group -# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" +# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" (10-15 is reasonable) LA_edge_length: 20 # Set trim_cuttoff to the centiMorgan distance cutoff (5 is reasonable) diff --git a/rules/LAtest.smk b/rules/LAtest.smk deleted file mode 100644 index 97b746b..0000000 --- a/rules/LAtest.smk +++ /dev/null @@ -1,9 +0,0 @@ -rule validation: - input: - output: - message: - threads: 30 - shell: - """ - awk -f software/LepAnchor/scripts/liftover.awk chr1.agp order1.input | sort -V | grep CHR > order1.liftover - """ diff --git a/rules/LepAnchor.smk b/rules/LepAnchor.smk deleted file mode 100644 index 6a6c666..0000000 --- a/rules/LepAnchor.smk +++ /dev/null @@ -1,488 +0,0 @@ -from os import path -import glob - -configfile: "config.yaml" - -geno = config["assembly"] -paf = config["PAF_file"] -proximity = config["proximity_file"] -lg = config["lg_count"] -lg_range = list(range(1,lg+1)) -os_name = config["OS_info"] -edgelen = config["LA_edge_length"] -trimdist = config["LA_trim_cutoff"] - -rule all: - input: - fasta = "12_Fasta/Anchored.contigs.fa.gz", - scaff = "12_Fasta/Anchored.scaffolds.fa.gz", - fastaonly = "12_Fasta/Anchored.contigs.only.fa.gz", - scaffonly = "12_Fasta/Anchored.scaffolds.only.fa.gz", - mareydata = "13_MareyMaps/data.marey.gz", - mareymaps = "13_MareyMaps/LepAnchor.mareymaps.pdf", - trimmedmareymaps = "15_TrimNewIntervals/LepAnchor.mareymaps.pdf", - trimsummary = "15_TrimNewIntervals/LA.trim.summary.pdf" - message: - """ - Lep-Anchor has finished. Good luck with the rest of your analyses! - Output Files: - ============= - contig fasta file | {input.fasta} - contigs-only fasta | {input.fastaonly} - scaffold fasta file | {input.scaff} - scaffolds-only fasta | {input.scaffonly} - converted linakge maps | {input.mareydata} - untrimmed marey maps | {input.mareymaps} - interval trimming summary | {input.trimsummary} - trimmed marey maps | {input.trimmedmareymaps} - """ - -rule repeatmask: - input: geno - output: "8_Repeatmask/repeatmasked.fa.gz" - log: "8_Repeatmask/Red.log" - message: "Using Red to repeat-mask {input}" - threads: 30 - shell: - """ - file={input} - if [ "${{file: -3}}" == ".gz" ]; then - echo "- Assembly is compressed, creating decompressed copy" - file=$(basename $file .gz) - gunzip --stdout {input} > $file - fi - ext=$(echo $file | rev | cut -d"." -f1 | rev) - if [ $ext != "fa" ]; then - echo "- Assembly extension must end in .fa for Red, creating a corrected symlink" - ln -srf $file ${{file}}.fa - fi - echo "- Running Red" - software/LepAnchor/deps/Red -gnm . -msk 8_Repeatmask -sco 8_Repeatmask -cnd 8_Repeatmask -rpt 8_Repeatmask > {log} 2>> {log} - echo "- Compressing repeat-masked genome from Red" - gzip --stdout 8_Repeatmask/*.msk > {output} && rm 8_Repeatmask/*.msk - """ - - -rule chain_1: - input: - geno = "8_Repeatmask/repeatmasked.fa.gz", - ctrl = "software/LepAnchor/deps/all_lastz.ctl", - scoremtx = "software/LepAnchor/deps/scoreMatrix.q" - output: - out1 = "9_Chain/repeatmaskedx.sizes", - out2 = "9_Chain/repeatmasked.sizes" - message: "Running Lastz via HaploMerger2" - threads: 30 - params: - os = os_name - shell: - """ - OS=$(echo {params} | tr '[:upper:]' '[:lower:]') - echo "Using the $OS lastz/chainNet binaries" - if [ $OS == "ubuntu" ] - then - export PATH="$PATH:software/LepAnchor/deps/ubuntu" - elif [ $OS == "centos5" ] - then - export PATH="$PATH:software/LepAnchor/deps/centOS5" - elif [ $OS == "centos6" ] - then - export PATH="$PATH:software/LepAnchor/deps/centOS6" - else - echo "$OS is not recognized as one of Ubuntu, CentOS5, or CentOS6, defaulting to Ubuntu" - export PATH="$PATH:software/LepAnchor/deps/ubuntu" - fi - ln -srf {input} 9_Chain/ - cd 9_Chain - ../software/LepAnchor/deps/step1.HM2 repeatmasked {threads} - """ - -rule chain_2: - input: - f1 = "9_Chain/repeatmaskedx.sizes", - f2 = "9_Chain/repeatmasked.sizes" - output: - original = "9_Chain/repeatmasked.repeatmaskedx.result/all.chain.gz", - slink = "9_Chain/chainfile.gz" - message: "Running HaploMerger2 to generate the chain file" - threads: 30 - params: - os = os_name - shell: - """ - OS=$(echo {params} | tr '[:upper:]' '[:lower:]') - echo "Using the $OS lastz/chainNet binaries" - if [ $OS == "ubuntu" ] - then - export PATH="$PATH:software/LepAnchor/deps/ubuntu" - elif [ $OS == "centos5" ] - then - export PATH="$PATH:software/LepAnchor/deps/centOS5" - elif [ $OS == "centos6" ] - then - export PATH="$PATH:software/LepAnchor/deps/centOS6" - else - echo "$OS is not recognized as one of Ubuntu, CentOS5, or CentOS6, defaulting to Ubuntu" - export PATH="$PATH:software/LepAnchor/deps/ubuntu" - fi - - cd 9_Chain - ../software/LepAnchor/deps/step2.HM2 repeatmasked {threads} && rm -r repeatmasked.repeatmaskedx.result/raw.axt - ln -sr ../{output.original} ../{output.slink} - """ - -rule extract_markers: - input: "2_Filtering/data.filtered.lepmap3.gz" - output: report("snps.txt", category = "Data") - message: "Extracting marker information from Lep-Map3 data file {input}" - shell: "scripts/extract_markers.sh {input}" - - -rule generate_intervals: - input: - markers = "snps.txt", - intervals = expand("7_Intervals/ordered.{x}.intervals", x = range(1, lg + 1)) - output: - intervals = report("10_Anchoring/lepmap3_intervals.la", category = "Data") - message: "Combining {params} Lep-Map3 interval files into single LepAnchor input {output}" - params: - lg = lg - shell: - """ - for i in $(seq 1 {params.lg}); do - awk -vn=$i '(NR==FNR){{map[NR-1]=$0}}(NR!=FNR){{$1=map[$1] "\t" n;print}}' {input.markers} 7_Intervals/ordered.$i.intervals >> {output.intervals} - done - """ - -rule contiglengths: - input: geno - output: report("10_Anchoring/contigs.length", category = "Data") - message: "Getting contig lengths" - shell: "gunzip -fc {input} | awk -f software/LepAnchor/scripts/contigLength.awk > {output}" - -rule find_haplotypes: - input: "9_Chain/chainfile.gz" - output: report("10_Anchoring/fullHaplotypes50.txt", category = "Logs") - message: "Finding full haplotypes (potential chimeric contigs)" - shell: - """ - gunzip -fc {input} | awk -f software/LepAnchor/scripts/findFullHaplotypes.awk > {output} - echo "Detected $(wc -l {output}) potentially chimeric contigs" - """ - -rule liftover: - input: - chain = "9_Chain/chainfile.gz", - intervals = "10_Anchoring/lepmap3_intervals.la", - haplos = "10_Anchoring/fullHaplotypes50.txt" - output: - lift = report("10_Anchoring/liftover.la", category = "Lifted Intervals"), - sortedlift = report("10_Anchoring/liftover.sorted.la", category = "Lifted Intervals") - message: "Running liftoverHaplotypes for the input maps" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor LiftoverHaplotypes map={input.intervals} haplotypes={input.haplos} chain=- > {output.lift} - cat {output.lift} | sort -V -k 1,1 -k 2,2n > {output.sortedlift} - """ - -rule cleanmap: - input: "10_Anchoring/liftover.sorted.la" - output: "10_Anchoring/map_all.clean" - log: report("10_Anchoring/cleamap.log", category = "Logs") - message: "Running CleanMap" - shell: "java -cp software/LepAnchor CleanMap map={input} > {output} 2> {log}" - -rule map2bed: - input: - cleanmap = "10_Anchoring/map_all.clean", - lengths = "10_Anchoring/contigs.length", - output: "10_Anchoring/map.bed" - log: report("10_Anchoring/map2bed.log", category = "Logs") - message: "Running Map2Bed" - shell: "java -cp software/LepAnchor Map2Bed map={input.cleanmap} contigLength={input.lengths} > {output} 2> {log}" - -rule ungrouped: - input: - lengths = "10_Anchoring/contigs.length", - haplos = "10_Anchoring/fullHaplotypes50.txt", - bedfile = "10_Anchoring/map.bed" - output: - bedfile = "10_Anchoring/map_extra.bed" - message: "Finding contigs not put into chromosomes" - params: - chrom = lg - shell: - """ - cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos}; cut -f 1 {input.bedfile}) > 10_Anchoring/unused_contigs.txt - grep -w -F -f 10_Anchoring/unused_contigs.txt {input.lengths} | awk -vn={params.chrom} '{{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}}' > 10_Anchoring/chr0.bed - cat {input.bedfile} 10_Anchoring/chr0.bed > {output.bedfile} - """ - -rule place_orient: - input: - chain = "9_Chain/chainfile.gz", - bedfile = "10_Anchoring/map_extra.bed", - paf = paf, - prox = proximity, - lift = "10_Anchoring/liftover.la" - output: - chrom = "10_Anchoring/orient_1/chr.{lg_range}.la" - log: - chrom = report("10_Anchoring/orient_1/logs/chr.{lg_range}.la.err", category = "Anchoring I Logs") - params: - chrom = "{lg_range}" - message: "Running PlaceAndOrientContigs for linkage group {params.chrom}" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} keepEmptyIntervals=1 > {output} 2> {log} - """ - -rule propogate: - input: - placed = expand("10_Anchoring/orient_1/chr.{lgs}.la", lgs = lg_range), - bedfile = "10_Anchoring/map.bed" - output: - propogated = "10_Anchoring/map_propogated.bed", - tmp_prop = temp(expand("10_Anchoring/propogate/propogated.{lgs}.la", lgs = range(lg + 1))), - message: "Propogating ...something" - shell: - """ - awk -f software/LepAnchor/scripts/propagate.awk {input.placed} > 10_Anchoring/tmp1.la - awk -f software/LepAnchor/scripts/propagate.awk 10_Anchoring/tmp1.la > 10_Anchoring/tmp2.la - i=2 - - while ! cmp -s "10_Anchoring/tmp$i.la" "10_Anchoring/tmp$(( $i-1 )).la" ;do - awk -f software/LepAnchor/scripts/propagate.awk 10_Anchoring/tmp$i.la > 10_Anchoring/tmp$[$i+1].la - i=$[$i+1] - done - #create prop*.la - awk '/^[^#]/{{++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}}END{{for (i = 1; i <= line; ++i) {{$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="10_Anchoring/propogate/propogated."$5".la"; else if ($5==1) fn="10_Anchoring/propogate/propogated.0.la"; else fn=""; if (fn != "") print $0>fn}}}}' 10_Anchoring/tmp$i.la - #awk '/^[^#]/{{++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}}END{{for (i = 1; i <= line; ++i) {{$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="10_Anchoring/propogate/propogated."$5".la"; else if ($5==1) fn="10_Anchoring/propogate/propogated.0.la"; else fn=""; if (fn != "") print $0>fn}}' 10_Anchoring/tmp$i.la - - #create a new bed by combining propogated.[1-9]*.la and map.bed - awk '(NR==FNR){{print;c[$1]}}(NR!=FNR && !($1 in c)){{print $1 "\t" $7+0 "\t" $8+0"\t?\t"$5}}' {input.bedfile} {output.tmp_prop} > {output.propogated} - rm 10_Anchoring/tmp*.la - """ - -rule place_orient2: - input: - chain = "9_Chain/chainfile.gz", - bedfile = "10_Anchoring/map_propogated.bed", - paf = paf, - prox = proximity, - lift = "10_Anchoring/liftover.la" - output: - chrom = "10_Anchoring/orient_2/ichr.{lg_range}.la" - log: - chrom = report("10_Anchoring/orient_2/logs/ichr.{lg_range}.la.err", category = "Anchoring II Logs") - params: - chrom = "{lg_range}" - message: "Running a second iteration of PlaceAndOrientContigs for linkage group {params.chrom}" - shell: - """ - gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} keepEmptyIntervals=1 > {output.chrom} 2> {log.chrom} - """ - -rule prune: - input: - oriented = expand("10_Anchoring/orient_2/ichr.{lgs}.la", lgs = lg_range), - bedfile = "10_Anchoring/map_propogated.bed" - output: - pruned = report("10_Anchoring/orient_2/pruned.la", category = "Logs"), - cleaned = report("10_Anchoring/overlaps_rm.la", category = "Logs") - message: "Pruning contig blocks without map support and removing overlaps" - params: - chrom = lg - shell: - """ - for i in $(seq {params.chrom}) - do - awk -f software/LepAnchor/scripts/prune.awk 10_Anchoring/orient_2/ichr.$i.la > 10_Anchoring/orient_2/ichr.${{i}}.pruned.la - done 2> {output.pruned} - awk -f software/LepAnchor/scripts/removeOverlaps.awk {input.bedfile} 10_Anchoring/orient_2/ichr.*.pruned.la > {output.cleaned} - """ - -rule construct_agp: - input: - cleaned = "10_Anchoring/overlaps_rm.la" - output: - agp = report("11_AGP/contigs/chr.{lg_range}.agp", category = "Contig AGP Files"), - scaff_agp = report("11_AGP/scaffolds/chr.{lg_range}.scaffolds.agp", category = "Scaffold AGP Files") - message: "Creating AGP files for linkage group {params.chrom}" - params: - chrom = "{lg_range}" - shell: - """ - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > {output.agp} - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > {output.scaff_agp} - """ - -rule unused: - input: - lengths = "10_Anchoring/contigs.length", - haplos = "10_Anchoring/fullHaplotypes50.txt", - agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), - output: - txt = "11_AGP/not_used_final.txt", - agp = "11_AGP/not_used.agp" - message: "Finding unused contigs" - shell: - """ - cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} - grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} - """ - -rule build_final_agp: - input: - agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), - scaff_agp = expand("11_AGP/scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range), - unused = "11_AGP/not_used.agp", - output: - contig_agp = "11_AGP/lepanchor.contigs.only.agp", - scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp", - contig_all_agp = "11_AGP/lepanchor.contigs.all.agp", - scaff_all_agp = "11_AGP/lepanchor.scaffolds.all.agp" - message: "Generating final AGP files" - shell: - """ - cat {input.agp} > {output.contig_agp} - cat {input.scaff_agp} > {output.scaff_agp} - cat {input.agp} {input.unused} > {output.contig_all_agp} - cat {input.scaff_agp} {input.unused} > {output.scaff_all_agp} - """ - -rule build_scaffold_only_fasta: - input: - assembly = geno, - agp = "11_AGP/lepanchor.contigs.only.agp" - output: - fasta = "12_Fasta/Anchored.scaffolds.only.fa.gz", - message: "Constructing final scaffold-only fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} - """ - -rule build_scaffold_contig_fasta: - input: - assembly = geno, - agp = "11_AGP/lepanchor.contigs.all.agp" - output: - fasta = "12_Fasta/Anchored.scaffolds.fa.gz", - message: "Constructing final scaffold fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} - """ - -rule build_contig_only_fasta: - input: - assembly = geno, - scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp" - output: - fasta = "12_Fasta/Anchored.contigs.only.fa.gz" - message: "Constructing final contig-only fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} - """ - -rule build_contig_fasta: - input: - assembly = geno, - scaff_agp = "11_AGP/lepanchor.scaffolds.all.agp" - output: - fasta = "12_Fasta/Anchored.contigs.fa.gz" - message: "Constructing final contig fasta file {output.fasta}" - shell: - """ - gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} - """ - -rule mareymap_data: - input: - lift = "10_Anchoring/liftover.la", - agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) - output: - mareydata = "13_MareyMaps/data.marey.gz", - sexavg = "13_MareyMaps/data.marey.sexavg.gz" - log: report("13_MareyMaps/missing_scaffolds.txt", category = "Logs") - message: - """ - Creating Marey map interval data - first points in uncertainty intervals | {output.mareydata} - midpoints in uncertainty intervals | {output.sexavg} - """ - params: - chrom = lg - shell: - """ - for c in $(seq 1 {params.chrom}) - do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip - done > {output.mareydata} 2> {log} - - for c in $(seq 1 {params.chrom}) - do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip - done > {output.sexavg} 2> /dev/null - """ - -rule mareymaps: - input: - data = "13_MareyMaps/data.marey.gz", - sexavg = "13_MareyMaps/data.marey.sexavg.gz", - agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) - output: - indiv_plots = report(expand("13_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), category = "Marey Maps"), - summary = report("13_MareyMaps/LepAnchor.mareymaps.pdf", category = "Marey Maps") , - sequential = report("13_MareyMaps/LepAnchor.sequentialmaps.pdf", category = "Sequential Maps"), - SAsummary = report("13_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", category = "Marey Maps Sex Avg"), - SAsequential = report("13_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf", category = "Sequential Maps Sex Avg") - message: "Creating Marey Maps" - shell: - """ - Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 11_AGP/contigs - Rscript scripts/LASummary.r {input.data} true - Rscript scripts/LASummarySexAvg.r {input.sexavg} - """ - -rule generate_updated_intervals: - input: "13_MareyMaps/data.marey.gz" - output: "14_NewIntervals/LA.intervals.{lg_range}" - message: "Splitting out LG {params.chrom} from {input}" - params: - chrom = "{lg_range}" - shell: - """ - zgrep "LG{params.chrom}\s" {input} > {output} - """ - -rule trim_newintervals: - input: "14_NewIntervals/LA.intervals.{lg_range}" - output: - outfile = "15_TrimNewIntervals/LA.intervals.{lg_range}.trimmed", - plot = "15_TrimNewIntervals/plots/LA.intervals.{lg_range}.trim.pdf" - message: "Trimming edge clusters for {input}" - params: - edge = edgelen, - dist = trimdist - shell: "Rscript scripts/LATrim.r {input} {params.dist} {params.edge} 15_TrimNewIntervals" - -rule merge_trimplots: - input: expand("15_TrimNewIntervals/plots/LA.intervals.{lg}.trim.pdf", lg = lg_range) - output: "15_TrimNewIntervals/LA.trim.summary.pdf" - message: "Merging trimming plots into {output}" - shell: "convert -density 200 {input} {output}" - - -rule merge_trimmedintervals: - input: expand("15_TrimNewIntervals/LA.intervals.{lg}.trimmed", lg = lg_range) - output: "15_TrimNewIntervals/data.marey.trimmed.gz" - message: "Concatenating trimmed intervals to {output}" - shell: "cat {input} | gzip -c > {output}" - -rule plot_trimmedintervals: - input: "15_TrimNewIntervals/data.marey.trimmed.gz" - output: report("15_TrimNewIntervals/LepAnchor.mareymaps.pdf", category = "Trimmed Marey Maps") - shell: "Rscript scripts/LASummary.r {input}" \ No newline at end of file diff --git a/rules/LepAnchor/LA_extra.smk b/rules/LepAnchor/LA_extra.smk new file mode 100644 index 0000000..d98b15d --- /dev/null +++ b/rules/LepAnchor/LA_extra.smk @@ -0,0 +1,11 @@ +rule remove_haplos: + input: + bedfile = "10_Anchoring/map_extra.bed", + haplotypes = "10_Anchoring/suspected.haplotypes.after" + output: "10_Anchoring/map.nohaplotypes.bed" + message: "Creating bedfile with suspected haplotypes removed" + shell: + """ + grep -w -v -f <(cut -f 2 {input.haplotypes}) {input.bedfile} > {output} + #awk -f software/LepAnchor/scripts/removeHaplotypes.awk {input.bedfile} {input.haplotypes} > {output}" + """ \ No newline at end of file diff --git a/rules/LepAnchor/LepAnchor.smk b/rules/LepAnchor/LepAnchor.smk new file mode 100644 index 0000000..3215e79 --- /dev/null +++ b/rules/LepAnchor/LepAnchor.smk @@ -0,0 +1,58 @@ +from os import path +import glob + +configfile: "config.yaml" + +# PlaceAndOrientContigs # +data_type = config["lepanchor_input"] +geno = config["assembly"] +paf = config["PAF_file"] +proximity = config["proximity_file"] +haplo_limit = config["haplotype_limit"] +place_orient_extra = config["extra_params_PlaceOrient"] +# Repeat-Masking and Chainfile # +os_name = config["OS_info"] +# Map2Bed # +map2bed_extra = config["extra_params_Map2Bed"] +# CleanMap # +cleanmap_extra = config["extra_params_CleanMap"] +# Edge Trimming # +edgelen = config["LA_edge_length"] +trimdist = config["LA_trim_cutoff"] +# Other # +lg = config["lg_count"] +lg_range = list(range(1,lg+1)) + +include: "generate_inputs.smk" +include: "mask_and_chain.smk" +include: "place_orient.smk" +include: "place_orient_ii.smk" +include: "place_orient_iii.smk" +include: "build_agp.smk" +include: "build_fasta.smk" +include: "mareymaps_untrimmed.smk" +include: "trim_edges.smk" + +rule all: + input: + fasta = "12_Fasta/Anchored.contigs.fa.gz", + scaff = "12_Fasta/Anchored.scaffolds.fa.gz", + fastaonly = "12_Fasta/Anchored.contigs.only.fa.gz", + scaffonly = "12_Fasta/Anchored.scaffolds.only.fa.gz", + mareydata = "13_MareyMapsUntrimmed/data.marey.gz", + mareymaps = "13_MareyMapsUntrimmed/LepAnchor.mareymaps.pdf", + trimmedmareymaps = "16_MareyMapsTrimmed/LepAnchor.mareymaps.pdf", + trimmedmareydata= "16_MareyMapsTrimmed/data.marey.trimmed.gz", + trimsummary = "15_Trim/LA.trim.summary.pdf" + message: + """ + Lep-Anchor has finished. Good luck with the rest of your analyses! + + Output Files Location + ==================================================== + anchored assemblies | 12_Fasta/ + untrimmed marey maps | 13_MareyMapsUntrimmed/ + updated linkage maps | 14_NewIntervals/ + trimmed linkage maps | 15_Trim/ + trimmed marey maps | 16_MareyMapsTrimmed/ + """ \ No newline at end of file diff --git a/rules/LepAnchor/build_agp.smk b/rules/LepAnchor/build_agp.smk new file mode 100644 index 0000000..a3f19bc --- /dev/null +++ b/rules/LepAnchor/build_agp.smk @@ -0,0 +1,50 @@ +rule construct_agp: + input: + cleaned = "10_PlaceAndOrientContigs/overlaps_rm.la" + output: + agp = report("11_AGP/contigs/chr.{lg_range}.agp", category = "Contig AGP Files"), + scaff_agp = report("11_AGP/scaffolds/chr.{lg_range}.scaffolds.agp", category = "Scaffold AGP Files") + message: "Creating AGP files for linkage group {params.chrom}" + params: + chrom = "{lg_range}" + shell: + """ + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > {output.agp} + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > {output.scaff_agp} + """ + + +rule unused: + input: + lengths = "10_PlaceAndOrientContigs/contigs.length", + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), + output: + txt = "11_AGP/not_used_final.txt", + agp = "11_AGP/not_used.agp" + message: "Finding unused contigs" + shell: + """ + cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} + grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} + """ + + +rule build_final_agp: + input: + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), + scaff_agp = expand("11_AGP/scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range), + unused = "11_AGP/not_used.agp", + output: + contig_agp = "11_AGP/lepanchor.contigs.only.agp", + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp", + contig_all_agp = "11_AGP/lepanchor.contigs.all.agp", + scaff_all_agp = "11_AGP/lepanchor.scaffolds.all.agp" + message: "Generating final AGP files" + shell: + """ + cat {input.agp} > {output.contig_agp} + cat {input.scaff_agp} > {output.scaff_agp} + cat {input.agp} {input.unused} > {output.contig_all_agp} + cat {input.scaff_agp} {input.unused} > {output.scaff_all_agp} + """ \ No newline at end of file diff --git a/rules/LepAnchor/build_fasta.smk b/rules/LepAnchor/build_fasta.smk new file mode 100644 index 0000000..8aefe0a --- /dev/null +++ b/rules/LepAnchor/build_fasta.smk @@ -0,0 +1,50 @@ +rule build_scaffold_only_fasta: + input: + assembly = geno, + agp = "11_AGP/lepanchor.contigs.only.agp" + output: + fasta = "12_Fasta/Anchored.scaffolds.only.fa.gz", + message: "Constructing final scaffold-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} + """ + + +rule build_scaffold_contig_fasta: + input: + assembly = geno, + agp = "11_AGP/lepanchor.contigs.all.agp" + output: + fasta = "12_Fasta/Anchored.scaffolds.fa.gz", + message: "Constructing final scaffold fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} + """ + + +rule build_contig_only_fasta: + input: + assembly = geno, + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp" + output: + fasta = "12_Fasta/Anchored.contigs.only.fa.gz" + message: "Constructing final contig-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} + """ + + +rule build_contig_fasta: + input: + assembly = geno, + scaff_agp = "11_AGP/lepanchor.scaffolds.all.agp" + output: + fasta = "12_Fasta/Anchored.contigs.fa.gz" + message: "Constructing final contig fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} + """ \ No newline at end of file diff --git a/rules/LepAnchor/generate_inputs.smk b/rules/LepAnchor/generate_inputs.smk new file mode 100644 index 0000000..6c34c83 --- /dev/null +++ b/rules/LepAnchor/generate_inputs.smk @@ -0,0 +1,99 @@ +rule extract_markers: + input: "2_Filtering/data.filtered.lepmap3.gz" + output: report("snps.txt", category = "Data") + message: "Extracting marker information from Lep-Map3 data file {input}" + shell: "scripts/extract_markers.sh {input}" + + +rule generate_input_data: + input: + markers = "snps.txt", + data = expand("7_Intervals/ordered.{x}.intervals", x = range(1, lg + 1)) if data_type == "noIntervals=0" else expand("7_Distances/ordered.{x}.distances", x = range(1, lg + 1)) + output: + data = report("10_PlaceAndOrientContigs/lepanchor.input", category = "Data") + message: "Combining {params} Lep-Map3 files into single LepAnchor input {output}" + params: + lg = lg, + datatype = data_type + shell: + """ + for i in $(seq 1 {params.lg}); do + if [ {params.datatype} == "noIntervals=0" ]; then + awk -vn=$i '(NR==FNR){{map[NR-1]=$0}}(NR!=FNR){{$1=map[$1] "\t" n;print}}' {input.markers} 7_Intervals/ordered.$i.intervals >> {output} + else + tail -n +3 7_Distances/ordered.$i.distances | awk -vn=$i '(NR==FNR){{map[NR-1]=$0}}(NR!=FNR){{$1=map[$1] "\t" n;print}}' {input.markers} - >> {output} + fi + done + """ + + +rule contiglengths: + input: geno + output: report("10_PlaceAndOrientContigs/contigs.length", category = "Data") + message: "Getting contig lengths" + shell: "gunzip -fc {input} | awk -f software/LepAnchor/scripts/contigLength.awk > {output}" + + +rule find_haplotypes: + input: "9_Chain/chainfile.gz" + output: report("10_PlaceAndOrientContigs/suspected.haplotypes.before", category = "Logs") + message: "Finding non-haplotype contigs not included in map.bed" + shell: + """ + gunzip -fc {input} | awk -f software/LepAnchor/scripts/findFullHaplotypes.awk > {output} + """ + + +rule liftover: + input: + chain = "9_Chain/chainfile.gz", + intervals = "10_PlaceAndOrientContigs/lepanchor.input", + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before" + output: + lift = report("10_PlaceAndOrientContigs/liftover.la", category = "Lifted Intervals"), + sortedlift = report("10_PlaceAndOrientContigs/liftover.sorted.la", category = "Lifted Intervals") + message: "Running liftoverHaplotypes for the input maps" + shell: + """ + gunzip -fc {input.chain} | java -cp software/LepAnchor LiftoverHaplotypes map={input.intervals} haplotypes={input.haplos} chain=- > {output.lift} + cat {output.lift} | sort -V -k 1,1 -k 2,2n > {output.sortedlift} + """ + + +rule cleanmap: + input: "10_PlaceAndOrientContigs/liftover.sorted.la" + output: "10_PlaceAndOrientContigs/map_all.clean" + log: report("10_PlaceAndOrientContigs/cleamap.log", category = "Logs") + message: "Running CleanMap" + params: + extras = cleanmap_extra + shell: "java -cp software/LepAnchor CleanMap map={input} {params.extras} > {output} 2> {log}" + +rule map2bed: + input: + cleanmap = "10_PlaceAndOrientContigs/map_all.clean", + lengths = "10_PlaceAndOrientContigs/contigs.length", + output: "10_PlaceAndOrientContigs/map.bed" + log: report("10_PlaceAndOrientContigs/map2bed.log", category = "Logs") + message: "Running Map2Bed" + params: + extras = map2bed_extra + shell: "java -cp software/LepAnchor Map2Bed map={input.cleanmap} contigLength={input.lengths} {params.extras} > {output} 2> {log}" + + +rule ungrouped: + input: + lengths = "10_PlaceAndOrientContigs/contigs.length", + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", + bedfile = "10_PlaceAndOrientContigs/map.bed" + output: + bedfile = "10_PlaceAndOrientContigs/map_extra.bed" + message: "Finding contigs not put into chromosomes" + params: + chrom = lg + shell: + """ + cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos}; cut -f 1 {input.bedfile}) > 10_PlaceAndOrientContigs/unused_contigs.txt + grep -w -F -f 10_PlaceAndOrientContigs/unused_contigs.txt {input.lengths} | awk -vn={params.chrom} '{{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}}' > 10_PlaceAndOrientContigs/chr0.bed + cat {input.bedfile} 10_PlaceAndOrientContigs/chr0.bed > {output.bedfile} + """ diff --git a/rules/LepAnchor/mareymaps_untrimmed.smk b/rules/LepAnchor/mareymaps_untrimmed.smk new file mode 100644 index 0000000..bf93a90 --- /dev/null +++ b/rules/LepAnchor/mareymaps_untrimmed.smk @@ -0,0 +1,49 @@ +rule mareymap_data: + input: + lift = "10_PlaceAndOrientContigs/liftover.la", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) + output: + mareydata = "13_MareyMapsUntrimmed/data.marey.gz", + sexavg = "13_MareyMapsUntrimmed/data.marey.sexavg.gz" + log: report("13_MareyMapsUntrimmed/missing_scaffolds.txt", category = "Logs") + message: + """ + Marey map interval data + =========================================================== + first points in uncertainty intervals | {output.mareydata} + midpoints in uncertainty intervals | {output.sexavg} + """ + params: + chrom = lg + shell: + """ + for c in $(seq 1 {params.chrom}) + do + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip + done > {output.mareydata} 2> {log} + + for c in $(seq 1 {params.chrom}) + do + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip + done > {output.sexavg} 2> /dev/null + """ + + +rule mareymaps: + input: + data = "13_MareyMapsUntrimmed/data.marey.gz", + sexavg = "13_MareyMapsUntrimmed/data.marey.sexavg.gz", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) + output: + indiv_plots = report(expand("13_MareyMapsUntrimmed/LG.{lgs}.mareymap.png", lgs = lg_range), category = "Marey Maps"), + summary = report("13_MareyMapsUntrimmed/LepAnchor.mareymaps.pdf", category = "Marey Maps") , + sequential = report("13_MareyMapsUntrimmed/LepAnchor.sequentialmaps.pdf", category = "Sequential Maps"), + SAsummary = report("13_MareyMapsUntrimmed/LepAnchor.sexavg.mareymaps.pdf", category = "Marey Maps Sex Avg"), + SAsequential = report("13_MareyMapsUntrimmed/LepAnchor.sexavg.sequentialmaps.pdf", category = "Sequential Maps Sex Avg") + message: "Creating Marey Maps" + shell: + """ + Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 11_AGP/contigs + Rscript scripts/LASummary.r {input.data} true + Rscript scripts/LASummarySexAvg.r {input.sexavg} + """ \ No newline at end of file diff --git a/rules/LepAnchor/mask_and_chain.smk b/rules/LepAnchor/mask_and_chain.smk new file mode 100644 index 0000000..d4b90e3 --- /dev/null +++ b/rules/LepAnchor/mask_and_chain.smk @@ -0,0 +1,94 @@ +rule repeatmask: + input: geno + output: "8_Repeatmask/repeatmasked.fa.gz" + log: "8_Repeatmask/Red.log" + message: "Using Red to repeat-mask {input}" + threads: 30 + shell: + """ + file={input} + if [ "${{file: -3}}" == ".gz" ]; then + echo "- Assembly is compressed, creating decompressed copy" + file=$(basename $file .gz) + gunzip --stdout {input} > $file + fi + ext=$(echo $file | rev | cut -d"." -f1 | rev) + if [ $ext != "fa" ]; then + echo "- Assembly extension must end in .fa for Red, creating a corrected symlink" + ln -srf $file ${{file}}.fa + fi + echo "- Running Red" + software/LepAnchor/deps/Red -gnm . -msk 8_Repeatmask -sco 8_Repeatmask -cnd 8_Repeatmask -rpt 8_Repeatmask > {log} 2>> {log} + echo "- Compressing repeat-masked genome from Red" + gzip --stdout 8_Repeatmask/*.msk > {output} && rm 8_Repeatmask/*.msk + """ + + +rule chain_1: + input: + geno = "8_Repeatmask/repeatmasked.fa.gz", + ctrl = "software/LepAnchor/deps/all_lastz.ctl", + scoremtx = "software/LepAnchor/deps/scoreMatrix.q" + output: + out1 = "9_Chain/repeatmaskedx.sizes", + out2 = "9_Chain/repeatmasked.sizes" + message: "Running Lastz via HaploMerger2" + threads: 30 + params: + os = os_name + shell: + """ + OS=$(echo {params} | tr '[:upper:]' '[:lower:]') + echo "Using the $OS lastz/chainNet binaries" + if [ $OS == "ubuntu" ] + then + export PATH="$PATH:software/LepAnchor/deps/ubuntu" + elif [ $OS == "centos5" ] + then + export PATH="$PATH:software/LepAnchor/deps/centOS5" + elif [ $OS == "centos6" ] + then + export PATH="$PATH:software/LepAnchor/deps/centOS6" + else + echo "$OS is not recognized as one of Ubuntu, CentOS5, or CentOS6, defaulting to Ubuntu" + export PATH="$PATH:software/LepAnchor/deps/ubuntu" + fi + ln -srf {input} 9_Chain/ + cd 9_Chain + ../software/LepAnchor/deps/step1.HM2 repeatmasked {threads} + """ + + +rule chain_2: + input: + f1 = "9_Chain/repeatmaskedx.sizes", + f2 = "9_Chain/repeatmasked.sizes" + output: + original = "9_Chain/repeatmasked.repeatmaskedx.result/all.chain.gz", + slink = "9_Chain/chainfile.gz" + message: "Running HaploMerger2 to generate the chain file" + threads: 30 + params: + os = os_name + shell: + """ + OS=$(echo {params} | tr '[:upper:]' '[:lower:]') + echo "Using the $OS lastz/chainNet binaries" + if [ $OS == "ubuntu" ] + then + export PATH="$PATH:software/LepAnchor/deps/ubuntu" + elif [ $OS == "centos5" ] + then + export PATH="$PATH:software/LepAnchor/deps/centOS5" + elif [ $OS == "centos6" ] + then + export PATH="$PATH:software/LepAnchor/deps/centOS6" + else + echo "$OS is not recognized as one of Ubuntu, CentOS5, or CentOS6, defaulting to Ubuntu" + export PATH="$PATH:software/LepAnchor/deps/ubuntu" + fi + + cd 9_Chain + ../software/LepAnchor/deps/step2.HM2 repeatmasked {threads} && rm -r repeatmasked.repeatmaskedx.result/raw.axt + ln -sr ../{output.original} ../{output.slink} + """ diff --git a/rules/LepAnchor/place_orient.smk b/rules/LepAnchor/place_orient.smk new file mode 100644 index 0000000..dd3546c --- /dev/null +++ b/rules/LepAnchor/place_orient.smk @@ -0,0 +1,35 @@ +rule place_orient: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map_extra.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.la" + output: + chrom = "10_PlaceAndOrientContigs/orient_1/chr.{lg_range}.la", + haplos = "10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg_range}.haplo.suspected" + log: + chrom = report("10_PlaceAndOrientContigs/orient_1/logs/chr.{lg_range}.la.log", category = "Anchoring I Logs"), + haplos = "10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg_range}.haplo.all", + errors = "10_PlaceAndOrientContigs/orient_1/errors/chr.{lg_range}.errors" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type, + haplo = haplo_limit + threads: 3 + message: "Running the 1st round of PlaceAndOrientContigs for linkage group {params.chrom}" + shell: + """ + gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs numThreads={threads} bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {log.chrom} + sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} + sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} + grep "error$" {log.chrom} > {log.errors} + """ + + +rule mergehaplos: + input: expand("10_PlaceAndOrientContigs/orient_1/haplotypes/chr.{lg}.haplo.suspected", lg = lg_range) + output: "10_PlaceAndOrientContigs/suspected.haplotypes.after" + message: "Merging suspected haplotype contig information from the linkage groups" + shell: "cat {input} | sort | uniq > {output}" diff --git a/rules/LepAnchor/place_orient_ii.smk b/rules/LepAnchor/place_orient_ii.smk new file mode 100644 index 0000000..4bbb5f6 --- /dev/null +++ b/rules/LepAnchor/place_orient_ii.smk @@ -0,0 +1,60 @@ +rule liftover2: + input: + chain = "9_Chain/chainfile.gz", + intervals = "10_PlaceAndOrientContigs/lepanchor.input", + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.before", + haplos2 = "10_PlaceAndOrientContigs/suspected.haplotypes.after", + lengths = "10_PlaceAndOrientContigs/contigs.length" + output: + haplos = "10_PlaceAndOrientContigs/suspected.haplotypes.all", + lift = report("10_PlaceAndOrientContigs/liftover.nohaplotypes.la", category = "Lifted Intervals"), + sortedlift = report("10_PlaceAndOrientContigs/liftover.sorted.nohaplotypes.la", category = "Lifted Intervals"), + mapfile = "10_PlaceAndOrientContigs/map.nohaplotypes.clean", + bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.bed", + unused = "10_PlaceAndOrientContigs/not_used.nohaplotypes.txt", + chr0 = "10_PlaceAndOrientContigs/chr0.nohaplotypes.bed", + mapextra = "10_PlaceAndOrientContigs/map.nohaplotypes.extra.bed" + message: "Recreating bedfile omitting haplotypes discovered from PlaceAndOrientContigs" + params: + chrom = lg + shell: + """ + cat {input.haplos} {input.haplos2} > {output.haplos} + gunzip -fc {input.chain} | java -cp software/LepAnchor LiftoverHaplotypes map={input.intervals} haplotypes={output.haplos} chain=- > {output.lift} + cat {output.lift} | sort -V -k 1,1 -k 2,2n > {output.sortedlift} + java -cp software/LepAnchor CleanMap map={output.sortedlift} > {output.mapfile} + java -cp software/LepAnchor Map2Bed map={output.mapfile} contigLength={input.lengths} > {output.bedfile} + cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {output.haplos}; cut -f 1 {output.bedfile}) > {output.unused} + grep -w -F -f {output.unused} {input.lengths} | awk -vn={params.chrom} '{{s=$1"\t1\t"$2"\t?\t"; for (i=1;i<=n;++i) print s i}}' > {output.chr0} + cat {output.bedfile} {output.chr0} > {output.mapextra} + """ + + +rule place_orient2: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.extra.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.nohaplotypes.la" + output: + chrom = "10_PlaceAndOrientContigs/orient_2/chr.{lg_range}.la", + haplos = "10_PlaceAndOrientContigs/orient_2/haplotypes/chr.{lg_range}.haplo.suspected" + log: + chrom = report("10_PlaceAndOrientContigs/orient_2/logs/chr.{lg_range}.la.log", category = "Anchoring II Logs"), + haplos = "10_PlaceAndOrientContigs/orient_2/haplotypes/chr.{lg_range}.haplo.all", + errors = "10_PlaceAndOrientContigs/orient_2/errors/chr.{lg_range}.errors" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type, + haplo = haplo_limit + threads: 3 + message: "Running 2nd round of PlaceAndOrientContigs for linkage group {params.chrom}" + shell: + """ + gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs numThreads={threads} bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {log.chrom} + sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} + sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} + grep "error$" {log.chrom} > {log.errors} + """ \ No newline at end of file diff --git a/rules/LepAnchor/place_orient_iii.smk b/rules/LepAnchor/place_orient_iii.smk new file mode 100644 index 0000000..5938a5e --- /dev/null +++ b/rules/LepAnchor/place_orient_iii.smk @@ -0,0 +1,74 @@ +rule propogate: + input: + placed = expand("10_PlaceAndOrientContigs/orient_2/chr.{lgs}.la", lgs = lg_range), + bedfile = "10_PlaceAndOrientContigs/map.nohaplotypes.bed" + output: + propogated = "10_PlaceAndOrientContigs/map.propogated.bed", + tmp_prop = temp(expand("10_PlaceAndOrientContigs/propogate/propogated.{lgs}.la", lgs = range(lg + 1))), + message: "Propogating ...something" + shell: + """ + awk -f software/LepAnchor/scripts/propagate.awk {input.placed} > 10_PlaceAndOrientContigs/tmp1.la + awk -f software/LepAnchor/scripts/propagate.awk 10_PlaceAndOrientContigs/tmp1.la > 10_PlaceAndOrientContigs/tmp2.la + i=2 + + while ! cmp -s "10_PlaceAndOrientContigs/tmp$i.la" "10_PlaceAndOrientContigs/tmp$(( $i-1 )).la" ;do + awk -f software/LepAnchor/scripts/propagate.awk 10_PlaceAndOrientContigs/tmp$i.la > 10_PlaceAndOrientContigs/tmp$[$i+1].la + i=$[$i+1] + done + #create prop*.la + awk '/^[^#]/{{++d[$1 "\t" $7+0 "\t" $8+0]; data[++line]=$0}}END{{for (i = 1; i <= line; ++i) {{$0=data[i];if (d[$1 "\t" $7+0 "\t" $8+0] == 1) fn="10_PlaceAndOrientContigs/propogate/propogated."$5".la"; else if ($5==1) fn="10_PlaceAndOrientContigs/propogate/propogated.0.la"; else fn=""; if (fn != "") print $0>fn}}}}' 10_PlaceAndOrientContigs/tmp$i.la + + #create a new bed by combining propogated.[1-9]*.la and map.nohaplotypes.bed + awk '(NR==FNR){{print;c[$1]}}(NR!=FNR && !($1 in c)){{print $1 "\t" $7+0 "\t" $8+0"\t?\t"$5}}' {input.bedfile} {output.tmp_prop} > {output.propogated} + rm 10_PlaceAndOrientContigs/tmp*.la + """ + + +rule place_orient3: + input: + chain = "9_Chain/chainfile.gz", + bedfile = "10_PlaceAndOrientContigs/map.propogated.bed", + paf = paf, + prox = proximity, + lift = "10_PlaceAndOrientContigs/liftover.nohaplotypes.la" + output: + chrom = "10_PlaceAndOrientContigs/orient_3/ichr.{lg_range}.la", + haplos = "10_PlaceAndOrientContigs/orient_3/haplotypes/chr.{lg_range}.haplo.suspected" + log: + chrom = report("10_PlaceAndOrientContigs/orient_3/logs/ichr.{lg_range}.la.log", category = "Anchoring III Logs"), + haplos = "10_PlaceAndOrientContigs/orient_3/haplotypes/chr.{lg_range}.haplo.all", + errors = "10_PlaceAndOrientContigs/orient_3/errors/chr.{lg_range}.errors" + params: + chrom = "{lg_range}", + extras = place_orient_extra, + datatype = data_type, + haplo = haplo_limit + message: "Running 3rd round of PlaceAndOrientContigs for linkage group {params.chrom}" + shell: + """ + gunzip -fc {input.chain} | java -cp software/LepAnchor PlaceAndOrientContigs bed={input.bedfile} chromosome={params.chrom} map={input.lift} chain=- paf={input.paf} proximity={input.prox} {params.datatype} {params.extras} > {output.chrom} 2> {log.chrom} + sort -n -r {log.chrom} | awk '($NF=="haplotype" && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {log.haplos} + sort -n -r {log.chrom} | awk -vlimit={params.haplo} '($NF=="haplotype" && ($1>=($4-$3+1-limit)/limit) && (!(($5 SUBSEP $6 SUBSEP $7) in h))){{h[$2,$3,$4]; print}}' > {output.haplos} + grep "error$" {log.chrom} > {log.errors} + """ + + +rule prune: + input: + oriented = expand("10_PlaceAndOrientContigs/orient_3/ichr.{lgs}.la", lgs = lg_range), + bedfile = "10_PlaceAndOrientContigs/map.propogated.bed" + output: + pruned = report("10_PlaceAndOrientContigs/orient_3/pruned.la", category = "Logs"), + cleaned = report("10_PlaceAndOrientContigs/overlaps_rm.la", category = "Logs") + message: "Pruning contig blocks without map support and removing overlaps" + params: + chrom = lg + shell: + """ + for i in $(seq {params.chrom}) + do + awk -f software/LepAnchor/scripts/prune.awk 10_PlaceAndOrientContigs/orient_3/ichr.$i.la > 10_PlaceAndOrientContigs/orient_3/ichr.${{i}}.pruned.la + done 2> {output.pruned} + awk -f software/LepAnchor/scripts/removeOverlaps.awk {input.bedfile} 10_PlaceAndOrientContigs/orient_3/ichr.*.pruned.la > {output.cleaned} + """ \ No newline at end of file diff --git a/rules/LepAnchor/trim_edges.smk b/rules/LepAnchor/trim_edges.smk new file mode 100644 index 0000000..627c025 --- /dev/null +++ b/rules/LepAnchor/trim_edges.smk @@ -0,0 +1,42 @@ +rule generate_updated_intervals: + input: "13_MareyMapsUntrimmed/data.marey.gz" + output: "14_NewIntervals/LA.intervals.{lg_range}" + message: "Splitting out LG {params.chrom} from {input}" + params: + chrom = "{lg_range}" + shell: + """ + zgrep "LG{params.chrom}\s" {input} > {output} + """ + + +rule trim_newintervals: + input: "14_NewIntervals/LA.intervals.{lg_range}" + output: + outfile = "15_Trim/LA.intervals.{lg_range}.trimmed", + plot = "15_Trim/plots/LA.intervals.{lg_range}.trim.pdf" + message: "Trimming edge clusters for {input}" + params: + edge = edgelen, + dist = trimdist + shell: "Rscript scripts/LATrim.r {input} {params.dist} {params.edge} 15_Trim" + + +rule merge_trimplots: + input: expand("15_Trim/plots/LA.intervals.{lg}.trim.pdf", lg = lg_range) + output: "15_Trim/LA.trim.summary.pdf" + message: "Merging trimming plots into {output}" + shell: "convert -density 200 {input} {output}" + + +rule merge_trimmedintervals: + input: expand("15_Trim/LA.intervals.{lg}.trimmed", lg = lg_range) + output: "16_MareyMapsTrimmed/data.marey.trimmed.gz" + message: "Concatenating trimmed intervals to {output}" + shell: "cat {input} | gzip -c > {output}" + + +rule plot_trimmedintervals: + input: "16_MareyMapsTrimmed/data.marey.trimmed.gz" + output: report("16_MareyMapsTrimmed/LepAnchor.mareymaps.pdf", category = "Trimmed Marey Maps") + shell: "Rscript scripts/LASummary.r {input}" \ No newline at end of file diff --git a/rules/LepMap3.smk b/rules/LepMap3/LepMap3.smk similarity index 79% rename from rules/LepMap3.smk rename to rules/LepMap3/LepMap3.smk index ad62f8a..a7c9f93 100644 --- a/rules/LepMap3.smk +++ b/rules/LepMap3/LepMap3.smk @@ -7,26 +7,33 @@ configfile: "config.yaml" # data # vcf = config["vcf"] pedigree = config["pedigree"] +parentcall_extra = config["extra_params_ParentCall"] # filtering # data_tol=config["data_tol"] +filtering_extra = config["extra_params_Filtering"] # separate chromosomes # lod_max = str(config["lod_max"]) lod_range = list(range(config["lod_min"], config["lod_max"]+1)) informative = config["informative"] +sepchrom_extra = config["extra_params_SeparateChromosomes"] # join singles # joinsingles = config["run_joinsingles2all"] lod_lim = config["lod_limit"] lod_diff = config["lod_difference"] +js2a_extra = config["extra_params_JoinSingles"] # ordering # lg_range = list(range(1, config["exp_lg"]+1)) lg_count = config["exp_lg"] -threads_per = config["threads_per"] -dist_method = config["dist_method"] ITER = config["iterations"] -phasenum = config["phase_iterations"] +order_extra = config["extra_params_OrderMarkers"] # trimming # edge_len = str(config["edge_length"]) trim_thresh = str(config["trim_cutoff"]) +# ordering II # +reorder_extra = config["extra_params_reOrderMarkers"] +ITER2 = round(ITER/2) +# distances # +dist_method = config["distance_method"] include: "prepare_data.smk" include: "generate_map.smk" diff --git a/rules/distances.smk b/rules/LepMap3/distances.smk similarity index 71% rename from rules/distances.smk rename to rules/LepMap3/distances.smk index 196664f..ef94e55 100644 --- a/rules/distances.smk +++ b/rules/LepMap3/distances.smk @@ -5,6 +5,7 @@ rule calculate_distances: output: distance = "7_Distances/ordered.{lg_range}.distances", sex_averaged = "7_DistancesSexAverage/ordered.{lg_range}.sexavg", + sex_averagedtmp = temp("7_DistancesSexAverage/logs/.ordered.{lg_range}.sexavg"), intervals = "7_Intervals/ordered.{lg_range}.intervals" message: "Calculating marker distances and intervals for linkage group: {params.lg}" log: @@ -13,15 +14,15 @@ rule calculate_distances: params: dist_method = dist_method, lg = "{lg_range}" - threads: threads_per + threads: 2 shell: """ cp {input.lg} {output.distance} - zcat {input.data_call} | java -cp software/LepMap3 OrderMarkers2 data=- evaluateOrder={input.lg} {params.dist_method} numThreads={threads} improveOrder=0 sexAveraged=1 &> {log.sex_averaged}.tmp - sed -i -e 's/LG \= 0/LG \= {params.lg}/g' {log.sex_averaged}.tmp - sed -n '/\*\*\* LG \=/,$p' {log.sex_averaged}.tmp > {output.sex_averaged} - awk '/#java/{{flag=1}} flag; /*** LG =/{{flag=0}}' {log.sex_averaged}.tmp > {log.sex_averaged} && rm {log.sex_averaged}.tmp + zcat {input.data_call} | java -cp software/LepMap3 OrderMarkers2 data=- evaluateOrder={input.lg} {params.dist_method} numThreads={threads} improveOrder=0 sexAveraged=1 &> {output.sex_averagedtmp} + sed -i -e 's/LG \= 0/LG \= {params.lg}/g' {output.sex_averagedtmp} + sed -n '/\*\*\* LG \=/,$p' {output.sex_averagedtmp} > {output.sex_averaged} + awk '/#java/{{flag=1}} flag; /*** LG =/{{flag=0}}' {output.sex_averagedtmp} > {log.sex_averaged} zcat {input.data_call} | java -cp software/LepMap3 OrderMarkers2 data=- evaluateOrder={input.lg} {params.dist_method} numThreads={threads} calculateIntervals={output.intervals} > {log.intervals} 2>&1 """ \ No newline at end of file diff --git a/rules/generate_map.smk b/rules/LepMap3/generate_map.smk similarity index 51% rename from rules/generate_map.smk rename to rules/LepMap3/generate_map.smk index 7605f03..1adb2cb 100644 --- a/rules/generate_map.smk +++ b/rules/LepMap3/generate_map.smk @@ -1,48 +1,48 @@ rule separate_chromosomes: input: "2_Filtering/data.filtered.lepmap3.gz" - output: "3_SeparateChromosomes/map.{lod_range}" - log: "3_SeparateChromosomes/logs/map.{lod_range}.log" - message: "Creating map for lodLimit={params.lod} >> {output}" + output: "3_SeparateChromosomes/LOD.{lod_range}" + log: "3_SeparateChromosomes/logs/LOD.{lod_range}.log" + message: "Clustering markers for lodLimit={params.lod} >> {output}" threads: 30 params: lod = "{lod_range}", - dist_lod = "distortionLod=1", + extra = sepchrom_extra, shell: """ - zcat {input} | java -cp software/LepMap3 SeparateChromosomes2 data=- sizeLimit=5 {informative} lodLimit={params.lod} {params.dist_lod} numThreads={threads} > {output} 2> {log} + zcat {input} | java -cp software/LepMap3 SeparateChromosomes2 data=- {params.extra} {informative} lodLimit={params.lod} numThreads={threads} > {output} 2> {log} """ rule map_summary: - input: expand("3_SeparateChromosomes/map.{LOD}", LOD = lod_range) - output: "3_SeparateChromosomes/all.maps.summary" + input: expand("3_SeparateChromosomes/LOD.{LOD}", LOD = lod_range) + output: "3_SeparateChromosomes/all.LOD.summary" message: "Summarizing SeperateChromosomes2 maps >> {output}" shell: "scripts/MapSummary.r 3_SeparateChromosomes" rule join_singles: input: datacall = "2_Filtering/data.filtered.lepmap3.gz", - map_summ = "3_SeparateChromosomes/all.maps.summary" - output: "map.master" - log: "3_SeparateChromosomes/chosen.map" + map_summ = "3_SeparateChromosomes/all.LOD.summary" + output: "LOD.master" + log: "3_SeparateChromosomes/chosen.LOD" threads: 30 message: "Joining singles to linkage groups" params: run_js2all = joinsingles, lod_limit = lod_lim, lod_diff = lod_diff, - iterate = "iterate=1", + extra = js2a_extra shell: """ - echo -n -e '\nWhich map would you like to use (e.g. map.15)? map.' + echo -n -e '\nWhich map would you like to use (e.g. LOD.15)? LOD.' read -r - echo -e "# the map chosen to use with OrderMarkers2\nmap.$REPLY" > {log} + echo -e "# the map chosen to use with OrderMarkers2\nLOD.$REPLY" > {log} echo "A record of your choice can be found in {log}" JS2A=$(echo {params.run_js2all} | tr '[:upper:]' '[:lower:]') if [ $JS2A == "true" ]; then - zcat {input.datacall} | java -cp software/LepMap3 JoinSingles2All map=3_SeparateChromosomes/map.$REPLY data=- {params.lod_limit} {params.lod_diff} {params.iterate} numThreads={threads} > {output} + zcat {input.datacall} | java -cp software/LepMap3 JoinSingles2All map=3_SeparateChromosomes/LOD.$REPLY data=- {params.extra} {params.lod_limit} {params.lod_diff} numThreads={threads} > {output} else echo -e "\nSkipping JoinSingles2All and creating a symlink instead" - ln -sr 3_SeparateChromosomes/map.$REPLY {output} + ln -sr 3_SeparateChromosomes/LOD.$REPLY {output} fi - sleep 4s + sleep 2s """ diff --git a/rules/LepMap3/order.smk b/rules/LepMap3/order.smk new file mode 100644 index 0000000..885924d --- /dev/null +++ b/rules/LepMap3/order.smk @@ -0,0 +1,31 @@ +rule order_markers: + input: + datacall = "2_Filtering/data.filtered.lepmap3.gz", + filt_map = "LOD.master" + output: "4_OrderMarkers/ordered.{lg_range}" + log: + runlog = temp("4_OrderMarkers/logs/ordered.{lg_range}.running"), + run = "4_OrderMarkers/logs/ordered.{lg_range}.log", + recomb = "4_OrderMarkers/recombination/ordered.{lg_range}.recombinations" + message: "Ordering linkage group {params.chrom} with {params.iterations} iterations" + params: + chrom = "{lg_range}", + iterations = ITER, + extra = order_extra + threads: 2 + shell: + """ + zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 map={input.filt_map} {params.extra} data=- numThreads={threads} numMergeIterations={params.iterations} chromosome={params.chrom} &> {log.runlog} + sed -n '/\*\*\* LG \=/,$p' {log.runlog} > {output} + grep "recombin" {log.runlog} > {log.recomb} + awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {log.runlog} > {log.run} + """ + +rule recomb_summary: + input: expand("4_OrderMarkers/ordered.{lg}", lg = lg_range) + output: "4_OrderMarkers/recombination/recombination.summary" + message: "Recombination summary: {output}" + shell: + """ + Rscript scripts/RecombinationSummary.r 4_OrderMarkers/recombination > {output} + """ \ No newline at end of file diff --git a/rules/LepMap3/prepare_data.smk b/rules/LepMap3/prepare_data.smk new file mode 100644 index 0000000..fa7b5b9 --- /dev/null +++ b/rules/LepMap3/prepare_data.smk @@ -0,0 +1,26 @@ +rule parent_call: + input: + vcf = vcf, + pedigree = pedigree + output: "1_ParentCall/data.lepmap3.gz" + message: "Creating Lep-Map3 data file from {input.vcf} and {input.pedigree}" + params: + extra = parentcall_extra + shell: "java -cp software/LepMap3 ParentCall2 data={input.pedigree} vcfFile={input.vcf} {params} | gzip > {output}" + +rule filtering: + input: "1_ParentCall/data.lepmap3.gz" + output: "2_Filtering/data.filtered.lepmap3.gz" + message: "Filtering {input}" + params: + data_tolerance = data_tol, + extra = filtering_extra + shell: + """ + if [ {params.data_tolerance} == 0 ]; then + echo "Skipping Filtering2 and creating symlink {output} instead" + ln -sr {input} {output} + else + zcat {input} | java -cp software/LepMap3 Filtering2 data=- dataTolerance={params.data_tolerance} {params.extra} | gzip > {output} + fi + """ diff --git a/rules/LepMap3/reorder.smk b/rules/LepMap3/reorder.smk new file mode 100644 index 0000000..819d3c1 --- /dev/null +++ b/rules/LepMap3/reorder.smk @@ -0,0 +1,32 @@ +rule reorder_markers: + input: + datacall = "2_Filtering/data.filtered.lepmap3.gz", + filt_map = "LOD.master", + lg_order = "5_Trim/ordered.{lg_range}.trimmed" + output: "6_OrderMarkers/ordered.{lg_range}" + log: + runlog = temp("6_OrderMarkers/logs/ordered.{lg_range}.running"), + run = "6_OrderMarkers/logs/ordered.{lg_range}.log", + recomb = "6_OrderMarkers/recombination/ordered.{lg_range}.recombination" + message: "Reordering linkage group {params.lg} with {params.iterations} iterations" + params: + lg = "{lg_range}", + iterations = ITER2, + extra = reorder_extra + threads: 2 + shell: + """ + zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 {params.extra} map={input.filt_map} data=- numThreads={threads} evaluateOrder={input.lg_order} numMergeIterations={params.iterations} &> {log.runlog} + sed -n '/\*\*\* LG \=/,$p' {log.runlog} > {output} + grep "recombin" {log.runlog} > {log.recomb} + awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {log.runlog} > {log.run} + """ + +rule reorder_summary: + input: expand("6_OrderMarkers/ordered.{lg}", lg = lg_range) + output: "6_OrderMarkers/recombination/recombination.summary" + message: "Recombination summary of reordering: {output}" + shell: + """ + Rscript scripts/RecombinationSummary.r 6_OrderMarkers/recombination > {output} + """ \ No newline at end of file diff --git a/rules/trim.smk b/rules/LepMap3/trim.smk similarity index 82% rename from rules/trim.smk rename to rules/LepMap3/trim.smk index b2295eb..7f122c1 100644 --- a/rules/trim.smk +++ b/rules/LepMap3/trim.smk @@ -7,10 +7,10 @@ rule trim_edge_clusters: params: trim_threshold = trim_thresh, edge_length = edge_len - message: "Removing edge clusters >{params.trim_threshold}cM apart from the other markers at the ends of {input}" + message: "Removing edge clusters >{params.trim_threshold}%cM apart from the other markers at the ends of {input}" shell: """ - Rscript scripts/LepWrapTrim.r {input} {params.trim_threshold} {params.edge_length} + Rscript scripts/LepWrapTrim.r {input} {params.trim_threshold} {params.edge_length} 5_Trim """ rule trim_summary: @@ -31,9 +31,8 @@ rule trim_summary: """ for each in 5_Trim/logs/ordered.*.removed ; do BASE=$(basename $each | cut -d "." -f1,2) - sed -e "s/^/$BASE /" $each >> {output.detailed}.tmp + sed -e "s/^/$BASE /" $each done | sort -V > {output.detailed} - #sort -V {output.detailed}.tmp >> {output.detailed} && rm {output.detailed}.tmp scripts/TrimCounts.r {output.detailed} {params.lg} > {output.summary} scripts/TrimSummaryPlot.r {output.summary} echo "Merging QC plots for all linkage groups" diff --git a/rules/order.smk b/rules/order.smk deleted file mode 100644 index cfd1ed0..0000000 --- a/rules/order.smk +++ /dev/null @@ -1,33 +0,0 @@ -rule order_markers: - input: - datacall = "2_Filtering/data.filtered.lepmap3.gz", - filt_map = "map.master" - output: "4_OrderMarkers/ordered.{lg_range}" - log: - run = "4_OrderMarkers/logs/ordered.{lg_range}.log", - recomb = "4_OrderMarkers/recombination/ordered.{lg_range}.recombinations" - message: "Ordering the markers with {params.dist_method} on linkage group {params.chrom}" - params: - dist_method = dist_method, - chrom = "{lg_range}", - iterations = ITER, - phase = phasenum - threads: threads_per - shell: - """ - zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 map={input.filt_map} data=- numThreads={threads} numMergeIterations={params.iterations} {params.dist_method} chromosome={params.chrom} phasingIterations={params.phase} &> {log.run}.tmp - sed -n '/\*\*\* LG \=/,$p' {log.run}.tmp > {output} - grep "recombin" {log.run}.tmp > {log.recomb} - awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {log.run}.tmp > {log.run} && rm {log.run}.tmp - """ - -rule recomb_summary: - input: - expand("4_OrderMarkers/ordered.{lg}", lg = lg_range) - output: - recomb = "4_OrderMarkers/recombination/recombination.summary" - message: "Recombination summary: {output.recomb}" - shell: - """ - Rscript scripts/RecombinationSummary.r 4_OrderMarkers/recombination - """ \ No newline at end of file diff --git a/rules/prepare_data.smk b/rules/prepare_data.smk deleted file mode 100644 index 02e8cc5..0000000 --- a/rules/prepare_data.smk +++ /dev/null @@ -1,24 +0,0 @@ -rule parent_call: - input: - vcf = vcf, - pedigree = pedigree - output: - "1_ParentCall/data.lepmap3.gz" - message: "Creating Lep-Map3 data file from {input.vcf} and {input.pedigree}" - shell: - "java -cp software/LepMap3 ParentCall2 data={input.pedigree} vcfFile={input.vcf} removeNonInformative=1 | gzip > {output}" - -rule filtering: - input: "1_ParentCall/data.lepmap3.gz" - output: "2_Filtering/data.filtered.lepmap3.gz" - message: "Filtering {input}" - params: - data_tolerance = data_tol - shell: - """ - if [ {params} == 0 ]; then - ln -sr {input} {output} - else - zcat {input} | java -cp software/LepMap3 Filtering2 data=- dataTolerance={params} | gzip > {output} - fi - """ diff --git a/rules/reorder.smk b/rules/reorder.smk deleted file mode 100644 index 9f3ad9d..0000000 --- a/rules/reorder.smk +++ /dev/null @@ -1,34 +0,0 @@ -rule reorder_markers: - input: - datacall = "2_Filtering/data.filtered.lepmap3.gz", - filt_map = "map.master", - lg_order = "5_Trim/ordered.{lg_range}.trimmed" - output: - "6_OrderMarkers/ordered.{lg_range}" - log: - run = "6_OrderMarkers/logs/ordered.{lg_range}.log", - recomb = "6_OrderMarkers/recombination/ordered.{lg_range}.recombination" - message: "Reordering linkage group {params.lg} with {params.iterations} iterations" - params: - dist_method = dist_method, - lg = "{lg_range}", - iterations = ITER - threads: threads_per - shell: - """ - zcat {input.datacall} | java -cp software/LepMap3 OrderMarkers2 map={input.filt_map} data=- numThreads={threads} evaluateOrder={input.lg_order} {params.dist_method} numMergeIterations={params.iterations} &> {log.run}.tmp - sed -n '/\*\*\* LG \=/,$p' {log.run}.tmp > {output} - grep "recombin" {log.run}.tmp > {log.recomb} - awk '/#java/{{flag=1}} flag; /logL/{{flag=0}}' {log.run}.tmp > {log.run} && rm {log.run}.tmp - """ - -rule reorder_summary: - input: - expand("6_OrderMarkers/ordered.{lg}", lg = lg_range) - output: - recomb = "6_OrderMarkers/recombination/recombination.summary" - message: "Recombination summary of reordering: {output.recomb}" - shell: - """ - Rscript scripts/RecombinationSummary.r 6_OrderMarkers/recombination - """ \ No newline at end of file diff --git a/scripts/FilterLinkageMap.r b/scripts/FilterLinkageMap.r new file mode 100644 index 0000000..31fa7ff --- /dev/null +++ b/scripts/FilterLinkageMap.r @@ -0,0 +1,129 @@ +# This R file performs an adaptive method of filtering a linkage map +# It works by creating a spline on the linkage map, then performing +# a sliding window analysis on the residuals of the spline, calculating +# a local threshold of 2 x mean(local residuals) to flag markers as +# possible outliers. + +# More specifically, lgFilter creates two splines ("higher" and +# "lower" resolution), and performs a 50% overlapping sliding +# window analysis with a different window size for each resolution. +# Pass/Fails go into a simple penalty matrix and markers >= 3 penalties are +# finally flagged as FAIL. 3 fails means the marker failed both times for one +# sliding window and at least once for the other. End-markers that only get +# tested 2x (lack of overlaps) incur double-penalties. + +library(tidyverse) +library(splines) + +# will require un-gzipped mareydata +marey <- read_delim("data.marey.trimmed", delim = "\t", col_names = FALSE) %>% + select(X3, X2, X5, X6) %>% + rename(LG = X3, MB = X2, MALE = X5, FEMALE = X6) %>% + rowwise() %>% + mutate(AVG = mean(c(MALE,FEMALE))) %>% + pivot_longer(c(MALE,FEMALE), values_to = "CM", names_to = "SEX") + +lgFilter <- function(mb,cm){ + n <- length(mb) + knots <- round(n/100) + .spline <- lm(cm ~ bs(mb, knots = knots)) + res <- .spline$residuals + # mb = physical position vector + # cm = genetic position vector + # divide by 10 and round to next even integer + windowsize <- 2 * round((n/10)/2) + slide <- windowsize/2 + # sliding window of `windowsize`, sliding at half its length each iteration + # the input vector is divided into 10 ~even windows + windowseq <- seq(from = 1, to = n-slide, by = slide ) + # create empty penalty matrix + penalty <- rep(0, n) + for (.window in windowseq){ + # workaround for last window not being an even size + if (.window == windowseq[length(windowseq)]){ + .residuals <- res[.window:n] + } else{ + .residuals <- res[.window:(.window + windowsize-1)] + } + threshold <- mean(abs(.residuals)) * 2 + # workaround like above + # add +1 penalty to end-markers b/c they dont get scanned 4x like the others + if (.window == windowseq[length(windowseq)]){ + penalty[.window:n] <- penalty[.window:n] + (abs(.residuals) > threshold) + #slide window up to the halfway point + halfwindow = median(.window:n) + # double the penalty for the end points that only get scanned 2x rather than 4x + penalty[halfwindow:n] <- penalty[halfwindow:n] + 1 + } else { + penalty[.window:(.window+windowsize-1)] <- penalty[.window:(.window+windowsize-1)] + (abs(.residuals) > threshold) + } + } + # a second window ~30% smaller + knots <- round(n/70) + .spline <- lm(cm ~ bs(mb, knots = knots)) + res <- .spline$residuals + windowsize <- 2 * round((n/7)/2) + slide <- windowsize/2 + windowseq <- seq(from = 1, to = n-slide, by = slide ) + for (.window in windowseq){ + if (.window == windowseq[length(windowseq)]){ + .residuals <- res[.window:n] + } else{ + .residuals <- res[.window:(.window + windowsize-1)] + } + threshold <- mean(abs(.residuals)) * 2 + # workaround like above + if (.window == windowseq[length(windowseq)]){ + penalty[.window:n] <- penalty[.window:n] + (abs(.residuals) > threshold) + #slide window up to the halfway point + halfwindow = median(.window:n) + # double the penalty for the end points that only get scanned 2x rather than 4x + penalty[halfwindow:n] <- penalty[halfwindow:n] + 1 + } else { + penalty[.window:(.window+windowsize-1)] <- penalty[.window:(.window+windowsize-1)] + (abs(.residuals) > threshold) + } + } + qual <- penalty >= 3 + # convert to PASS/FAIL instead of TRUE/FALSE + qual[qual] <- "FAIL" + qual[qual == FALSE] <- "PASS" + return(qual) +} + +filter_df <- marey %>% filter(SEX == "MALE") %>% + group_by(LG) %>% + arrange(MB) %>% + group_by(LG) %>% + mutate(QC = lgFilter(MB, CM)) + +filter_df %>% + ggplot(aes(x = MB, y = CM)) + + geom_point(aes(color = QC)) + + facet_wrap(~LG, scales = "free") + + labs( + title = "Sex-Averaged markers removed by filtering" + ) + +ggsave("sexavg.filtering.png", width = 10, height = 10, units = "in") + +filter_df %>% filter(QC == "PASS") %>% + ggplot(aes(x = MB, y = CM)) + + geom_point(color = "dodgerblue", shape = 20, alpha = 0.5) + + facet_wrap(~LG, scales = "free") + + labs( + title = "Filtered Sex-Averaged Linkage Maps" + ) +ggsave("sexavg.filtered.png", width = 10, height = 10, units = "in") + +marey %>% + group_by(LG, SEX) %>% + arrange(MB) %>% + group_by(LG) %>% + mutate(QC = lgFilter(MB, CM)) %>% + ggplot(aes(x = MB, y = CM)) + + geom_point(aes(color = QC, shape = SEX)) + + facet_wrap(~LG, scales = "free") + + labs( + title = "Markers removed by filtering" + ) +ggsave("malefemale.filtered.png", width = 10, height = 10, units = "in") diff --git a/scripts/LAMidpointSummary.r b/scripts/LAMidpointSummary.r deleted file mode 100755 index 23dd908..0000000 --- a/scripts/LAMidpointSummary.r +++ /dev/null @@ -1,68 +0,0 @@ -#! /usr/bin/env Rscript - -suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) - -args <- commandArgs(trailingOnly = TRUE) -# args[1] = mareydata file -# args[2] = marey midpoint data (optional) - -allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% - select(X3, X2, X5) %>% - rename(lg = X3, Mb = X2, cM = X5) %>% - mutate(Mb = Mb/1000000) %>% - group_by(lg) %>% - mutate(marker = seq_along(Mb)) - -allmaps %>% - ggplot(aes(x = Mb, y = cM)) + - geom_point(size = 0.6, color = "dodgerblue") + - labs( - title = "Marker Positions in Anchored+Oriented Assembly", - subtitle = "The relative marker positions vs their position in the chromosome/LG", - x = "Physical Position (Mbp)", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) - ) + - facet_wrap(~lg, ncol = 4, scales = "free") - -outfile <- paste0(dirname(args[1]), "/LepAnchor.midpoint.mareymaps.pdf") -savedims <- length(unique(allmaps$lg)) * 2.5 - -ggsave(outfile, width = 8.5, height = savedims/4, units = "in") - -allmaps %>% - ggplot(aes(x = marker, y = cM)) + - geom_point(size = 0.6, alpha = 0.5, color = "dodgerblue") + - labs( - title = "Relative Marker Positions within Linkge Groups", - subtitle = "The distance of sequential markers from each other in the linkage maps", - x = "Marker Number", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) - ) + - facet_wrap(~lg, ncol = 4, scales = "free_x") - -outfile2 <- paste0(dirname(args[1]), "/LepAnchor.midpoint.sequentialmaps.pdf") -ggsave(outfile2, width = 8.5, height = savedims/5, units = "in", limitsize = FALSE) \ No newline at end of file diff --git a/scripts/LASummary.r b/scripts/LASummary.r index 5d7b440..63ab938 100755 --- a/scripts/LASummary.r +++ b/scripts/LASummary.r @@ -12,7 +12,9 @@ allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% group_by(lg) %>% mutate(marker = seq_along(Mb)) %>% pivot_longer(c(male, female), names_to = "sex", values_to = "cM" ) - + +pdf(NULL) + allmaps %>% ggplot(aes(x = Mb, y = cM, color = sex)) + geom_point(size = 0.6) + diff --git a/scripts/LASummarySexAvg.r b/scripts/LASummarySexAvg.r index e176af4..8537843 100755 --- a/scripts/LASummarySexAvg.r +++ b/scripts/LASummarySexAvg.r @@ -12,7 +12,10 @@ allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% mutate(Mb = Mb/1000000) %>% group_by(lg) %>% mutate(marker = seq_along(Mb)) - + +pdf(NULL) + + allmaps %>% ggplot(aes(x = Mb, y = cM)) + geom_point(size = 0.6, color = "dodgerblue") + diff --git a/scripts/LATrim.r b/scripts/LATrim.r index d43110b..3ab5f23 100755 --- a/scripts/LATrim.r +++ b/scripts/LATrim.r @@ -43,14 +43,17 @@ dist_thresh <- as.numeric(args[2]) if(dist_thresh >= 1){ dist_thresh <- dist_thresh * .01 } -dist_thresh <- abs(max(lgfile$V5) - min(lgfile$V5)) * dist_thresh +.dist_thresh <- dist_thresh * 100 +dist_thresh_m <- round(abs(max(lgfile$V5) - min(lgfile$V5)) * dist_thresh, digits = 2) +dist_thresh_f <- round(abs(max(lgfile$V6) - min(lgfile$V6)) * dist_thresh, digits = 2) + # if the percent threshold is given as an integer, convert it to a decimal edge_length <- as.numeric(args[3]) if(edge_length >= 1){ edge_length <- edge_length * .01 } - +.edge_length <- edge_length * 100 n_markers <- length(lgfile$V1) front_edge <- round(n_markers * edge_length, digits = 0) @@ -60,8 +63,10 @@ for (j in 5:6){ # iterate over male (5) and female (6) # sort on column if (j == 5){ lgfile <- arrange(lgfile, V5) + dist_thresh <- dist_thresh_m } else { lgfile <- arrange(lgfile, V6) + dist_thresh <- dist_thresh_f } # trim beginning for(a in front_edge:2){ #first n% of total markers starting from the front edge, going out @@ -83,6 +88,9 @@ for (j in 5:6){ # iterate over male (5) and female (6) # create new table of markers passing QC cleaned_markers <- (lgfile %>% filter(Mpass == "PASS" & Fpass == "PASS"))[,1:6] +# re-scale cleaned markers to 0 by subtracting the minimum genetic position +cleaned_markers <- cleaned_markers %>% + mutate(V5 = V5 - min(V5), V6 = V6 - min(V6)) # isolate bad markers removed_markers <- (lgfile %>% filter(Mpass == "FAIL" | Fpass == "FAIL"))[,1:6] @@ -101,8 +109,8 @@ plot_male <- lgfile %>% arrange(V5) %>% geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + labs( title = "", - subtitle = paste0("Male markers trimmed: ", rm_male), - caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold= ", dist_thresh), + subtitle = paste0(rm_male, " male markers >", dist_thresh_m, "cM trimmed"), + caption = paste0(.edge_length, "% edge markers, ", .dist_thresh, "% cM"), x = "Marker Number", y = "Position (cM)", color = "Pass Filtering" @@ -115,7 +123,7 @@ plot_female <- lgfile %>% arrange(V6) %>% geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + labs( title = paste("Edge Cluster Trimming for LG:", lg), - subtitle = paste0("Female markers trimmed: ", rm_female), + subtitle = paste0(rm_female, " female markers >", dist_thresh_f, "cM trimmed"), caption = paste0("Markers failing both M+F: ", rm_both), x = "Marker Number", y = "Position (cM)", diff --git a/scripts/LA_summary.r b/scripts/LA_summary.r deleted file mode 100755 index 34ecea7..0000000 --- a/scripts/LA_summary.r +++ /dev/null @@ -1,68 +0,0 @@ -#! /usr/bin/env Rscript - -suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) - -args <- commandArgs(trailingOnly = TRUE) -# args[1] = mareydata file - -allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% - select(X3, X2, X5, X6) %>% - rename(lg = X3, Mb = X2, male = X5, female = X6) %>% - mutate(Mb = Mb/1000000) %>% - group_by(lg) %>% - mutate(marker = seq_along(Mb)) %>% - pivot_longer(c(male, female), names_to = "sex", values_to = "cM" ) - -allmaps %>% - ggplot(aes(x = Mb, y = cM, color = sex)) + - geom_point(size = 0.6) + - labs( - title = "Marker Positions in Anchored+Oriented Assembly", - subtitle = "The relative marker positions vs their position in the chromosome/LG", - x = "Physical Position (Mbp)", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) - ) + - facet_wrap(~lg, ncol = 4, scales = "free") - -outfile <- paste0(dirname(args[1]), "/LepAnchor.mareymaps.pdf") -savedims <- length(unique(allmaps$lg)) * 2.5 - -ggsave(outfile, width = 8.5, height = savedims/4, units = "in") - -allmaps %>% - ggplot(aes(x = marker, y = cM)) + - geom_point(size = 0.6, alpha = 0.5) + - labs( - title = "Relative Marker Positions within Linkge Groups", - subtitle = "The distance of sequential markers from each other in the linkage maps", - x = "Marker Number", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) - ) + - facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") - -outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") -ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) \ No newline at end of file diff --git a/scripts/LepWrapTrim.r b/scripts/LepWrapTrim.r index 8bddfe4..9d4a408 100755 --- a/scripts/LepWrapTrim.r +++ b/scripts/LepWrapTrim.r @@ -3,11 +3,10 @@ suppressMessages(if (!require("tidyverse")) install.packages("tidyverse")) suppressMessages(library("tidyverse")) args <- commandArgs(trailingOnly = TRUE) -#args <- c("~/ordered.1", 10, 15) - # args[1] is the OrderMarkers2 output file -# args[2] is the centimorgan cutoff distance +# args[2] is the centimorgan cutoff threshold # args[3] is the % of edge markers to scan +# args[4] is the output directory lgfile <- read.delim( args[1], @@ -15,7 +14,7 @@ lgfile <- read.delim( sep = "\t", comment.char="#" ) %>% - mutate(Mpass = TRUE, Fpass = TRUE) + mutate(Mpass = "PASS", Fpass = "PASS") ## setup output file names ## # split the filename by path @@ -25,55 +24,80 @@ filename <- filename[length(filename)] lg <- unlist(strsplit(filename, "\\."))[2] #========= output instantiation ========# -outfile_base <- paste("5_Trim", filename, sep = "/") -outfile_log_base <- paste("5_Trim", "logs", filename, sep = "/") -plotfile_base <- paste("5_Trim", "plots", filename, sep = "/") +dir.create(args[4], showWarnings = FALSE) +dir.create(paste0(args[4],"/plots"), showWarnings = FALSE) +dir.create(paste0(args[4],"/logs"), showWarnings = FALSE) +dir.create(paste0(args[4],"/QC_raw"), showWarnings = FALSE) +outfile_base <- paste(args[4], filename, sep = "/") +outfile_log_base <- paste(args[4], "logs", filename, sep = "/") +plotfile_base <- paste(args[4], "plots", filename, sep = "/") plotfile <- paste(plotfile_base, "trim.pdf", sep = ".") +rawfile_base <- paste(args[4], "QC_raw", filename, sep = "/") + + ##### Pruning the ends ##### -dist_thresh <- as.numeric(args[2]) # if the percent threshold is given as an interger, convert it to a decimal +dist_thresh <- as.numeric(args[2]) +if(dist_thresh >= 1){ + dist_thresh <- dist_thresh * .01 +} +dist_thresh_m <- abs(max(lgfile$V2) - min(lgfile$V2)) * dist_thresh +dist_thresh_f <- abs(max(lgfile$V3) - min(lgfile$V3)) * dist_thresh + edge_length <- as.numeric(args[3]) if(edge_length >= 1){ edge_length <- edge_length * .01 } +n_markers <- length(lgfile$V1) +front_edge <- round(n_markers * edge_length, digits = 0) +back_edge <- round(n_markers - front_edge, digits = 0) + for (j in 2:3){ # iterate over male (2) and female (3) + # sort on column + if (j == 2){ + lgfile <- arrange(lgfile, V2) + dist_thresh <- dist_thresh_m + } else { + lgfile <- arrange(lgfile, V3) + dist_thresh <- dist_thresh_f + } # trim beginning - n_markers <- length(lgfile$V1) * edge_length - for(a in 1:n_markers){ #first n% of total markers from the beginning - diff <- abs(lgfile[a+1,j]-lgfile[a,j]) # difference between two points + for(a in front_edge:2){ #first n% of total markers from the beginning + diff <- abs(lgfile[a,j]-lgfile[a-1,j]) # difference between two points if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[1:a, j+4] <- FALSE # mark that marker and all markers BEFORE it as FALSE + lgfile[(a-1):1, j+4] <- "FAIL" # mark that marker and all markers BEFORE it as FAIL + break() } } # trim end - filelen<-length(lgfile$V1) # get new file lengths for each time we remove NA's - for(z in filelen:(filelen-n_markers)){ #iterate n% total markers in starting from the end - diff <- abs(lgfile[z,j]-lgfile[z-1,j]) # difference between two points + for(z in back_edge:(n_markers-1)){ #last n% total markers starting from the back edge going out + diff <- abs(lgfile[z+1,j]-lgfile[z,j]) # difference between two points if( diff > dist_thresh ){ # is the difference between the two points > distance argument? - lgfile[filelen:z,j+4] <- FALSE # mark that marker and all markers AFTER it as FALSE + lgfile[(z+1):n_markers,j+4] <- "FAIL" # mark that marker and all markers AFTER it as FAIL + break() } } - - # create new table of markers passing QC - cleaned_markers <- lgfile %>% filter(Mpass == TRUE & Fpass == TRUE) + } ### +# create new table of markers passing QC +cleaned_markers <- lgfile %>% filter(Mpass == "PASS" & Fpass == "PASS") # isolate bad markers -removed_markers <- (lgfile %>% filter(Mpass == FALSE | Fpass == FALSE))$V1 +removed_markers <- (lgfile %>% filter(Mpass == "FAIL" | Fpass == "FAIL"))$V1 # get simple counts -rm_male <- (lgfile %>% filter(Mpass == FALSE & Fpass == TRUE))$V1 %>% length() -rm_female <- (lgfile %>% filter(Fpass == FALSE & Mpass == TRUE))$V1 %>% length() -rm_both <- (lgfile %>% filter(Mpass == FALSE & Fpass == FALSE))$V1 %>% length() +rm_male <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "PASS"))$V1 %>% length() +rm_female <- (lgfile %>% filter(Fpass == "FAIL" & Mpass == "PASS"))$V1 %>% length() +rm_both <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "FAIL"))$V1 %>% length() QA <- function(x,y){ - if(x & y) return("pass") - if(!x & !y) return("both") - if(!x & y) return("male") - if(x & !y) return("female") + if(x == "PASS" & y == "PASS") return("pass") + if(x != "PASS" & y != "PASS") return("both") + if(x == "FAIL" & y == "PASS") return("male") + if(x == "PASS" & y == "FAIL") return("female") } QAfix <- function (x,y){ @@ -86,9 +110,11 @@ QAfix <- function (x,y){ } } - +pdf(NULL) + plot_df <- lgfile %>% rename(Male = V2, Female = V3) %>% + arrange(Male) %>% mutate(Marker = seq_along(Mpass)) %>% rowwise() %>% mutate(Fail = QA(Mpass, Fpass)) %>% @@ -97,11 +123,6 @@ plot_df <- lgfile %>% rowwise() %>% mutate(Fail = QAfix(Fail, Sex)) -# locations for vertical lines -n_markers <- length(lgfile$V1) -front_edge <- round(n_markers * edge_length, digits = 0) -back_edge <- round(n_markers - (front_edge), digits = 0) - plot_df %>% ggplot(aes(Marker, Position)) + geom_point(aes(color = Fail), shape = 19) + @@ -111,7 +132,7 @@ plot_df %>% labs( title = paste("Edge Cluster Trimming Results for Linkage Group", lg), subtitle = paste0("Markers Failing QC: ", rm_female, " female, ", rm_male, " male, ", rm_both, " both (", rm_female+rm_male+rm_both, " total)" ), - caption = paste0("edge length: ", edge_length*100, "% of markers, cM threshold: ", dist_thresh), + caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold: ", dist_thresh_m, "(M) & ", dist_thresh_f, "(F)"), x = "Marker Number", y = "Position (cM)", color = "QA Status" @@ -121,7 +142,6 @@ plot_df %>% suppressMessages(ggsave(plotfile, width = 7, height = 4, units = "in")) # outputting filtered files -num_rm <- length(removed_markers) writeLines(readLines(args[1], n=3), con = paste(outfile_base, "trimmed", sep = ".")) write.table( @@ -134,6 +154,15 @@ write.table( append=TRUE ) +write.table( + lgfile, + file = paste(rawfile_base, "filtered.raw", sep = "."), + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, +) + write.table( removed_markers, file=paste(outfile_log_base, "removed", sep = "."), diff --git a/scripts/MapSummary.r b/scripts/MapSummary.r index 183f41e..0df8b32 100755 --- a/scripts/MapSummary.r +++ b/scripts/MapSummary.r @@ -25,7 +25,7 @@ library(stringr) targetdir <- args[1] # pattern match to find the map files -flist <- list.files(targetdir, full.names = TRUE, pattern = "^map.") +flist <- list.files(targetdir, full.names = TRUE, pattern = "^LOD.") # natural language sort flist <- str_sort(flist, numeric = TRUE) @@ -54,8 +54,8 @@ summtable[is.na(summtable)] <- 0 summtable <- summtable[order(summtable$LG),] # generate output filenames -out_tmp <-paste0(targetdir, "/all.map.summary") -out_file <- paste0(targetdir, "/all.maps.summary") +out_tmp <-paste0(targetdir, "/.all.LOD.summary") +out_file <- paste0(targetdir, "/all.LOD.summary") write.table(summtable, file = out_tmp, quote = FALSE, row.names = FALSE, col.names = TRUE) # use GNU column command to make the table fixed-width and remove the tmp file diff --git a/scripts/MarkerPlot.r b/scripts/MarkerPlot.r index 66cd2a4..a55d67e 100755 --- a/scripts/MarkerPlot.r +++ b/scripts/MarkerPlot.r @@ -4,6 +4,8 @@ suppressMessages(library(tidyverse, quietly = TRUE, warn.conflicts = FALSE)) args <- commandArgs(trailingOnly = TRUE) +pdf(NULL) + lgfile <- read.delim( args[1], header = FALSE, diff --git a/scripts/RecombinationSummary.r b/scripts/RecombinationSummary.r index cf515b8..e792bf4 100755 --- a/scripts/RecombinationSummary.r +++ b/scripts/RecombinationSummary.r @@ -1,6 +1,6 @@ #! /usr/bin/env Rscript # This script will parse all the recombination logs of LepWrap -suppressMessages(if (!require("stringr")) install.packages("stringr")) +suppressMessages(library(tidyverse)) suppressMessages(library("stringr")) ## setup outfile # format trailing arguments for script @@ -25,14 +25,16 @@ for (i in files[2:length(files)]){ lg <- lg + 1 } +.recomb_df <- recomb_df %>% select(-family, -sample) +cnames <- names(.recomb_df) +cnames <- c(c("family", "sample", "mean", "max"), cnames) +.recomb_df <- .recomb_df %>% + mutate(mean = round(rowMeans(., na.rm = TRUE), digits = 2), max = pmap(., max, na.rm = TRUE)) + +recomb_df$mean <- .recomb_df$mean +recomb_df$max <- .recomb_df$max +recomb_df <- recomb_df[, cnames] + outfile <- paste(args[1], "recombination.summary", sep = "/") -write.table( - recomb_df, - file=outfile, - append=FALSE, - sep = "\t", - quote = FALSE, - row.names = FALSE, - col.names = TRUE -) \ No newline at end of file +print(recomb_df, row.names = FALSE, width = 10000) \ No newline at end of file diff --git a/scripts/TrimSummaryPlot.r b/scripts/TrimSummaryPlot.r index 8b4cd60..72573b9 100755 --- a/scripts/TrimSummaryPlot.r +++ b/scripts/TrimSummaryPlot.r @@ -7,6 +7,7 @@ args <- commandArgs(trailingOnly = TRUE) # read in the summary table and pop out the LG number tbl <- read_table(args[1]) %>% suppressMessages() +pdf(NULL) tbl %>% ggplot(aes(lg, n_removed)) + diff --git a/scripts/iterate_js2all.sh b/scripts/iterate_js2all.sh index bbf69f9..8a7642b 100755 --- a/scripts/iterate_js2all.sh +++ b/scripts/iterate_js2all.sh @@ -33,10 +33,10 @@ fi for i in $(seq $LODMIN $LODMAX); do zcat 2_Filtering/data.filtered.lepmap3.gz | java -cp software/LepMap3 JoinSingles2All map=$TARGETMAP data=- lodLimit=$i lodDifference=4 iterate=1 distortionLod=1 numThreads=10 informativeMask=$INFMASK > JoinSingles2All_iter/logs/map.$i.$4.js2all - cut -f1 JoinSingles2All_iter/logs/map.$i.$4.js2all > JoinSingles2All_iter/map.$i.$4.js2all + cut -f1 JoinSingles2All_iter/logs/map.$i.$4.js2all > JoinSingles2All_iter/LOD.$i.$4.js2all done -echo "The generated maps are named map.LODlim.LODdiff.js2all" +echo "The generated maps are named LOD.LODlim.LODdiff.js2all" # generate a summary of the results scripts/MapSummary.r JoinSingles2All_iter diff --git a/software/LepMap3/Error.class b/software/LepMap3/Error.class index 1d4bdbd..4a3c416 100644 Binary files a/software/LepMap3/Error.class and b/software/LepMap3/Error.class differ diff --git a/software/LepMap3/Filtering2.class b/software/LepMap3/Filtering2.class index a3f1222..aaa172b 100644 Binary files a/software/LepMap3/Filtering2.class and b/software/LepMap3/Filtering2.class differ diff --git a/software/LepMap3/IBD.class b/software/LepMap3/IBD.class index a7a21bc..c7cfb65 100644 Binary files a/software/LepMap3/IBD.class and b/software/LepMap3/IBD.class differ diff --git a/software/LepMap3/LMPlot.class b/software/LepMap3/LMPlot.class index 5ddc750..0702bbf 100644 Binary files a/software/LepMap3/LMPlot.class and b/software/LepMap3/LMPlot.class differ diff --git a/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator$UpdateScoreThread.class b/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator$UpdateScoreThread.class index bc43548..b85abbe 100644 Binary files a/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator$UpdateScoreThread.class and b/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator$UpdateScoreThread.class differ diff --git a/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator.class b/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator.class index 0a17a2f..dd09c5e 100644 Binary files a/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator.class and b/software/LepMap3/OrderFinder$MergeClass$ParallelScoreCalculator.class differ diff --git a/software/LepMap3/OrderFinder$MergeClass.class b/software/LepMap3/OrderFinder$MergeClass.class index 297419c..e28748e 100644 Binary files a/software/LepMap3/OrderFinder$MergeClass.class and b/software/LepMap3/OrderFinder$MergeClass.class differ diff --git a/software/LepMap3/OrderFinder$PhysicalFamily.class b/software/LepMap3/OrderFinder$PhysicalFamily.class index 208fc0d..0e56c8e 100644 Binary files a/software/LepMap3/OrderFinder$PhysicalFamily.class and b/software/LepMap3/OrderFinder$PhysicalFamily.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class index ab4efaa..939ca88 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class index c10314b..7e200af 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale1.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class index d8cf4cc..36c807e 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class and b/software/LepMap3/OrderFinder$SingleFamily$RecombinationScale2.class differ diff --git a/software/LepMap3/OrderFinder$SingleFamily.class b/software/LepMap3/OrderFinder$SingleFamily.class index c5be9b6..5344fea 100644 Binary files a/software/LepMap3/OrderFinder$SingleFamily.class and b/software/LepMap3/OrderFinder$SingleFamily.class differ diff --git a/software/LepMap3/OrderFinder$polishThread.class b/software/LepMap3/OrderFinder$polishThread.class index 8b2cbeb..1d4fec3 100644 Binary files a/software/LepMap3/OrderFinder$polishThread.class and b/software/LepMap3/OrderFinder$polishThread.class differ diff --git a/software/LepMap3/OrderFinder.class b/software/LepMap3/OrderFinder.class index eb56744..620b72c 100644 Binary files a/software/LepMap3/OrderFinder.class and b/software/LepMap3/OrderFinder.class differ diff --git a/software/LepMap3/OrderMarkers2.class b/software/LepMap3/OrderMarkers2.class index 4fd828c..1a6f8fa 100644 Binary files a/software/LepMap3/OrderMarkers2.class and b/software/LepMap3/OrderMarkers2.class differ diff --git a/software/LepMap3/ParameterParser.class b/software/LepMap3/ParameterParser.class index f111df0..8924c5c 100644 Binary files a/software/LepMap3/ParameterParser.class and b/software/LepMap3/ParameterParser.class differ diff --git a/software/LepMap3/Separate2$SeparateIdenticalThread.class b/software/LepMap3/Separate2$SeparateIdenticalThread.class index 3b2ef0c..f3be6ae 100644 Binary files a/software/LepMap3/Separate2$SeparateIdenticalThread.class and b/software/LepMap3/Separate2$SeparateIdenticalThread.class differ diff --git a/software/LepMap3/Separate2$SeparateThread.class b/software/LepMap3/Separate2$SeparateThread.class index d497a0b..283a9d5 100644 Binary files a/software/LepMap3/Separate2$SeparateThread.class and b/software/LepMap3/Separate2$SeparateThread.class differ diff --git a/software/LepMap3/Separate2.class b/software/LepMap3/Separate2.class index eec615d..1c9670c 100644 Binary files a/software/LepMap3/Separate2.class and b/software/LepMap3/Separate2.class differ diff --git a/software/LepMap3/SeparateIdenticals.class b/software/LepMap3/SeparateIdenticals.class index deec4a6..718ed66 100644 Binary files a/software/LepMap3/SeparateIdenticals.class and b/software/LepMap3/SeparateIdenticals.class differ