Skip to content

Commit

Permalink
Continued with assembly polishing and analyzing intermediate assemblies.
Browse files Browse the repository at this point in the history
  • Loading branch information
iwohlers committed Mar 12, 2019
1 parent 96b712f commit 8302646
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 14 deletions.
31 changes: 28 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ EGYPTREFWTDBG2_SCAFFOLDS = ["ctg"+str(x) for x in range(1,3338)]
# Note: the six EGYPTREFWTDBG2_SCAFFOLFDS for which no repeats have been
# detected are ctg1293, ctg1878, ctg2618, ctg3116, ctg2493, ctg3293

LONGEST_EGYPTREFWTDBG2_SCAFFOLDS = ["ctg"+str(x) for x in range(1,31)]

################################################################################
######### Preprocessing of the first Novogene assembly called EGYPTREF #########
################################################################################
Expand Down Expand Up @@ -224,7 +226,7 @@ rule compute_assembly_stats:
rule compute_content_and_assembly_numbers:
input: expand( \
"results/{assembly}/{task}_Homo_sapiens.{assembly}.dna.primary_assembly.txt", \
assembly = ["EGYPTREFWTDBG2","GRCh38","EGYPTREF","AK1","YORUBA","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"], \
assembly = ["EGYPTREFWTDBG2V2","EGYPTREFWTDBG2","GRCh38","EGYPTREF","AK1","YORUBA","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"], \
task = ["scaffold_names","num_bases","num_all","assembly_stats"])


Expand Down Expand Up @@ -458,6 +460,27 @@ rule write_scaffold_fastas_egyptrefwtdbg2:
SeqIO.write(record, f_out, "fasta")
i += 1

rule wtdbg2_secondary_assembly:
input: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.2nd.fa"
output: "seq_EGYPTREFWTDBG2V2/Homo_sapiens.EGYPTREFWTDBG2V2.dna.primary_assembly.fa"
shell: "cp {input} {output}"


# Writing the scaffolds of the Egyptian genome to separate fasta files because
# processing the whole assembly often takes too much time
rule write_scaffold_fastas_egyptrefwtdbg2v2:
input: "seq_EGYPTREFWTDBG2V2/Homo_sapiens.EGYPTREFWTDBG2V2.dna.primary_assembly.fa"
output: expand("seq_EGYPTREFWTDBG2V2/Homo_sapiens.EGYPTREFWTDBG2V2.dna.{scaffold}.fa", \
scaffold=EGYPTREFWTDBG2_SCAFFOLDS)
run:
with open(input[0], "r") as f_in:
i = 0
for record in SeqIO.parse(f_in,"fasta"):
with open(output[i], "w") as f_out:
SeqIO.write(record, f_out, "fasta")
i += 1



################################################################################
######################### Repeat masking with repeatmasker #####################
Expand Down Expand Up @@ -594,7 +617,7 @@ rule repeatmasker_summary_table_egyptrefwtdbg2:
# one line for GRCh38
rule comparison_repeatmasker:
input: expand("repeatmasked_{assembly}/summary.txt", \
assembly=["EGYPTREFWTDBG2","EGYPTREF","EGYPTREFV2","CEGYPTREF","CEGYPTREFV2","AK1","YORUBA","GRCh38"])
assembly=["EGYPTREFWTDBG2V2","EGYPTREFWTDBG2","EGYPTREF","EGYPTREFV2","CEGYPTREF","CEGYPTREFV2","AK1","YORUBA","GRCh38"])
output: "results/repeatmasker_comparison.txt"
script: "scripts/repeatmasker_comparison.py"

Expand Down Expand Up @@ -682,7 +705,9 @@ rule dotplots_scaffold_vs_chromosomes_all:
expand("align_lastz_GRCh38_vs_EGYPTREFV2/dotplots/{scaffold}.pdf", \
scaffold=LONGEST_EGYPTREFV2_SCAFFOLDS),
expand("align_lastz_GRCh38_vs_CEGYPTREFV2/dotplots/{contig}.pdf", \
contig=CEGYPTV2_CONTIGS[:50])
contig=CEGYPTV2_CONTIGS[:50]),
expand("align_lastz_GRCh38_vs_EGYPTREFWTDBG2/dotplots/{scaffold}.pdf", \
scaffold=LONGEST_EGYPTREFWTDBG2_SCAFFOLDS)

# All versus all comparisons of reference and Egyptian genome
rule align_all_vs_all:
Expand Down
75 changes: 73 additions & 2 deletions Snakefile_assembly
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,68 @@ rule assembl_with_wtdbg2:
# -V Print version information and then exit
rule consensus_with_wtdbg2:
input: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.lay.gz"
output: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.lay.fa"
output: protected("assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.lay.fa")
conda: "envs/wtdbg.yaml"
shell: "wtpoa-cns -i {input} " + \
" -t 31 " + \
" -o {output}"

rule polish_with_pb_reads:
input: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.lay.fa",
"pacbio/pb_EGYPTREF.fa"
output: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.map.bam",
"assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.map.srt.bam",
protected("assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.2nd.fa")
params: prefix="assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.map.srt"
conda: "envs/polish_wtdbg.yaml"
shell: "minimap2 -t 31 -x map-pb -a {input[0]} {input[1]} | " + \
"samtools view -Sb - > {output[0]}; " + \
"samtools sort {output[0]} -o {output[1]}; " + \
"samtools view {output[1]} | " + \
"wtpoa-cns -t 31 -d {input[0]} -i - -fo {output[2]}; "

# Mapping the Illumina PE data to the scaffolds
# -a STR: Algorithm for constructing BWT index. Chosen option:
# bwtsw: Algorithm implemented in BWT-SW. This method works with the
# whole human genome.
# -p STR: Prefix of the output database [same as db filename]
rule bwa_index_for_polishing:
input: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.2nd.fa"
output: "assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.amb",
"assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.ann",
"assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.bwt",
"assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.pac",
"assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.sa"
conda: "envs/bwa.yaml"
shell: "bwa index -a bwtsw " + \
"-p assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.2nd " + \
"{input}"

rule bwa_mem_for_polishing:
input: index = "assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd.sa",
fastq_r1 = "data/02.DES/{lib}_1.fq.gz",
fastq_r2 = "data/02.DES/{lib}_2.fq.gz"
output: "assembly_wtdbg2/bwa/{lib}.bam"
conda: "envs/bwa.yaml"
shell: "bwa mem -t 30 " + \
"assembly_wtdbg2/bwa/EGYPTREF_wtdbg2.ctg.2nd "+\
"{input.fastq_r1} {input.fastq_r2} " + \
" | samtools sort -@30 -o {output} -"

rule merge_bam_for_polishing:
input: expand("assembly_wtdbg2/bwa/{lib}.bam", lib=ILLUMINA_LIBS)
output: "assembly_wtdbg2/bwa/sr.srt.bam"
conda: "envs/polish_wtdbg.yaml"
shell: "samtools merge {output} {input}"

rule polish_with_short_reads:
input: "assembly_wtdbg2/bwa/sr.srt.bam",
"assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.2nd.fa"
output: "assembly_wtdbg2/EGYPTREF_wtdbg2.ctg.3rd.fa"
conda: "envs/polish_wtdbg.yaml"
shell: "samtools view {input[0]} | " + \
"wtpoa-cns -t 30 -x sam-sr -d {input[1]} -i - -fo {output}"


################################################################################
##### Correcting / improving / scaffolding the assembly using 10X data #########
Expand Down Expand Up @@ -200,4 +256,19 @@ rule merge_bam_files_for_tigmint:
lib = [x.split("_")[0] for x in ILLUMINA_10X_LIBS])
output: "tigmint/draft.reads.sortbx.bam"
conda: "envs/tigmint.yaml"
shell: "samtools merge -tBX {output[0]} {input}"
conda: "envs/tigmint.yaml"
shell: "samtools merge -tBX {output[0]} {input}"

rule run_tigmint_molecule_single_10xlib:
input: "tigmint/{lib}.reads.sortbx.bam"
output: "tigmint/{lib}.reads.molecule.bed"
conda: "envs/tigmint.yaml"
shell: "tigmint-molecule {input} | " + \
"sort -k1,1 -k2,2n -k3,3n > {output}"

rule run_tigmint_cut_single_10xlib:
input: "tigmint/EGYPTREFWTDBG2.fa",
"tigmint/{lib}.reads.molecule.bed"
output: "tigmint/{lib}.tigmint.fa"
conda: "envs/tigmint.yaml"
shell: "tigmint-cut -p16 -o {output} {input[0]} {input[1]}"
15 changes: 8 additions & 7 deletions Snakefile_genome_alignment
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ rule align_assemblies_with_nucmer:
conda: "envs/mummer.yaml"
shell: "nucmer " + \
"--mum " + \
"--threads=16 " + \
"--threads=32 " + \
"-p align_nucmer_{wildcards.a1}_vs_{wildcards.a2}/{wildcards.a1}_vs_{wildcards.a2} " + \
"{input[0]} {input[1]}"

Expand Down Expand Up @@ -219,13 +219,14 @@ rule mummerplot:
"{input[0]}; " + \
"gnuplot {output[0]}; "

# aligntype=["align","alignrepeatmasked"], \
# a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
# ["EGYPTREFWTDBG2","YORUBA","AK1","GRCh38"], \
rule mummerplot_all:
input: expand("{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.ps", \
aligntype=["align","alignrepeatmasked"], \
aligntype=["align"], \
a1="GRCh38", \
a2="EGYPTREFWTDBG2", \
a2="EGYPTREFWTDBG2V2", \
filter=["nofilter","1to1"])

rule mummerplot_chromosomewise:
Expand All @@ -251,14 +252,14 @@ rule mummerplot_chromosomewise:
"gnuplot {output[0]}; "

ASSEMBLIES = ["CEGYPTREFV2"]
CHROMOSOMES = [str(x) for x in range(23)]+["X","Y","MT"]
#"YORUBA", "EGYPTREFV2"
CHROMOSOMES = [str(x) for x in range(1,23)]+["X","Y","MT"]
# a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
# ["EGYPTREFWTDBG2","YORUBA","AK1","GRCh38"], \
rule mummerplot_chromosomewise_all:
input: expand("{aligntype}_nucmer_{a1}_vs_{a2}/chrom_{a1}_vs_{a2}_{filter}_{chrom}.ps", \
aligntype=["align","alignrepeatmasked"], \
a1="GRCh38", \
a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
["EGYPTREFWTDBG2","YORUBA","AK1","GRCh38"], \
a2=["EGYPTREFWTDBG2"], \
filter=["nofilter","1to1"], \
chrom=CHROMOSOMES)

Expand Down
7 changes: 7 additions & 0 deletions envs/polish_wtdbg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
channels:
- bioconda
dependencies:
- wtdbg == 2.3-0
- minimap2
- samtools
- bwa
21 changes: 20 additions & 1 deletion global_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,23 @@
"NDHX00201-AK657_L5",
"NDHX00201-AK657_L6",
"NDHX00201-AK657_L7"
]
]


################################################################################
################ Illumina short read related variables #########################
################################################################################

# The Illumina library sample names
ILLUMINA_SAMPLES = ["NDES00177","NDES00178","NDES00179","NDES00180","NDES00181"]
ILLUMINA_SAMPLES_TO_LANES = {
"NDES00177": [4,5,6,7],
"NDES00178": [1,4,5,6,7],
"NDES00179": [4,5,6,7],
"NDES00180": [1,4,5,6,7],
"NDES00181": [4,5,6,7]
}
ILLUMINA_LIBS = []
for sample in ILLUMINA_SAMPLES:
ILLUMINA_LIBS += [sample+"_L"+str(x) for x in \
ILLUMINA_SAMPLES_TO_LANES[sample]]
4 changes: 3 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ fi
source activate workflow_lied_egypt_genome

echo "RUNNING SNAKEMAKE WORKFLOW..."
snakemake --reason --rerun-incomplete -k -j 1 --resources io=4 --use-conda --jobname "{jobid}.{rulename}.sh" --cluster "sbatch --mem 16G --partition=shortterm,longterm --time 3-00:00:00 -c 4 -o log/%j.{rule}.log" --printshellcmds results/repeatmasker_comparison.txt #compute_content_and_assembly_numbers comparison_repeatmasker dotplots_scaffold_vs_chromosomes_all

snakemake -n -p -k -s Snakefile_assembly --use-conda polish_with_short_reads
#snakemake -n --reason --rerun-incomplete -k -j 8 --resources io=4 --use-conda --jobname "{jobid}.{rulename}.sh" --cluster "sbatch --mem 90G --partition=shortterm,longterm --time 3-00:00:00 -c 16 -o log/%j.{rule}.log" --printshellcmds # dotplots_scaffold_vs_chromosomes_all results/repeatmasker_comparison.txt compute_content_and_assembly_numbers comparison_repeatmasker dotplots_scaffold_vs_chromosomes_all

source deactivate
conda list -n workflow_lied_egypt_genome --export > environment_versions.yaml

0 comments on commit 8302646

Please sign in to comment.