Snakefile_phasing

#kate:syntax python;
import gzip

# Grob habe ich nun alle Varianten (auch Indels) mit Phasing genommen, die
# Erfolgreich auf hg19 geliftet wurden
# In hg19 das gleiche Referenz-Allele wie in hg38 haben (das wechselt ganz 
# selten)
# Deren Genotypen für EGYPTREF mind. 1 Alt-Allel haben (wenn es 0/0 war, hab ich
# es weggelassen, um das VCF File möglichst klein zu machen; ändert aber nichts
# am Ergebnis)
# Deren Genotypen für EGYPTREF gecalled wurden
# Die „known“ sind, d.h. eine rsID besitzen
# 
# Die Ergebnisse sind nun in der Datei 
# /data/lied_egypt_genome/output_wgs/phasing/sanger/joined.txt zusammengefasst.
# 
# Spalte: chr:pos:ref:alt in hg38 (mit „chr“ vor der Zahl UCSC-Schreibweise)
# Spalte: chr:pos:ref:alt in hg19 (ohne „chr“ vor der Zahl Ensembl-Schreibweise)
# Spalte: rsid aus der Variant-Calling-Datei
# Spalte: Sample-Name
# Unphased Genotypen
# Phased Genotypen
# 
# Wahrscheinlich ist es am einfachsten, wenn du aus deinem gephasten VCF nun 
# auch eine TSV-Datei erzeugst (mit „bcftools query“ geht das relativ einfach) 
# und diese dann mit meiner Datei joinst.
#  
# Als Phasing-Programm habe ich EAGLE mit dem HRC-Panel verwendet.

# Getting HRC reference panel information (Readme and sites)
rule get_hrc_readme:
    output: "phasing_comparison/HRC.r1-1/README"
    shell: "wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/README"

rule get_hrc_sites:
    output: "phasing_comparison/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.vcf.gz",
            "phasing_comparison/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz"
    shell: "wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.vcf.gz; " + \
           "wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz; "

rule link_10x_phased_variants:
    input: "longranger_phasing/EGYPTREF/outs/phased_variants.vcf.gz"
    output: "phasing_comparison/EGYPTREF_10x_phased_orig.vcf.gz"
    shell: "ln -s ../{input} {output}"

# Normalize the 10x-phased variants, i.e. split multiallelic sites into 
# multiple rows
#  -m, --multiallelics -|+[snps|indels|both|any]
# split multiallelic sites into biallelic records (-) or join biallelic sites 
# into multiallelic records (+). An optional type string can follow which 
# controls variant types which should be split or merged together: If only SNP 
# records should be split or merged, specify snps; if both SNPs and indels 
# should be merged separately into two records, specify both; if SNPs and indels
#  should be merged into a single record, specify any. 
# This and the next bcftools command are the same that Matthias applied before 
# uploading the variant data to the Sanger imputation server
rule split_multiallelic_sites:
    input: "phasing_comparison/EGYPTREF_10x_phased_orig.vcf.gz"
    output: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz"
    shell: "bcftools norm -m-both -Ov {input} | bgzip -c > {output}; "+ \
           "tabix {output}"

# Normalize the 10x-phased variants, i.e. split multiallelic sites into 
# multiple rows
# Left-align and normalize indels, check if REF alleles match the reference, 
# split multiallelic sites into multiple rows; recover multiallelics from 
# multiple rows. Left-alignment and normalization will only be applied if the 
# --fasta-ref option is supplied.
# Not needed???
#rule normalize_vcf:
#    input: "phasing_comparison/EGYPTREF_10x_phased_nomultiallelic.vcf.gz"
#    output: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz"
#    shell: "bcftools norm -o {output} -Oz {input} "

rule link_sanger_phased_variants:
    input: "/data/lied_egypt_genome/output_wgs/phasing/sanger/joined.txt"
    output: "phasing_comparison/EGYPTREF_sanger_phased.txt"
    shell: "ln -s {input} {output}"

rule txt_to_vcf:
    input: "phasing_comparison/EGYPTREF_sanger_phased.txt"
    output: "phasing_comparison/EGYPTREF_sanger_phased.vcf"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            f_out.write("##fileformat=VCFv4.2\n")
            f_out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tEGYPTREF\n")
            for line in f_in:
                # Skip header
                if line[:20] == "hg38_chr_pos_ref_alt":
                    continue
                s = line.strip("\n").split(" ")
                # Get hg38 chromosomes and coordinates
                chrom,pos,ref,alt = s[0].split(":")
                rsid = s[2]
                unphased_gt = s[4]
                phased_gt = s[5]
                unphased_alleles = unphased_gt.split("/")
                phased_alleles = phased_gt.split("|")
                # Make sure phased alleles match unphased alleles
                assert(((unphased_alleles[0]==phased_alleles[0]) and \
                        (unphased_alleles[1]==phased_alleles[1])) or \
                        ((unphased_alleles[0]==phased_alleles[1]) and \
                        (unphased_alleles[1]==phased_alleles[0])))
                f_out.write("\t".join([chrom,pos,rsid,ref,alt,'.','PASS','.', \
                                       'GT',phased_gt])+"\n")

rule compress_sanger_phased:
    input: "phasing_comparison/EGYPTREF_sanger_phased.vcf"
    output: "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
    shell: "cat {input} | vcf-sort | bgzip -c > {output}; "+ \
           "tabix {output}"

# --diff-site
# Outputs the sites that are common / unique to each file. The output file has 
# the suffix ".diff.sites_in_files".
rule compare_phasings_sites_in_file:
    input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
    output: "phasing_comparison/EGYPTREF_{chrom}.diff.sites_in_files"
    params: out_base=lambda wildcards, output: output[0][:-20]
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--gzdiff {input[1]} " + \
                    "--out {params.out_base} " + \
                    "--chr {wildcards.chrom} " + \
                    "--diff-site "

# Deduce from diff.sites_in_file file the bi-alellic sites present in both 
# imputations
rule get_sites_for_discordance:
    input: "phasing_comparison/EGYPTREF_{chrom}.diff.sites_in_files"
    output: "phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
    shell: "cat {input} | grep -v ',' | grep -P '\tB\t' | cut -f 1,2 > {output}"

# --diff-site-discordance
# This option calculates discordance on a site by site basis. The resulting 
# output file has the suffix ".diff.sites".
rule compare_phasings_site:
    input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
    output: "phasing_comparison/EGYPTREF_{chrom}.diff.sites"
    params: out_base=lambda wildcards, output: output[0][:-11]
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--gzdiff {input[1]} " + \
                    "--out {params.out_base} " + \
                    "--chr {wildcards.chrom} " + \
                    "--diff-site-discordance "

# --diff-discordance-matrix
# This option calculates a discordance matrix. This option only works with 
# bi-allelic loci with matching alleles that are present in both files. The 
# resulting output file has the suffix ".diff.discordance.matrix".
rule compare_phasings_discordance_matrix:
    input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
    output: "phasing_comparison/EGYPTREF_{chrom}.diff.discordance_matrix"
    params: out_base=lambda wildcards, output: output[0][:-24]
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--gzdiff {input[1]} " + \
                    "--out {params.out_base} " + \
                    "--positions {input[2]} " + \
                    "--diff-discordance-matrix "

# Get shared sites from 10x phasing
rule vcf_10x_files_sites_both:
    input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
    output: "phasing_comparison/EGYPTREF_10x_phased_{chrom}.vcf.gz"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--stdout " + \
                    "--recode " + \
                    "--recode-INFO-all " + \
                    "--remove-indels " + \
                    "--non-ref-ac-any 1 " + \
                    "--positions {input[1]} | " + \
                    "bgzip -c > {output}; "+ \
           "tabix {output}"

# Get shared sites from sanger phasing
rule vcf_sanger_files_site_both:
    input: "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz",
           "phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
    output: "phasing_comparison/EGYPTREF_sanger_phased_{chrom}.vcf.gz"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--stdout " + \
                    "--recode " + \
                    "--recode-INFO-all " + \
                    "--positions {input[1]} | "
                    "bgzip -c > {output}; "+ \
           "tabix {output}"

# Merge Sanger_phasing to 10x phasing VCF file
rule compare_phasings:
    input: "phasing_comparison/EGYPTREF_10x_phased_{chrom}.vcf.gz",
           "phasing_comparison/EGYPTREF_sanger_phased_{chrom}.vcf.gz"
    output: "phasing_comparison/EGYPTREF_{chrom}_concordant.txt",
            "phasing_comparison/EGYPTREF_{chrom}_discordant.txt",
            "phasing_comparison/EGYPTREF_{chrom}_nonmatching.txt"
    run: 
        vars = {}
        lines = {}
        with gzip.open(input[0],"r") as f_10x:
            for line in f_10x:
                line = line.decode()
                if line[0] == '#':
                    continue
                s = line.strip("\n").split("\t")
                chrom = s[0]
                pos = s[1]
                ref = s[3]
                alt = s[4]
                gt = s[9].split(':')[0]
                vars["\t".join([chrom,pos,ref,alt])] = gt
                lines["\t".join([chrom,pos,ref,alt])] = line
        with gzip.open(input[1],"r") as f_sanger, \
             open(output[0],"w") as f_concordant, \
             open(output[1],"w") as f_discordant:
            for line in f_sanger:
                line = line.decode()
                if line[0] == '#':
                    continue
                s = line.strip("\n").split("\t")
                chrom = s[0]
                pos = s[1]
                ref = s[3]
                alt = s[4]
                gt = s[9].split(':')[0]
                gt_10x = vars["\t".join([chrom,pos,ref,alt])]
                line_10x = lines["\t".join([chrom,pos,ref,alt])]
                print(gt)
                print(gt_10x)
                if gt == gt_10x:
                    f_concordant.write(line_10x)
                else:
                    f_discordant.write(line_10x.strip("\n")+"\t"+line)
                del lines["\t".join([chrom,pos,ref,alt])]
        with open(output[2],"w") as f_nonmatching:
            for line in list(lines):
                f_nonmatching.write(line)
    

rule compare_phasings_all:
    input: expand("phasing_comparison/EGYPTREF_{chrom}.{diff_type}", \
                   diff_type=["diff.sites_in_files","diff.sites", \
                               "diff.discordance_matrix"], \
                   chrom=["chr"+str(x) for x in range(1,23)]), \
           expand("phasing_comparison/EGYPTREF_{chrom}_concordant.txt", \
                   chrom=["chr"+str(x) for x in range(1,23)])