Snakefile_gwas

# kate: syntax python;
# Run rule list_diseases first!

import gzip
import os.path
from decimal import Decimal
import math


# Because some files are generated, this workflow has to be run in various steps
# one after another. Reproduce results by running in the specified order:
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p gwas/results/custom_disease_ids.txt
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p gwas/results/num_recurrentloci_per_disease.txt
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p gwas/results/num_recurrentlocivcf_per_disease.txt
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p gwas/results/num_recurrentlocild_per_disease.txt
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p gwas/tag_snps/tag_snppos_egyptians.txt
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p all
# snakemake --rerun-incomplete -j 10 -k --use-conda -s Snakefile_gwas -p egyptian_sv_in_proximity_all


# Default target: requests the summary tables, the annotated loci table and
# the proxy SNP table and figure listed below.
rule all:
    input:
        "gwas/results/num_diseases.txt",
        expand(
            "gwas/results/num_{analysis}_per_disease.txt",
            analysis=[
                "associations",
                "europeanassociations",
                "recurrentloci",
                "loci",
            ],
        ),
        "gwas/results/combined_loci_annotated.txt",
        "gwas/tag_snps/figures/proxy_number.pdf",
        "gwas/tag_snps/num_proxies.txt"


########### Preprocessing GWAS catalog data ###########

# Download the GWAS catalog association table ("alternative" download) into
# gwas/results; the stderr output of wget is discarded.
rule get_gwas_catalog:
    output:
        "gwas/results/alternative"
    shell:
        "wget --directory-prefix=gwas/results "
        "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative "
        "2> /dev/null"

# Download the GWAS catalog ancestry table ("ancestry" download) into
# gwas/results; the stderr output of wget is discarded.
rule get_gwas_catalog_ancestry:
    output:
        "gwas/results/ancestry"
    shell:
        "wget --directory-prefix=gwas/results "
        "https://www.ebi.ac.uk/gwas/api/search/downloads/ancestry "
        "2> /dev/null"

# Build the table of distinct traits of the catalog. The custom ID of a trait
# consists of the last path component of every comma-separated URI in column
# 36 (written out as MAPPED_TRAIT_URI), joined by "-". Only the first row seen
# for each custom ID is written.
rule list_diseases:
    input:
        "gwas/results/alternative"
    output:
        "gwas/results/custom_disease_ids.txt"
    run:
        seen_ids = set()
        with open(input[0], "r") as catalog, open(output[0], "w") as table:
            for row in catalog:
                # Replace the catalog header by the custom header
                if row.startswith("DATE"):
                    table.write("MAPPED_TRAIT\tMAPPED_TRAIT_URI\tCUSTOM_ID\n")
                    continue
                fields = row.split("\t")
                custom_id = "-".join(
                    uri.split("/")[-1] for uri in fields[35].split(",")
                )
                if custom_id in seen_ids:
                    continue
                seen_ids.add(custom_id)
                table.write(
                    "\t".join([fields[34], fields[35], custom_id]) + "\n"
                )

# Count the lines of the trait table that do not contain 'MAPPED_TRAIT'
# (i.e. everything but the header written by rule list_diseases).
rule num_diseases:
    input:
        "gwas/results/custom_disease_ids.txt"
    output:
        "gwas/results/num_diseases.txt"
    shell:
        "grep -v 'MAPPED_TRAIT' {input} | wc -l > {output}"

# Custom IDs (last column) of all traits in the table written by rule
# list_diseases. The list stays empty as long as that table does not exist.
GWAS_CATALOG_DISEASES = []
if os.path.exists("gwas/results/custom_disease_ids.txt"):
    with open("gwas/results/custom_disease_ids.txt", "r") as f_in:
        GWAS_CATALOG_DISEASES = [
            row.strip("\n").split("\t")[-1]
            for row in f_in
            # Skip header and skip lines without mapped trait
            if not (row.startswith("MAPPED") or row == "\t\t\n")
        ]

# Use the GWAS catalog data and make one list of associations for every
# disease / phenotype. The file name is built from the ontology IDs provided in
# column MAPPED_TRAIT_URI (same custom ID as in rule list_diseases).
# Watchout: there are 249 associations without mapped ID
rule make_disease_specific_gwas_lists:
    input:
        "gwas/results/alternative"
    output:
        expand("gwas/results/{disease}_catalog.tab", disease=GWAS_CATALOG_DISEASES)
    run:
        # Read the catalog only once (instead of once per disease) and group
        # the association lines by their custom disease ID
        header_lines = []
        lines_by_disease = {}
        with open(input[0], "r") as f_in:
            for line in f_in:
                if line[:4] == "DATE":
                    header_lines.append(line)
                    continue
                s = line.split("\t")
                disease_id = "-".join([x.split("/")[-1] for x in s[35].split(",")])
                lines_by_disease.setdefault(disease_id, []).append(line)
        # The outputs were expanded from GWAS_CATALOG_DISEASES, so both
        # sequences are in the same order
        for filename, disease in zip(output, GWAS_CATALOG_DISEASES):
            with open(filename, "w") as f_out:
                # Header first, then the associations in catalog order; a
                # disease without any association gets the header only
                f_out.writelines(header_lines)
                f_out.writelines(lines_by_disease.get(disease, []))

# Count the associations per disease: number of lines of the disease-specific
# catalog file minus one (the header line).
rule num_associations_per_disease:
    input:
        expand("gwas/results/{disease}_catalog.tab", disease=GWAS_CATALOG_DISEASES)
    output:
        "gwas/results/num_associations_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES, so both
            # sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES):
                with open(filename, "r") as f_in:
                    num_lines = sum(1 for _ in f_in)
                f_out.write(disease + "\t" + str(num_lines - 1) + "\n")

# Make a list of disease-specific gwas hits only for studies using individuals
# of European ancestry
rule filter_disease_specific_gwas_lists_for_european:
    input:
        "gwas/results/{disease}_catalog.tab",
        "gwas/results/ancestry"
    output:
        "gwas/results/{disease}_european.tab"
    run:
        # Collect the accessions (column 1) of the studies having an ancestry
        # entry whose column "BROAD ANCESTRAL CATEGORY" (column 9) is exactly
        # "European".
        # NOTE(review): a study that also has a non-European ancestry entry is
        # kept as well - confirm that this is intended.
        # The ancestry file is only read here. Its header is no longer written
        # to the output, because the output is created from scratch below.
        european_studies = set()
        with open(input[1], "r") as f_in:
            for line in f_in:
                if line[:5] == "STUDY":
                    continue
                s = line.split("\t")
                if s[8] == "European":
                    european_studies.add(s[0])
        # Keep the header and the associations reported by these studies
        with open(input[0], "r") as f_in, open(output[0], "w") as f_out:
            for line in f_in:
                if line[:4] == "DATE":
                    f_out.write(line)
                    continue
                # get the study accession of this association
                study_accession = line.split("\t")[36]
                if study_accession in european_studies:
                    f_out.write(line)

# Count the European associations per disease: number of lines of the
# European-filtered file minus one (the header line).
rule num_european_associations_per_disease:
    input:
        expand("gwas/results/{disease}_european.tab", disease=GWAS_CATALOG_DISEASES)
    output:
        "gwas/results/num_europeanassociations_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES, so both
            # sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES):
                with open(filename, "r") as f_in:
                    num_lines = sum(1 for _ in f_in)
                f_out.write(disease + "\t" + str(num_lines - 1) + "\n")

# Target rule: the European-filtered association list of every disease
rule filter_disease_specific_gwas_lists_for_european_all:
    input:
        expand(
            "gwas/results/{disease}_european.tab",
            disease=GWAS_CATALOG_DISEASES,
        )

# Keep only those disease-specific associations that have another association
# of the same disease on the same chromosome less than 1 MB away
rule filter_disease_specific_gwas_lists_for_recurrent_loci:
    input:
        "gwas/results/{disease}_european.tab"
    output:
        "gwas/results/{disease}_recurrentloci.tab"
    run:
        def unique_position(fields):
            # (chromosome, position) from columns 12 and 13, or None when
            # column 13 is empty or contains "x" or ";", i.e. does not
            # specify one unique position
            raw_pos = fields[12]
            if raw_pos == "" or "x" in raw_pos or ";" in raw_pos:
                return None
            return fields[11], int(raw_pos)

        # First pass: record the position of every usable association
        known = []
        with open(input[0], "r") as assoc_file:
            for row in assoc_file:
                if row.startswith("DATE"):
                    continue
                located = unique_position(row.split("\t"))
                if located is not None:
                    known.append((located[0], located[1], row))
        # Second pass: write the header and every association having a
        # neighbour among the recorded ones
        with open(input[0], "r") as assoc_file, open(output[0], "w") as kept:
            for row in assoc_file:
                if row.startswith("DATE"):
                    kept.write(row)
                    continue
                located = unique_position(row.split("\t"))
                if located is None:
                    continue
                chrom, pos = located
                # Lines identical to the current one (including the line
                # itself) do not count as a neighbour
                has_neighbour = any(
                    other_row != row
                    and other_chrom == chrom
                    and abs(pos - other_pos) < 1000000
                    for other_chrom, other_pos, other_row in known
                )
                if has_neighbour:
                    kept.write(row)

# Count the recurrent-locus associations per disease: number of lines of the
# filtered file minus one (the header line).
rule num_recurrentloci_per_disease:
    input:
        expand("gwas/results/{disease}_recurrentloci.tab", disease=GWAS_CATALOG_DISEASES)
    output:
        "gwas/results/num_recurrentloci_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES, so both
            # sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES):
                with open(filename, "r") as f_in:
                    num_lines = sum(1 for _ in f_in)
                f_out.write(disease + "\t" + str(num_lines - 1) + "\n")

# Target rule: the recurrent-locus association list of every disease
rule filter_disease_specific_gwas_lists_for_recurrent_loci_all:
    input:
        expand(
            "gwas/results/{disease}_recurrentloci.tab",
            disease=GWAS_CATALOG_DISEASES,
        )

# Population codes that rule extract_european_1000g accepts as European
POPULATIONS_EUR = ["CEU","FIN","GBR","IBS","TSI"]

# Write the ID in column 2 of every ped line whose column 7 is one of the
# European population codes and whose column 14 equals "1".
# NOTE(review): the meaning of column 14 is not visible here - verify against
# the header of the ped file.
rule extract_european_1000g:
    input:
        "1000_genomes/integrated_call_samples_v2.20130502.ALL.ped"
    output:
        "gwas/ld/keep_indiv.txt"
    run:
        with open(input[0], "r") as ped, open(output[0], "w") as keep:
            for row in ped:
                fields = row.split("\t")
                if fields[6] in POPULATIONS_EUR and fields[13] == "1":
                    keep.write(fields[1] + "\n")

# Select from the per-chromosome VCF files those individuals that are to be
# used (listed in keep_indiv.txt). No allele-number filter is applied by this
# command; --min-alleles/--max-alleles are set in rule filter_european_snps.
rule select_european_1000g_individual_genotypes:
    input:
        "1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz",
        "gwas/ld/keep_indiv.txt"
    output:
        "gwas/ld/EUR.chr{chr}_GRCh38.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--keep {input[1]} "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Concatenate the VCF files of chromosomes 1 to 22 into one file.
# --pad-missing: Write '.' in place of missing columns. Useful for joining chrY
# with the rest.
rule concatenate_european_chr_vcfs:
    input:
        expand(
            "gwas/ld/EUR.chr{chr}_GRCh38.vcf.gz",
            chr=list(map(str, range(1, 23))),
        )
    output:
        "gwas/ld/EUR_GRCh38.vcf.gz"
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcf-concat --pad-missing {input} | bgzip > {output}"

# Filter the European variants with the vcftools options given below:
# exactly two alleles, --maf 0.05 and --hwe 0.000001.
rule filter_european_snps:
    input:
        "gwas/ld/EUR_GRCh38.vcf.gz"
    output:
        "gwas/ld/EUR_GRCh38_common.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--min-alleles 2 "
        "--max-alleles 2 "
        "--maf 0.05 "
        "--hwe 0.000001 "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Extract the rsids from GWAS of the specific disease
rule get_snp_ids:
    input:
        "gwas/results/{disease}_recurrentloci.tab"
    output:
        "gwas/ld/{disease}.rsids"
    run:
        with open(input[0], "r") as assoc_file, open(output[0], "w") as rsid_file:
            for row in assoc_file:
                if row.startswith("DATE"):
                    continue
                # The ID is the part of column 21 before the first "-"
                # (presumably entries look like "rs123-A" - verify)
                snp_allele = row.split("\t")[20]
                rsid_file.write(snp_allele.split("-")[0] + "\n")

# Extract from the filtered European variants those SNPs whose IDs are listed
# in the rsid file of the disease
rule european_vcf_for_disease:
    input:
        "gwas/ld/EUR_GRCh38_common.vcf.gz",
        "gwas/ld/{disease}.rsids"
    output:
        "gwas/ld/{disease}_EUR.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--snps {input[1]} "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Diseases whose count in the recurrent-loci table is not "0". The list stays
# empty as long as that table does not exist.
GWAS_CATALOG_DISEASES_LOCI = []
if os.path.exists("gwas/results/num_recurrentloci_per_disease.txt"):
    with open("gwas/results/num_recurrentloci_per_disease.txt", "r") as f_in:
        GWAS_CATALOG_DISEASES_LOCI = [
            fields[0]
            for fields in (row.strip("\n").split("\t") for row in f_in)
            if fields[1] != "0"
        ]

# Count per disease the records of the disease-specific European VCF file,
# i.e. all lines not starting with "#".
rule num_recurrentloci_in_vcf_per_disease:
    input:
        expand("gwas/ld/{disease}_EUR.vcf.gz", disease=GWAS_CATALOG_DISEASES_LOCI)
    output:
        "gwas/results/num_recurrentlocivcf_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES_LOCI, so both
            # sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES_LOCI):
                # Text mode, so that the lines need not be decoded one by one
                with gzip.open(filename, "rt", encoding="utf-8") as f_in:
                    num_ass = sum(1 for line in f_in if not line.startswith("#"))
                f_out.write(disease + "\t" + str(num_ass) + "\n")

# Diseases whose count in the VCF record table is neither "0" nor "1", i.e.
# with at least two records. The list stays empty as long as that table does
# not exist.
GWAS_CATALOG_DISEASES_LOCI_VCF = []
if os.path.exists("gwas/results/num_recurrentlocivcf_per_disease.txt"):
    with open("gwas/results/num_recurrentlocivcf_per_disease.txt", "r") as f_in:
        GWAS_CATALOG_DISEASES_LOCI_VCF = [
            fields[0]
            for fields in (row.strip("\n").split("\t") for row in f_in)
            if fields[1] not in ["0", "1"]
        ]

# Compute the LD between any pair of SNPs
# Output only pairs with an r2 of at least 0.8 (--min-r2 0.8)
# Compute only for SNPs less than 1MB apart from each other
rule compute_ld_1000g:
    input:
        "gwas/ld/{disease}_EUR.vcf.gz"
    output:
        "gwas/ld/{disease}.geno.ld"
    params:
        # Output path without its ".geno.ld" suffix
        log_base=lambda wildcards, output: output[0][:-len(".geno.ld")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--geno-r2 "
        "--ld-window-bp 1000000 "
        "--min-r2 0.8 "
        "--out {params.log_base} "

# Count per disease the lines of the LD table.
# Watchout: the count includes the header line of the table. The previous
# header check compared a single character with "CHR" and therefore never
# matched; the counting is kept as it was, because the list
# GWAS_CATALOG_DISEASES_LOCI_LD built from this table treats a count of "1" as
# "header only".
rule num_recurrentloci_in_ld_per_disease:
    input:
        expand("gwas/ld/{disease}.geno.ld", disease=GWAS_CATALOG_DISEASES_LOCI_VCF)
    output:
        "gwas/results/num_recurrentlocild_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES_LOCI_VCF, so
            # both sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES_LOCI_VCF):
                with open(filename, "r") as f_in:
                    num_ass = sum(1 for _ in f_in)
                f_out.write(disease + "\t" + str(num_ass) + "\n")

# Diseases whose count in the LD table is not "1". The counts include the
# header line of the LD file, so there must be more than the header line, i.e.
# at least one pair of SNPs, to continue. The list stays empty as long as the
# table does not exist.
GWAS_CATALOG_DISEASES_LOCI_LD = []
if os.path.exists("gwas/results/num_recurrentlocild_per_disease.txt"):
    with open("gwas/results/num_recurrentlocild_per_disease.txt", "r") as f_in:
        GWAS_CATALOG_DISEASES_LOCI_LD = [
            fields[0]
            for fields in (row.strip("\n").split("\t") for row in f_in)
            if fields[1] != "1"
        ]

# The idea is to keep one association per locus, where we define a locus as
# a region of 1MB around a tag association (this means we miss independent
# association signals that are closer than 1 MB from each other)
# We automatically select the association based on the number of occurrences
# per rsid position plus this position must be in LD>0.8 with at least one
# other association
rule keep_one_variant_per_locus:
    input:
        "gwas/results/{disease}_recurrentloci.tab",
        "gwas/ld/{disease}.geno.ld"
    output:
        "gwas/results/{disease}_loci.tab"
    run:
        # Read in the association positions with high ld >0.8: both positions
        # of every pair listed in the LD table (columns 1 to 3)
        position_count = {}
        with open(input[1], "r") as f_in:
            for line in f_in:
                if line[:3] == "CHR":
                    continue
                s = line.split("\t")
                chrom, pos1, pos2 = s[0], s[1], s[2]
                position_count[(chrom, pos1)] = 0
                position_count[(chrom, pos2)] = 0
        # Count how often each of these positions occurs in the association
        # list (columns 12 and 13)
        with open(input[0], "r") as f_in:
            for line in f_in:
                if line[:4] == "DATE":
                    continue
                s = line.split("\t")
                # Only positions occurring at least once in ld with another ass.
                if (s[11], s[12]) in position_count:
                    position_count[(s[11], s[12])] += 1
        # Go over the positions in descending order of their occurrences (the
        # sort is stable, ties stay in reading order) and open a new locus for
        # every position that is more than 1 MB away, in either direction, from
        # all loci chosen so far. abs() is essential here: without it every
        # position upstream of an existing locus was merged into that locus.
        # An LD table without any pair simply yields no locus.
        sorted_pc = sorted(position_count.items(), key=lambda x: -x[1])
        loci = []
        for (chrom, pos), occ in sorted_pc:
            in_known_locus = any(
                chrom == loc_chrom and abs(int(pos) - int(loc_pos)) <= 1000000
                for loc_chrom, loc_pos in loci
            )
            if not in_known_locus:
                loci.append((chrom, pos))
        tag_positions = set(loci)
        # Go over the input gwas ass. file and write only those ass. to output
        # which are uniquely defining a locus
        with open(input[0], "r") as f_in, open(output[0], "w") as f_out:
            for line in f_in:
                if line[:4] == "DATE":
                    f_out.write(line)
                    continue
                s = line.split("\t")
                if (s[11], s[12]) in tag_positions:
                    f_out.write(line)

# Count the loci per disease: number of lines of the loci file minus one (the
# header line).
rule num_european_loci_per_disease:
    input:
        expand("gwas/results/{disease}_loci.tab", disease=GWAS_CATALOG_DISEASES_LOCI_LD)
    output:
        "gwas/results/num_loci_per_disease.txt"
    run:
        # Open the output once in write mode: the table is created even for an
        # empty disease list and is never appended to existing content
        with open(output[0], "w") as f_out:
            # The inputs were expanded from GWAS_CATALOG_DISEASES_LOCI_LD, so
            # both sequences are in the same order
            for filename, disease in zip(input, GWAS_CATALOG_DISEASES_LOCI_LD):
                with open(filename, "r") as f_in:
                    num_lines = sum(1 for _ in f_in)
                f_out.write(disease + "\t" + str(num_lines - 1) + "\n")

# Make a long list for the tag snps obtained like this combining all diseases:
# drop the lines containing 'SNPS' (the headers), keep columns 12, 13 and 22
# and remove duplicate rows
rule tag_snp_list:
    input:
        expand(
            "gwas/results/{disease}_loci.tab",
            disease=GWAS_CATALOG_DISEASES_LOCI_LD,
        )
    output:
        "gwas/tag_snps/tag_snps.txt"
    shell:
        "cat {input} | "
        "grep -v 'SNPS' | "
        "cut -f 12,13,22 | "
        "sort | uniq > {output}"

# Keep only the third column of the tag SNP list (the column taken from
# catalog column 22 by rule tag_snp_list)
rule tag_rsids:
    input:
        "gwas/tag_snps/tag_snps.txt"
    output:
        "gwas/tag_snps/tag_snps.rsids"
    shell:
        "cut -f 3 {input} > {output}"

# How much does allele frequency differ for the tag snps
# For this, first select the tag SNPs from the European data (the allele
# frequencies are computed by rule af_tag_european)
rule select_tag_european:
    input:
        "gwas/ld/EUR_GRCh38_common.vcf.gz",
        "gwas/tag_snps/tag_snps.rsids"
    output:
        "gwas/tag_snps/tag_snps_EUR.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--snps {input[1]} "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Compute the allele frequencies (vcftools --freq) of the European tag SNPs
rule af_tag_european:
    input:
        "gwas/tag_snps/tag_snps_EUR.vcf.gz"
    output:
        "gwas/tag_snps/tag_snps_EUR.frq"
    params:
        # Output path without its ".frq" suffix
        log_base=lambda wildcards, output: output[0][:-len(".frq")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--freq "
        "--out {params.log_base} "

# Select the tag SNPs (by rsid) from the annotated Egyptian variants
rule select_tag_egyptian:
    input:
        "vep_annotation/vep.vcf.gz",
        "gwas/tag_snps/tag_snps.rsids"
    output:
        "gwas/tag_snps/tag_snps_EGP.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--snps {input[1]} "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Compute the allele frequencies (vcftools --freq) of the Egyptian tag SNPs
rule af_tag_egyptian:
    input:
        "gwas/tag_snps/tag_snps_EGP.vcf.gz"
    output:
        "gwas/tag_snps/tag_snps_EGP.frq"
    params:
        # Output path without its ".frq" suffix
        log_base=lambda wildcards, output: output[0][:-len(".frq")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--freq "
        "--out {params.log_base} "

# Compare the alternative allele frequencies of the tag SNPs between the
# European and the Egyptian .frq files. Three files are written:
#  - tag_snps_af.txt: every locus (chrom, pos, ref, alt) of the European file
#    with the European and the Egyptian values
#  - tag_snps_af_multiallelic.txt: Egyptian loci not found in the European file
#  - tag_snps_posofmissingrsid.txt: windows of +/-15 bp around the European
#    loci not found in the Egyptian file
rule compare_tag_af:
    input:
        "gwas/tag_snps/tag_snps_EUR.frq",
        "gwas/tag_snps/tag_snps_EGP.frq"
    output:
        "gwas/tag_snps/tag_snps_af.txt",
        "gwas/tag_snps/tag_snps_af_multiallelic.txt",
        "gwas/tag_snps/tag_snps_posofmissingrsid.txt"
    run:
        # Number of chromosomes written for tag SNPs that are not called in
        # the Egyptian data
        EGP_N_CHR_UNCALLED = "220"

        def read_frq_line(line):
            # Split one data line of a .frq file into
            # (chrom, pos, ref, alt, n_chr, alt_af)
            s = line.strip("\n").split("\t")
            # Make sure all tag SNPs are in fact bi-allelic
            assert(s[2] == "2")
            ref,ref_af = s[4].split(":")
            alt,alt_af = s[5].split(":")
            # Make sure frequencies add up to 1
            assert(0.9999<=float(ref_af)+float(alt_af)<=1.0001)
            return s[0],s[1],ref,alt,s[3],alt_af

        header = "\t".join(["CHROM","POS","REF","ALT","EUR_N_CHR","EUR_ALT_AF","EGP_N_CHR","EGP_ALT_AF"])
        # NOTE(review): the rows of the multiallelic file hold the Egyptian
        # values, although this header names them EUR_*. The header is left
        # unchanged because possible consumers of the file are not visible here.
        header_multi = "\t".join(["CHROM","POS","REF","ALT","EUR_N_CHR","EUR_ALT_AF"])
        eur_af = {}
        with open(input[0],"r") as f_in:
            for line in f_in:
                # Skip header
                if line[:5] == "CHROM":
                    continue
                chrom,pos,ref,alt,nchrom,alt_af = read_frq_line(line)
                eur_af["\t".join([chrom,pos,ref,alt])] = [nchrom,alt_af]
        # European loci not (yet) seen in the Egyptian data
        eur_af_missing = dict(eur_af)
        with open(input[1],"r") as f_in, open(output[0],"w") as f_out, \
             open(output[1],"w") as f_out_multi, \
             open(output[2],"w") as f_out_missing:
            # The region file is passed to "vcftools --bed", which expects a
            # header line and skips the first line of the file. Without this
            # header the first region would be lost.
            f_out_missing.write("#chrom\tstart\tend\n")
            for line in f_in:
                # Skip header; write new header
                if line[:5] == "CHROM":
                    f_out.write(header+"\n")
                    f_out_multi.write(header_multi+"\n")
                    continue
                chrom,pos,ref,alt,nchrom,alt_af = read_frq_line(line)
                # The European loci are keyed without the "chr" prefix
                if chrom[:3] == "chr":
                    chrom = chrom[3:]
                loc = "\t".join([chrom,pos,ref,alt])
                egp_af = [nchrom,alt_af]
                if loc not in eur_af:
                    f_out_multi.write(loc+"\t"+"\t".join(egp_af)+"\n")
                else:
                    f_out.write(loc+"\t"+"\t".join(eur_af[loc])+"\t"+"\t".join(egp_af)+"\n")
                    # This locus is present in Egyptian data. pop() instead of
                    # del, so that a locus listed twice does not raise
                    eur_af_missing.pop(loc, None)
            # Those tag SNPs not called in the Egyptian data have allele
            # frequency of zero
            for loc, eur_values in eur_af_missing.items():
                f_out.write(loc+"\t"+"\t".join(eur_values)+"\t"+EGP_N_CHR_UNCALLED+"\t0\n")
                missing_chr, missing_pos = loc.split("\t")[:2]
                start = str(int(missing_pos)-15)
                end = str(int(missing_pos)+15)
                f_out_missing.write("chr"+missing_chr+"\t"+start+"\t"+end+"\n")

# There are only two positions of "missing" rsids in the Egyptian data
# (chr12:102577841 and chr17:78711314);
# Further within the MHC locus (chr6:28510020-33480577) are 42 positions
# Here we check whether there is variant data in the region surrounding the
# missing SNP positions in order to check whether missing SNPs are maybe caused
# by inconsistent indel normalisation; this appears to be the case for about
# 20 positions
rule select_missing_tag_egyptian_by_pos:
    input:
        "vep_annotation/vep.vcf.gz",
        "gwas/tag_snps/tag_snps_posofmissingrsid.txt"
    output:
        "gwas/tag_snps/tag_snps_EGP_by_pos.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--bed {input[1]} "
        "--recode-INFO-all "
        "--recode "
        "--out {params.log_base} "
        "--stdout | bgzip > {output[0]}"

# Run the R script that produces the four tag SNP allele frequency figures
# from the allele frequency table
rule plot_tag_af_diff:
    input:
        "gwas/tag_snps/tag_snps_af.txt"
    output:
        "gwas/tag_snps/figures/tag_af_diff_hist.pdf",
        "gwas/tag_snps/figures/tag_af_diff_scatter.pdf",
        "gwas/tag_snps/figures/tag_af_diff_missing.pdf",
        "gwas/tag_snps/figures/tag_af_diff_boxplot.pdf"
    script:
        "scripts/plot_tag_af_diff.R"
        
# List the IDs (third VCF column) of the tag SNPs present in the Egyptian data.
# Only lines starting with '#' (the VCF header) are dropped; the pattern is
# anchored so that a record that merely contains a '#' somewhere is kept.
rule get_tag_snps_in_egyptians:
    input:
        "gwas/tag_snps/tag_snps_EGP.vcf.gz"
    output:
        "gwas/tag_snps/tag_snps_egyptians.rsids"
    shell:
        "zcat {input} | grep -v '^#' | cut -f 3 > {output}"

# List the tag SNPs present in the Egyptian data as <chrom>_<pos>_<id>, built
# from the first three VCF columns. "cut -c 4-" removes the first three
# characters of every line (presumably the "chr" prefix of the chromosome).
# Only lines starting with '#' (the VCF header) are dropped; the pattern is
# anchored so that a record that merely contains a '#' somewhere is kept.
rule get_tag_snps_and_pos_in_egyptians:
    input:
        "gwas/tag_snps/tag_snps_EGP.vcf.gz"
    output:
        "gwas/tag_snps/tag_snppos_egyptians.txt"
    shell:
        "zcat {input} | grep -v '^#' | "
        "awk '{{print $1 \"_\" $2 \"_\" $3}}' | "
        "cut -c 4- > {output}"

# Tag SNPs as written by rule get_tag_snps_and_pos_in_egyptians, one entry per
# line. The list stays empty as long as that file does not exist.
TAG_SNPPOS = []
if os.path.exists("gwas/tag_snps/tag_snppos_egyptians.txt"):
    with open("gwas/tag_snps/tag_snppos_egyptians.txt", "r") as f_in:
        TAG_SNPPOS = [row.strip("\n") for row in f_in]

# Write a one-line BED file with the window of +/-1MB around the tag SNP given
# by the wildcard <chrom>_<pos>_<rsid>
rule bed_for_eur_locus_extraction:
    output:
        "gwas/tag_snps/{rsid_pos}_EUR.bed"
    run:
        chrom,pos,rsid = wildcards.rsid_pos.split("_")
        # Clamp at 0: a tag SNP within the first 1MB of a chromosome would
        # otherwise get a negative start coordinate
        start = str(max(0, int(pos)-1000000))
        end = str(int(pos)+1000000)
        with open(output[0],"w") as f_out:
            f_out.write("\t".join([chrom,start,end])+"\n")

# Create the tabix index of the concatenated European VCF file
rule index_eur_variants:
    input:
        "gwas/ld/EUR_GRCh38.vcf.gz"
    output:
        "gwas/ld/EUR_GRCh38.vcf.gz.tbi"
    shell:
        "tabix -p vcf {input}"

# Extract the European SNPs within a window of +/-1MB around the tag SNP; the
# window is taken from the BED file, the index is required as an input only
rule tag_snp_loci_european:
    input:
        "gwas/ld/EUR_GRCh38.vcf.gz",
        "gwas/ld/EUR_GRCh38.vcf.gz.tbi",
        "gwas/tag_snps/{rsid_pos}_EUR.bed"
    output:
        "gwas/tag_snps/{rsid_pos}_EUR.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix (not used by the command)
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "bcftools view --regions-file {input[2]} {input[0]} | "
        "bgzip > {output[0]}"

# For the Egyptian variants, we need a leading "chr" in the chromosome
# definition in the bed file.
# Write a one-line BED file with the window of +/-1MB around the tag SNP given
# by the wildcard <chrom>_<pos>_<rsid>
rule bed_for_egp_locus_extraction:
    output:
        "gwas/tag_snps/{rsid_pos}_EGP.bed"
    run:
        chrom,pos,rsid = wildcards.rsid_pos.split("_")
        # Clamp at 0: a tag SNP within the first 1MB of a chromosome would
        # otherwise get a negative start coordinate
        start = str(max(0, int(pos)-1000000))
        end = str(int(pos)+1000000)
        with open(output[0],"w") as f_out:
            f_out.write("\t".join(["chr"+chrom,start,end])+"\n")

# Create the tabix index of the annotated Egyptian VCF file
rule index_egp_variants:
    input:
        "vep_annotation/vep.vcf.gz"
    output:
        "vep_annotation/vep.vcf.gz.tbi"
    shell:
        "tabix -p vcf {input}"

# Extract the Egyptian SNPs within a window of +/-1MB around the tag SNP; the
# window is taken from the BED file, the index is required as an input only
rule tag_snp_loci_egyptian:
    input:
        "vep_annotation/vep.vcf.gz",
        "vep_annotation/vep.vcf.gz.tbi",
        "gwas/tag_snps/{rsid_pos}_EGP.bed"
    output:
        "gwas/tag_snps/{rsid_pos}_EGP.vcf.gz"
    params:
        # Output path without its ".vcf.gz" suffix (not used by the command)
        log_base=lambda wildcards, output: output[0][:-len(".vcf.gz")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "bcftools view --regions-file {input[2]} {input[0]} | "
        "bgzip > {output[0]}"

# Target rule: the European and the Egyptian locus VCF file of every tag SNP
rule tag_snp_loci_european_egyptian_all:
    input:
        expand("gwas/tag_snps/{rsid_pos}_EUR.vcf.gz", rsid_pos=TAG_SNPPOS),
        expand("gwas/tag_snps/{rsid_pos}_EGP.vcf.gz", rsid_pos=TAG_SNPPOS)

# Write the position file of a tag SNP for the European LD computation:
# a "#" line followed by chromosome and position of the tag SNP
rule make_eur_snp_pos_file:
    output:
        "gwas/tag_snps/{rsid_pos}_EUR.pos"
    run:
        chrom, pos, _rsid = wildcards.rsid_pos.split("_")
        # vcftools skips the first row, assuming it is a comment or so, hence
        # the leading "#" line
        content = "#\n" + chrom + "\t" + pos + "\n"
        with open(output[0], "w") as pos_file:
            pos_file.write(content)

# Compute the LD (r2) between the tag SNP position and the other European
# variants of its locus; only pairs less than 1MB apart and with an r2 of at
# least 0.8 are reported
rule compute_european_proxy_snps:
    input:
        "gwas/tag_snps/{rsid_pos}_EUR.vcf.gz",
        "gwas/tag_snps/{rsid_pos}_EUR.pos"
    output:
        "gwas/tag_snps/{rsid_pos}_EUR.list.geno.ld"
    params:
        # Output path without its ".list.geno.ld" suffix
        log_base=lambda wildcards, output: output[0][:-len(".list.geno.ld")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--geno-r2-positions {input[1]} "
        "--ld-window-bp 1000000 "
        "--min-r2 0.8 "
        "--out {params.log_base} "

# Write the position file of a tag SNP for the Egyptian LD computation:
# a "#" line followed by the "chr"-prefixed chromosome and the position
rule make_egp_snp_pos_file:
    output:
        "gwas/tag_snps/{rsid_pos}_EGP.pos"
    run:
        chrom, pos, _rsid = wildcards.rsid_pos.split("_")
        # vcftools skips the first row, assuming it is a comment or so, hence
        # the leading "#" line
        content = "#\n" + "chr" + chrom + "\t" + pos + "\n"
        with open(output[0], "w") as pos_file:
            pos_file.write(content)

# Compute the LD (r2) between the tag SNP position and the other Egyptian
# variants of its locus; only pairs less than 1MB apart and with an r2 of at
# least 0.8 are reported
rule compute_egyptian_proxy_snps:
    input:
        "gwas/tag_snps/{rsid_pos}_EGP.vcf.gz",
        "gwas/tag_snps/{rsid_pos}_EGP.pos"
    output:
        "gwas/tag_snps/{rsid_pos}_EGP.list.geno.ld"
    params:
        # Output path without its ".list.geno.ld" suffix
        log_base=lambda wildcards, output: output[0][:-len(".list.geno.ld")]
    conda:
        "envs/genotype_pcs.yaml"
    shell:
        "vcftools --gzvcf {input[0]} "
        "--geno-r2-positions {input[1]} "
        "--ld-window-bp 1000000 "
        "--min-r2 0.8 "
        "--out {params.log_base} "

# Target rule: the European and the Egyptian proxy LD table of every tag SNP
rule compute_proxy_snps:
    input:
        expand("gwas/tag_snps/{rsid_pos}_EUR.list.geno.ld", rsid_pos=TAG_SNPPOS),
        expand("gwas/tag_snps/{rsid_pos}_EGP.list.geno.ld", rsid_pos=TAG_SNPPOS)

# Make a file with the proxy SNP numbers and add information such as MAF etc.:
# every row of the tag SNP allele frequency table is extended by the number of
# European and Egyptian proxies and by the proxy sharing numbers
rule num_proxy_snps:
    input:
        "gwas/tag_snps/tag_snps_af.txt",
        "gwas/tag_snps/proxycomparison.txt",
        expand(
            "gwas/tag_snps/{rsid_pos}_{pop}.list.geno.ld",
            rsid_pos=TAG_SNPPOS,
            pop=["EUR", "EGP"],
        )
    output:
        "gwas/tag_snps/num_proxies.txt"
    run:
        # Number of proxies per tag SNP and population: number of lines of the
        # proxy file minus one (the header line)
        proxy_counts = {}
        for ld_path in input[2:]:
            with open(ld_path, "r") as ld_file:
                # File names look like <chrom>_<pos>_<rsid>_<pop>.list.geno.ld
                stem = ld_path.split("/")[-1].split(".")[0]
                chrom, pos, rsid, pop = stem.split("_")
                proxy_counts[(chrom, pos, pop)] = str(len(ld_file.readlines()) - 1)
        # Read in the proxycomparison (i.e. shared, *-only, etc.)
        # There are 6 tag snps that are multiallelic in the Egyptian data
        # but having the same rsid. We don't want to use them because LD is
        # likely not computed for the correct allele.
        # NOTE(review): positions occurring more than once are not removed
        # here; a later row simply replaces the earlier one - confirm whether
        # the removal happens when the proxycomparison file is created.
        sharing_by_position = {}
        with open(input[1], "r") as comparison:
            for row in comparison:
                if row.startswith("CHROM"):
                    continue
                fields = row.strip("\n").split("\t")
                sharing_by_position["\t".join(fields[:2])] = "\t".join(fields[2:])
        # Read in the base tag SNP file to which to add proxy numbers
        with open(input[0], "r") as af_table, open(output[0], "w") as result:
            for row in af_table:
                body = row.strip("\n")
                # Write new header
                if row.startswith("CHROM"):
                    result.write(body + "\tEUR_N_PROXY\tEGP_N_PROXY\tPROXY_SHARED\tPROXY_EUR_ONLY\tPROXY_EGP_ONLY\n")
                    continue
                fields = row.split("\t")
                chrom, pos = fields[0], fields[1]
                # The sharing numbers are only looked up for tag SNPs having a
                # European proxy file
                if (chrom, pos, "EUR") in proxy_counts:
                    num_eur = proxy_counts[(chrom, pos, "EUR")]
                    proxy_sharing = sharing_by_position[chrom + "\t" + pos]
                else:
                    num_eur = "NA"
                    proxy_sharing = "NA\tNA\tNA"
                num_egp = proxy_counts.get((chrom, pos, "EGP"), "NA")
                result.write("\t".join([body, num_eur, num_egp, proxy_sharing]) + "\n")

# Produce the proxy-related figures (PDF) from the per-tag-SNP proxy table.
# All plotting is delegated to the R script; the order of the output files
# is kept as is, since the script may address them by position.
rule plot_proxy_snp_numbers:
    input:
        "gwas/tag_snps/num_proxies.txt"
    output:
        "gwas/tag_snps/figures/proxy_number.pdf",
        "gwas/tag_snps/figures/proxysharing.pdf",
        "gwas/tag_snps/figures/af_difference.pdf",
        "gwas/tag_snps/figures/eurvsegp.pdf",
        "gwas/tag_snps/figures/eur_af_vs_proxynum.pdf",
        "gwas/tag_snps/figures/egp_af_vs_proxynum.pdf"
    script:
        "scripts/tag_proxies.R"

# Watch out: proxies at the same position are counted individually, i.e.
# duplicate positions in the list.geno.ld file are each counted separately.
rule compare_proxy_snps:
    input:
        "gwas/tag_snps/{rsid_pos}_EUR.list.geno.ld",
        "gwas/tag_snps/{rsid_pos}_EGP.list.geno.ld"
    output:
        "gwas/tag_snps/{rsid_pos}_proxycomparison.txt"
    run:
        # Compare the proxy SNPs of one tag SNP between the European (EUR)
        # and the Egyptian (EGP) LD lists and write a one-row summary with
        # the number of shared, EUR-only and EGP-only proxies.
        eur_path, egp_path = input[0], input[1]

        # Collect the European proxy positions (columns 3 and 4 of each
        # record). "chr" is prepended to the European key only, while the
        # Egyptian key below is used as is.
        # NOTE(review): presumably the EUR file lacks the "chr" prefix that
        # the EGP file carries - confirm against the input data.
        eur_positions = set()
        eur_total = 0
        with open(eur_path, "r") as eur_handle:
            for record in eur_handle:
                # Header line
                if record.startswith("CHR"):
                    continue
                eur_positions.add("chr" + "\t".join(record.split("\t")[2:4]))
                # Every record counts, also repeated positions
                eur_total += 1

        # Walk over the Egyptian proxies and classify each one
        shared = 0
        egp_private = 0
        with open(egp_path, "r") as egp_handle:
            for record in egp_handle:
                # Header line
                if record.startswith("CHR"):
                    continue
                key = "\t".join(record.split("\t")[2:4])
                if key in eur_positions:
                    shared += 1
                else:
                    egp_private += 1

        # Whatever European proxy was not matched is European-only
        eur_private = eur_total - shared

        # The first two "_"-separated parts of the wildcard are written as
        # the CHROM and POS columns of the tag SNP
        tag_position = "\t".join(wildcards.rsid_pos.split("_")[:2])
        summary = [tag_position, str(shared), str(eur_private), str(egp_private)]
        with open(output[0], "w") as report:
            report.write("CHROM\tPOS\tSHARED\tEUR_ONLY\tEGP_ONLY\n")
            report.write("\t".join(summary) + "\n")

rule compare_proxy_snps_all:
    input:
        expand("gwas/tag_snps/{rsid_pos}_proxycomparison.txt", rsid_pos=TAG_SNPPOS)
    output:
        "gwas/tag_snps/proxycomparison.txt"
    run:
        # Concatenate the per-tag-SNP comparison files into a single table
        # with exactly one header line at the top.
        column_names = "CHROM\tPOS\tSHARED\tEUR_ONLY\tEGP_ONLY\n"
        with open(output[0], "w") as merged:
            merged.write(column_names)
            for part_path in input:
                with open(part_path, "r") as part:
                    # Copy every row except the per-file header
                    merged.writelines(
                        row for row in part if not row.startswith("CHROM")
                    )

rule annotated_loci_files:
    input:
        "gwas/tag_snps/num_proxies.txt",
        "gwas/results/{disease}_loci.tab"
    output:
        "gwas/results/{disease}_loci_annotated.tab"
    run:
        # Append the matching line of the proxy table to every locus of the
        # disease. Loci are matched via chromosome and position, which are
        # columns 1-2 of the proxy table and columns 12-13 of the loci file.
        annotation_by_position = {}
        with open(input[0], "r") as proxies:
            for record in proxies:
                if record.startswith("CHROM"):
                    # Remember the header to extend the loci header with it
                    proxy_header = record
                else:
                    cols = record.split("\t")
                    annotation_by_position[cols[0] + "\t" + cols[1]] = record

        with open(input[1], "r") as loci, open(output[0], "w") as annotated:
            for record in loci:
                locus_fields = record.strip("\n") + "\t"
                if record.startswith("DATE"):
                    # Header line of the loci file
                    annotated.write(locus_fields + proxy_header)
                    continue
                cols = record.split("\t")
                # A locus missing from the proxy table raises a KeyError
                position = cols[11] + "\t" + cols[12]
                annotated.write(locus_fields + annotation_by_position[position])

rule combine_annotated_loci_files_all:
    input:
        expand("gwas/results/{disease}_loci_annotated.tab", disease=GWAS_CATALOG_DISEASES_LOCI_LD)
    output:
        "gwas/results/combined_loci_annotated.txt"
    run:
        # Merge the annotated loci of all diseases into one file. Only the
        # first header line (starting with "DATE") encountered is kept.
        need_header = True
        with open(output[0], "w") as combined:
            for part_path in input:
                with open(part_path, "r") as part:
                    for row in part:
                        is_header = row.startswith("DATE")
                        if is_header and not need_header:
                            # Header was already written by an earlier file
                            continue
                        combined.write(row)
                        if is_header:
                            need_header = False
                        

########### Matching Egyptian population-specific variants and GWAS data #######

# Extract those Egyptian population-specific SNPs that are within a given
# distance (the {distance} wildcard) of a SNP listed in the GWAS catalog
rule egyptian_popspecific_in_proximity:
    input: "gwas/results/{disease}_catalog.tab",
           "vep_annotation/popspecific_0.01_final.txt"
    output: "gwas/results_{distance}/{disease}_{distance}_matched.txt"
    run:
        # For every GWAS catalog association of the disease, write one output
        # line per Egyptian population-specific variant that lies within
        # +/- distance of the associated position: the catalog line followed
        # by the variant line, tab-separated.

        # Get the distance; variants within distance of the GWAS SNP are
        # matched (boundaries inclusive)
        dist = int(wildcards.distance)
        # Chromosome names accepted as autosomes; built once, not per line
        autosomes = {str(x) for x in range(1, 23)}
        # Read in positions of Egyptian population-specific SNPs, grouped by
        # chromosome
        pop_spec = {}
        header_pop = ""
        with open(input[1], 'r') as f_in:
            for line in f_in:
                # The header is recognised by its leading "chr"; data lines
                # are therefore expected not to start with "chr"
                if line[:3] == 'chr':
                    header_pop = line
                    continue
                s = line.split("\t")
                pop_spec.setdefault(s[0], []).append((int(s[1]), line))
        # Go over the GWAS catalog associations for this disease
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                # Extend the catalog header with the variant header
                if line[:4] == "DATE":
                    f_out.write(line.strip("\n")+"\t"+header_pop)
                    continue
                s = line.split("\t")
                chrom = s[11]
                # Skip GWAS catalog associations not on the autosomes
                if chrom not in autosomes:
                    continue
                # Skip GWAS catalog associations without a single position
                # (empty, interaction " x " or ";"-separated lists)
                if s[12] == "" or " x " in s[12] or ";" in s[12]:
                    continue
                pos = int(s[12])
                gwas_fields = line.strip("\n")
                # A chromosome without any population-specific variant simply
                # yields no match (same handling as in the SV rule) instead
                # of aborting with a KeyError
                for variant_pos, variant_line in pop_spec.get(chrom, ()):
                    if pos-dist <= variant_pos <= pos+dist:
                        f_out.write(gwas_fields+"\t"+variant_line)

# Aggregate target: request the matched files of every disease for each of
# the listed distances.
rule egyptian_popspecific_in_proximity_all:
    input:
        expand(
            "gwas/results_{distance}/{disease}_{distance}_matched.txt",
            disease=GWAS_CATALOG_DISEASES,
            distance=["10", "100", "1000", "10000", "100000"],
        )
        

######################### Matching Egyptian SVs and GWAS data ##################

# Extract those Egyptian SVs that are within a certain distance of a SNP listed
# in the GWAS catalog
rule egyptian_sv_in_proximity:
    input: "gwas/results/{disease}_catalog.tab",
           "sv_stats/egyptians_{sv_type}.vcf"
    output: "gwas/results_{distance}/{sv_type}/{disease}_{distance}_{sv_type}_matched.txt"
    run:
        # For every GWAS catalog association of the disease, write one output
        # line per SV breakpoint (start, or end taken from INFO END) that
        # lies within +/- distance of the associated position: the catalog
        # line followed by the VCF line, tab-separated. An SV whose start and
        # end both fall into the window is written once per breakpoint.

        # Get the distance; SV breakpoints within distance of the GWAS SNP
        # are matched (boundaries inclusive)
        dist = int(wildcards.distance)
        # Chromosome names accepted as autosomes; built once, not per line
        autosomes = {str(x) for x in range(1, 23)}
        # Read in the breakpoint positions of the Egyptian SVs, grouped by
        # chromosome (without "chr" prefix)
        svs = {}
        header_pop = ""
        with open(input[1], 'r') as f_in:
            for line in f_in:
                # Skip VCF meta-information lines
                if line.startswith("##"):
                    continue
                # Column header line; drop the leading "#"
                if line.startswith("#"):
                    header_pop = line[1:]
                    continue
                s = line.split("\t")
                # Only consider passing SV calls
                if not s[6] == "PASS":
                    continue
                chrom = s[0]
                if chrom[:3] == "chr":
                    chrom = chrom[3:]
                # Start breakpoint
                svs.setdefault(chrom, []).append([chrom,int(s[1]),line])
                # Get the end position from the key=value INFO entries
                info = {}
                for entry in s[7].split(";"):
                    key_value = entry.split("=")
                    if len(key_value) == 2:
                        info[key_value[0]] = key_value[1]
                if "END" not in info:
                    continue
                # Without CHR2 the end is taken to be on the same chromosome;
                # resolving it per record avoids reusing a stale value from
                # a previous line
                chrom2 = info.get("CHR2", chrom)
                if chrom2[:3] == "chr":
                    chrom2 = chrom2[3:]
                assert chrom == chrom2, \
                    "SV with breakpoints on different chromosomes: "+line
                # End breakpoint: use the END value, not the start again
                svs.setdefault(chrom2, []).append([chrom2,int(info["END"]),line])
        # Go over the GWAS catalog associations for this disease
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                # Extend the catalog header with the VCF column header
                if line[:4] == "DATE":
                    f_out.write(line.strip("\n")+"\t"+header_pop)
                    continue
                s = line.split("\t")
                chrom = s[11]
                # Skip GWAS catalog associations not on the autosomes
                if chrom not in autosomes:
                    continue
                # Skip GWAS catalog associations without a single position
                # (empty, interaction " x " or ";"-separated lists)
                if s[12] == "" or " x " in s[12] or ";" in s[12]:
                    continue
                pos = int(s[12])
                gwas_fields = line.strip("\n")
                # For insertions, e.g., there are not SVs on all chromosomes;
                # such chromosomes yield no match
                for variant in svs.get(chrom, ()):
                    if pos-dist <= variant[1] <= pos+dist:
                        f_out.write(gwas_fields+"\t"+variant[2])

# Aggregate target: request the SV match files of every disease for
# insertions and deletions at distance 10.
rule egyptian_sv_in_proximity_all:
    input:
        expand(
            "gwas/results_{distance}/{sv_type}/{disease}_{distance}_{sv_type}_matched.txt",
            disease=GWAS_CATALOG_DISEASES,
            sv_type=["insertions", "deletions"],
            distance=["10"],
        )
        

rule generate_paper_numbers:
    input: "gwas/results_table/number_of_index_snps.txt"