# Snakefile_variants

#kate:syntax python;

from global_variables import *
import gzip

# Rules to be executed for new assemblies: compute_content_and_assembly_numbers,
# repeatmasker_summary_table_egyptrefv2, align_assemblies_with_mummer_all, 

################################################################################
################### Analyzing variants of 110 Egyptians ########################
################################################################################


# The ten samples of the original Egyptian cohort (EGYPTREF plus LU*/PD* samples)
EGYPT_SAMPLES = [
    "EGYPTREF", "LU18", "LU19", "LU2", "LU22",
    "LU23", "LU9", "PD114", "PD115", "PD82",
]

# The 100 sample accessions labelled as Pagani samples
PAGANI_SAMPLES = [
    "EGAN00001101667", "EGAN00001101668", "EGAN00001101669", "EGAN00001101670",
    "EGAN00001101671", "EGAN00001101672", "EGAN00001101676", "EGAN00001101677",
    "EGAN00001101678", "EGAN00001101679", "EGAN00001101680", "EGAN00001101681",
    "EGAN00001101682", "EGAN00001101687", "EGAN00001101688", "EGAN00001101689",
    "EGAN00001101690", "EGAN00001101692", "EGAN00001101694", "EGAN00001101699",
    "EGAN00001101700", "EGAN00001101702", "EGAN00001101705", "EGAN00001101706",
    "EGAN00001101711", "EGAN00001101712", "EGAN00001101713", "EGAN00001101716",
    "EGAN00001101717", "EGAN00001101718", "EGAN00001101719", "EGAN00001101723",
    "EGAN00001101724", "EGAN00001101725", "EGAN00001101732", "EGAN00001101734",
    "EGAN00001101735", "EGAN00001101736", "EGAN00001101737", "EGAN00001101739",
    "EGAN00001101742", "EGAN00001101744", "EGAN00001101748", "EGAN00001101749",
    "EGAN00001101750", "EGAN00001101751", "EGAN00001101752", "EGAN00001101753",
    "EGAN00001101754", "EGAN00001101755", "EGAN00001101756", "EGAN00001101758",
    "EGAN00001101759", "EGAN00001101761", "EGAN00001101767", "EGAN00001101768",
    "EGAN00001101769", "EGAN00001101771", "EGAN00001101772", "EGAN00001101774",
    "EGAN00001101776", "EGAN00001101780", "EGAN00001101781", "EGAN00001101782",
    "EGAN00001101783", "EGAN00001101784", "EGAN00001101786", "EGAN00001101787",
    "EGAN00001101788", "EGAN00001101791", "EGAN00001101792", "EGAN00001101793",
    "EGAN00001101794", "EGAN00001101796", "EGAN00001101797", "EGAN00001101798",
    "EGAN00001101799", "EGAN00001101801", "EGAN00001101802", "EGAN00001101803",
    "EGAN00001101804", "EGAN00001101807", "EGAN00001101808", "EGAN00001101809",
    "EGAN00001101813", "EGAN00001101814", "EGAN00001101816", "EGAN00001101819",
    "EGAN00001101820", "EGAN00001101823", "EGAN00001101824", "EGAN00001101825",
    "EGAN00001101827", "EGAN00001101829", "EGAN00001101830", "EGAN00001101831",
    "EGAN00001101835", "EGAN00001101839", "EGAN00001101840", "EGAN00001101841",
]

# All samples, in the order used when tabulating per-individual results
INDIVIDUALS = [*EGYPT_SAMPLES, *PAGANI_SAMPLES]

################################################################################
################### Variant stats for the SNPs called by Matthias ##############
################################################################################

rule symlink_var_file:
    input: "/data/lied_egypt_genome/output_wgs/vars.clean.vcf.gz"
    output: "variant_stats/egyptians.vcf.gz"
    shell: "ln -s {input} {output}"

# Matthias performed mapping against the 3366 sequences in the FASTA file 
# /data/lied_egypt_genome/reference/hg38/Homo_sapiens_assembly38.fasta
# followed by variant calling within the intervals listed in 
# /data/lied_egypt_genome/reference/hg38/wgs_calling_regions.hg38.interval_list 
# which are:
# * 194 sequences primary assembly
# * 2,069 https://www.ncbi.nlm.nih.gov/assembly/GCA_000786075.2#/def 
# (https://www.simonsfoundation.org/2013/12/23/simons-genome-diversity-project/)
# * 525 HLA sequences
rule cp_calling_info:
    input: "/data/lied_egypt_genome/reference/hg38/wgs_calling_regions.hg38.interval_list"
    output: protected("variant_stats/wgs_calling_regions.hg38.interval_list")
    shell: "cp {input} {output}"

# Include only sites with all Non-Reference (ALT) Allele Frequencies (af) or 
# Counts (ac) within the range specified, and including the specified value. The
#  default options require all alleles to meet the specified criteria, whereas 
# the options appended with "any" require only one allele to meet the criteria. 
rule num_variants:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: temp("{var_type}_stats/{individual}_num_variants.txt")
    shell: "vcftools --gzvcf {input} " + \
           "         --non-ref-ac-any 1 " + \
           "         --indv {wildcards.individual} " + \
           "         --stdout " + \
           "         --recode " + \
           " | grep -v '#' | wc -l > {output} "
           
rule num_variants_all:
    input: expand("{{var_type}}_stats/{individual}_num_variants.txt", \
                  individual=INDIVIDUALS)
    output: protected("{var_type}_stats/egyptians.insnps")
    run:
        i = 0
        with open(output[0],"w") as f_out:
            f_out.write("INDV\tN_SNPS\n")
            for filename in input:
                with open(filename,"r") as f_in:
                    for line in f_in:
                        f_out.write(INDIVIDUALS[i]+"\t"+line)
                i += 1

rule indv_missingness:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/egyptians.imiss")
    params: prefix="{var_type}_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --missing-indv " + \
           "         --out {params.prefix} " 

rule site_missingness:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/egyptians.lmiss")
    params: prefix="{var_type}_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --missing-site " + \
           "         --out {params.prefix} " 

rule heterozygosity:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/egyptians.het")
    params: prefix="{var_type}_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --het " + \
           "         --out {params.prefix} " 

rule relatedness:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/egyptians.relatedness")
    params: prefix="{var_type}_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --relatedness " + \
           "         --out {params.prefix} "

rule relatedness2:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/egyptians.relatedness2")
    params: prefix="{var_type}_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --relatedness2 " + \
           "         --out {params.prefix} "

rule indel_hist:
    input: "variant_stats/egyptians.vcf.gz"
    output: protected("variant_stats/egyptians.indel.hist")
    params: prefix="variant_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --hist-indel-len " + \
           "         --out {params.prefix} " 

rule indvidual_depth:
    input: "variant_stats/egyptians.vcf.gz"
    output: protected("variant_stats/egyptians.idepth")
    params: prefix="variant_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --depth " + \
           "         --out {params.prefix} " 

rule site_depth:
    input: "variant_stats/egyptians.vcf.gz"
    output: protected("variant_stats/egyptians.ldepth")
    params: prefix="variant_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --site-depth " + \
           "         --out {params.prefix} " 

rule site_depth_mean:
    input: "variant_stats/egyptians.vcf.gz"
    output: protected("variant_stats/egyptians.ldepth.mean")
    params: prefix="variant_stats/egyptians"
    shell: "vcftools --gzvcf {input} " + \
           "         --site-mean-depth " + \
           "         --out {params.prefix} " 

rule variants_per_chrom:
    input: "{var_type}_stats/egyptians.vcf.gz"
    output: protected("{var_type}_stats/variants_per_chrom.txt")
    shell: "zcat {input} | grep -v '#' | cut -f 1 | uniq -c > {output}"

rule variant_stats:
    input: expand("variant_stats/egyptians.{stat}", stat=["imiss","lmiss", \
                                            "het","relatedness","relatedness2", \
                                            "indel.hist","idepth", "ldepth", \
                                            "ldepth.mean","insnps"])

rule variant_stats_boxplots:
    input: "variant_stats/egyptians.insnps", "variant_stats/egyptians.imiss",
           "variant_stats/egyptians.idepth", "variant_stats/egyptians.het",
           "variant_stats/egyptians.indel.hist", 
           "variant_stats/variants_per_chrom.txt"
    output: "variant_stats/figures/variant_stats_boxplots.pdf",
            "variant_stats/figures/variant_stats_missing_vs_het.pdf",
            "variant_stats/figures/variant_stats_corplot.pdf",
            "variant_stats/figures/variant_stats_indel_hist.pdf",
            "variant_stats/figures/variant_stats_var_per_chrom.pdf"
    script: "scripts/variant_stats_boxplots.R"

################################################################################
##### Population stratification analysis using Eigenstrat (e.g. PC plots) ######
################################################################################

# Downloading 1000 genomes data
rule download_1000g_genotypes:
    output: "1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz"
    shell: "wget -P 1000_genomes/ http://ftp.1000genomes.ebi.ac.uk/vol1/" + \
                                  "ftp/release/20130502/supporting/" + \
                                  "GRCh38_positions/" + \
                                  "ALL.chr{wildcards.chr}_GRCh38.genotypes.20170504.vcf.gz"

# Downloading 1000 genomes data (index)
rule download_1000g_genotypes_index:
    output: "1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz.tbi"
    shell: "wget -P 1000_genomes/ http://ftp.1000genomes.ebi.ac.uk/vol1/" + \
                                  "ftp/release/20130502/supporting/" + \
                                  "GRCh38_positions/" + \
                                  "ALL.chr{wildcards.chr}_GRCh38.genotypes.20170504.vcf.gz.tbi"

# Downloading 1000 genomes data (Readme)
rule download_1000g_genotypes_readme:
    output: "1000_genomes/README_GRCh38_liftover_20170504.txt"
    shell: "wget -P 1000_genomes/ http://ftp.1000genomes.ebi.ac.uk/vol1/" + \
                                  "ftp/release/20130502/supporting/" + \
                                  "GRCh38_positions/" + \
                                  "README_GRCh38_liftover_20170504.txt"

# Get the ped file which contains the population of the samples (and more info)
rule download_1000g_genotypes_ped:
    output: "1000_genomes/integrated_call_samples_v2.20130502.ALL.ped"
    shell: "wget -P 1000_genomes/ http://ftp.1000genomes.ebi.ac.uk/vol1/" + \
                                  "ftp/release/20130502/" + \
                                  "integrated_call_samples_v2.20130502.ALL.ped"

rule download_1000g_genotypes_all:
    input: expand("1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz", \
                   chr=[str(x) for x in range(1,23)]+["X","Y"]), \
           expand("1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz.tbi", \
                   chr=[str(x) for x in range(1,23)]+["X","Y"]), \
           "1000_genomes/README_GRCh38_liftover_20170504.txt", \
           "1000_genomes/integrated_call_samples_v2.20130502.ALL.ped"

# Selecting 1000G individuals for inclusion in Stratification analysis
# We select individuals belonging to populations of interest and which
# have phase3 genotypes available (this is column 14) 
# ACB: African Caribbeans in Barbados
# ASW: Americans of African Ancestry in SW USA 	
# CEU: Utah Residents (CEPH) with Northern and Western European Ancestry
# ESN: Esan in Nigeria
# FIN: Finnish in Finland
# GBR: British in England and Scotland
# GWD: Gambian in Western Divisions in the Gambia
# IBS: Iberian Population in Spain
# LWK: Luhya in Webuye, Kenya
# MSL: Mende in Sierra Leone
# TSI: Toscani in Italia
# YRI: Yoruba in Ibadan, Nigeria
# Populations grouped by the super-population label used in the annotation
POPULATIONS_AFR = ["ACB", "ASW", "ESN", "GWD", "LWK", "MSL", "YRI"]
POPULATIONS_EUR = ["CEU", "FIN", "GBR", "IBS", "TSI"]
# All selected populations, in alphabetical order
POPULATIONS_1000G = sorted(POPULATIONS_AFR + POPULATIONS_EUR)
rule gp_select_pop_from_1000g:
    input: "1000_genomes/integrated_call_samples_v2.20130502.ALL.ped"
    output: "genotype_pcs/keep_indiv.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                s = line.split("\t")
                if s[6] in POPULATIONS_1000G and s[13] == "1":
                    f_out.write(s[1]+"\n")

# Make an annotation file with sample names and population for plotting
rule gp_make_pop_annotation:
    input: "1000_genomes/integrated_call_samples_v2.20130502.ALL.ped"
    output: "genotype_pcs/annotation_EGYPT_AFR_EUR_GRCh38.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            # First write the egyptians
            for egyptian in EGYPT_SAMPLES:
                # PD114, PD115, PD82 are from Upper Egypt
                if egyptian[:2] == "PD":
                    f_out.write(egyptian+"\tEGU\tEGY\n")
                # LU18, LU19, LU2, LU22. LU23, LU9, and Egyptref are from Delta
                else:
                    f_out.write(egyptian+"\tEGD\tEGY\n")
            # Then Egyptians from Pagani et al.
            for egyptian in PAGANI_SAMPLES:
                f_out.write(egyptian+"\tEGP\tEGY\n")
            # Then the 1000G samples
            for line in f_in:
                s = line.split("\t")
                pop = s[6]
                if pop in POPULATIONS_1000G and s[13] == "1":
                    if pop in POPULATIONS_AFR:
                        f_out.write(s[1]+"\t"+pop+"\tAFR\n")
                    elif pop in POPULATIONS_EUR:
                        f_out.write(s[1]+"\t"+pop+"\tEUR\n")

# Selecting from the VCF files those individuals that are to be used
# Keeping only variants with at last 5% MAF
# Keeping only variants not violating Hardy-Weinberg-Equilibrium
# Keeping only bi-allelic variants (min-allele = max-allele = 2)
rule gp_select_1000g_individual_genotypes:
    input: "1000_genomes/ALL.chr{chr}_GRCh38.genotypes.20170504.vcf.gz",
           "genotype_pcs/keep_indiv.txt"
    output: "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--keep {input[1]} " + \
                    "--min-alleles 2 " + \
                    "--max-alleles 2 " + \
                    "--maf 0.05 " + \
                    "--hwe 0.000001 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output[0]}"

# Compressing and indexing of files to be used with vcf-merge
rule gp_index_1000g:
    input: "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz"
    output: "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz.tbi"
    conda: "envs/genotype_pcs.yaml"
    shell: "tabix -p vcf {input}"

# Getting the list of SNPs for genotype PCs from the 1000 Genomes samples
rule gp_get_1000g_snps:
    input: "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz"
    output: "genotype_pcs/snps_chr{chr}.txt"
    shell: "zcat {input} | grep -v '#' | cut -f 1,2 > {output}"

# Symlink variant file for genotype pc computation
rule gp_symlink_var_file:
    input: "/data/lied_egypt_genome/output_wgs/vars.clean.vcf.gz"
    output: "genotype_pcs/egyptians.vcf.gz"
    shell: "ln -s {input} {output}"

# Split Egyptian vcf file chromosome-wise; 
# further, select SNPs called in all Egyptian samples (--max-missing), i.e. 
# all genotypes ara available
rule gp_split_vcf_chromosomewise:
    input: "genotype_pcs/egyptians.vcf.gz"
    output: "genotype_pcs/egyptians.chromosome.{chr}.vcf"
    conda: "envs/genotype_pcs.yaml"
    params: log_base=lambda wildcards, output: output[0][:-4]
    shell: "vcftools --gzvcf {input} " + \
                    "--chr chr{wildcards.chr} " + \
                    "--max-missing 1 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout > {output[0]}"

# Remove the "chr" from the chromosome names since the SNP calling from Matthias
# has chr1, chr2,... instead of 1,2,..., which is given in 1000 genomes files
rule gp_remove_chr:
    input: "genotype_pcs/egyptians.chromosome.{chr}.vcf"
    output: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz"
    conda: "envs/genotype_pcs.yaml"
    shell: "cat {input} " + \
           "| sed 's/^chr{wildcards.chr}/{wildcards.chr}/g' " + \
           "| bgzip > {output} "

# Here, we select from the SNPs called for the egyptians those, which are also
# kept from the 1000 genomes samples, i.e. 5% MAF, HWE, bi-allelic
rule gp_select_matching_egyptian_snps:
    input: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz",
           "genotype_pcs/snps_chr{chr}.txt"
    output: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--positions {input[1]} " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output[0]}"

# Compressing and indexing of files to be used with vcf-merge
rule gp_index_egyptians:
    input: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz"
    output: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz.tbi"
    conda: "envs/genotype_pcs.yaml"
    shell: "tabix -p vcf {input}"

# Merging the vcf-files of 1000 genomes with our SNP calls for the egyptians
rule gp_merge_1000g_with_egyptians:
    input: "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz",
           "genotype_pcs/egyptians.chromosome.{chr}.vcf.gz.tbi",
           "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz",
           "genotype_pcs/AFR_EUR.chr{chr}_GRCh38.vcf.gz.tbi"
    output: "genotype_pcs/EGYPT_AFR_EUR.chr{chr}_GRCh38.vcf.gz"
    conda: "envs/genotype_pcs.yaml"
    shell: "vcf-merge {input[0]} {input[2]} | bgzip > {output[0]}"

rule gp_merge_1000g_with_egyptians_all:
    input: expand("genotype_pcs/EGYPT_AFR_EUR.chr{chr}_GRCh38.vcf.gz", \
                   chr=[str(x) for x in range(1,23)])

# Concatenate the vcf file from several chromosomes
# --pad-missing: Write '.' in place of missing columns. Useful for joining chrY 
# with the rest.
rule gp_concatenate_chr_vcfs:
    input: expand("genotype_pcs/EGYPT_AFR_EUR.chr{chr}_GRCh38.vcf.gz", \
                   chr=[str(x) for x in range(1,23)])
    output: "genotype_pcs/EGYPT_AFR_EUR_GRCh38.vcf.gz"
    conda: "envs/genotype_pcs.yaml"
    shell: "vcf-concat --pad-missing {input} | bgzip > {output}"

# Filtering egyptian only variants
# Additional to maf and number of alleles, we also exclude all SNPs with 
# missing data here, since we have only 10 individuals
# --max-missing <float>: Exclude sites on the basis of the proportion of missing
#                        data (defined to be between 0 and 1, where 0 allows 
#                        sites that are completely missing and 1 indicates no 
#                        missing data allowed).
rule gp_filter_for_egyptian_only_pcs:
    input: "genotype_pcs/egyptians.vcf.gz"
    output: "genotype_pcs/EGYPT_GRCh38.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--min-alleles 2 " + \
                    "--max-alleles 2 " + \
                    "--maf 0.05 " + \
                    "--hwe 0.000001 " + \
                    "--max-missing 1 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output[0]}"

rule gp_vcf_wo_missing_genotypes:
    input: "genotype_pcs/{set}_GRCh38.vcf.gz"
    output: "genotype_pcs/{set}_GRCh38_wo_missing.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--max-missing 1 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output[0]}"

# Converting vcf files to plink binary format (bed/bim/fam) for preparing for
# Eigenstrat analysis
rule gp_vcf_to_plink:
    input: "genotype_pcs/{set}_GRCh38_wo_missing.vcf.gz"
    output: "genotype_pcs/plink/{set}_GRCh38.bed",
            "genotype_pcs/plink/{set}_GRCh38.bim",
            "genotype_pcs/plink/{set}_GRCh38.fam"
    params: out_base=lambda wildcards, output: output[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "plink2 --vcf {input} " + \
                  "--make-bed " + \
                  "--out {params.out_base}"

# Removal of regions of high LD and/or known inversions from Abraham 2014, 
# i.e. Fellay 2009:
# chr6:25 Mb–33.5 Mb, (see also Wang 2009)
# chr5:44 Mb–51.5 Mb, chr8:8 Mb–12 Mb, chr11:45 Mb–57 Mb
# Therefore, make lists of SNPs in the respective regions to be removed,
# Then: Concatenate all the SNPs to be removed
rule gp_find_snps_from_high_ld_regions:
    input: "genotype_pcs/plink/{set}_GRCh38.bed", 
           "genotype_pcs/plink/{set}_GRCh38.bim",
           "genotype_pcs/plink/{set}_GRCh38.fam"
    output: "genotype_pcs/plink/{set}_GRCh38_6_25-33.5.snplist",
            "genotype_pcs/plink/{set}_GRCh38_5_44-51.5.snplist",
            "genotype_pcs/plink/{set}_GRCh38_8_8-12.snplist",
            "genotype_pcs/plink/{set}_GRCh38_11_45-57.snplist",
            "genotype_pcs/plink/{set}_GRCh38_exclusion.snplist" 
    params: in_base = lambda wildcards, input: input[0][:-4],
            chr6_base = lambda wildcards, output: output[0][:-8],
            chr5_base = lambda wildcards, output: output[1][:-8],
            chr8_base = lambda wildcards, output: output[2][:-8],
            chr11_base = lambda wildcards, output: output[3][:-8]
    conda: "envs/genotype_pcs.yaml"
    shell: 
        "plink2 --bfile {params.in_base} " + \ 
               "--chr 6 " + \
               "--from-mb 25 " + \
               "--to-mb 33.5 " + \
               "--write-snplist " + \
               "--out {params.chr6_base}; " + \
        "plink2 --bfile {params.in_base} " + \ 
               "--chr 5 " + \
               "--from-mb 44 " + \
               "--to-mb 51.5 " + \
               "--write-snplist " + \
               "--out {params.chr5_base}; " + \
        "plink2 --bfile {params.in_base} " + \ 
               "--chr 8 " + \
               "--from-mb 8 " + \
               "--to-mb 12 " + \
               "--write-snplist " + \
               "--out {params.chr8_base}; " + \
        "plink2 --bfile {params.in_base} " + \ 
               "--chr 11 " + \
               "--from-mb 45 " + \
               "--to-mb 57 " + \
               "--write-snplist " + \
               "--out {params.chr11_base}; " + \
        "cat {output[0]} {output[1]} {output[2]} {output[3]}  > {output[4]} "

# Now exclude the SNPs from these regions
rule gp_exclude_snps_from_high_ld_regions:
    input: "genotype_pcs/plink/{set}_GRCh38.bed", 
           "genotype_pcs/plink/{set}_GRCh38.bim",
           "genotype_pcs/plink/{set}_GRCh38.fam",
           "genotype_pcs/plink/{set}_GRCh38_exclusion.snplist"
    output: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bed", 
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bim",
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.fam"
    params: in_base = lambda wildcards, input: input[0][:-4],
            out_base = lambda wildcards, output: output[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "plink2 --bfile {params.in_base} " + \
                  "--exclude {input[3]} " + \
                  "--make-bed " + \
                  "--out {params.out_base} "

# LD prune the PLINK files; therefore, first make a list of SNPs in LD (and not 
# in LD)(i.e. to be removed or not)
# Parameters for indep-pairwise: <window size>['kb'] <step size (variant ct)> 
# <r^2 threshold>
# Explanation (Plink website): the command above that specifies 50 5 0.5 would 
# a) consider a window of 50 SNPs, 
# b) calculate LD between each pair of SNPs in the window, 
# c) remove one of a pair of SNPs if the LD is greater than 0.5, 
# d) shift the window 5 SNPs forward and repeat the procedure
# Abraham 2014 used: 1000 10 0.02
# Anderson 2010 used: 50 5 0.2
# Wang 2009 used: 100 ? 0.2
# Fellay 2009 used: 1500 150 0.2 
# Watch out: Before, the LD pruned SNPs (to be kept, to be pruned out) were 
# always taken from the entire set of Egyptian, European and African samples we 
# have; there we use as LD pruning parameter: 1000 10 0.2 because we only had 
# 10 Egyptians which was insufficient for LD pruning. Now we have 110 Egyptians
# and we will perform LD pruning for the Egyptian set as well 
rule gp_find_ld_pruned_snps:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bed", 
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bim",
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.fam"
    output: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.prune.in", 
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.prune.out"
    params: in_base = lambda wildcards, input: input[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "plink2 --bfile {params.in_base} " + \
                  "--indep-pairwise 1000 10 0.2 " + \
                  "--out {params.in_base} "

# Now exclude the pruned SNPs
# Before, we used the same set of pruned SNPs, those from the entire 
# Egypt/Eur/Afr samples also for other subsets, because the LD pruning will not 
# work well for small numbers of samples
# "--exclude {input[3]} " + \ in case the file is 
# genotype_pcs/plink/EGYPT_AFR_EUR_GRCh38_wo_ldregions.prune.out
# Now that we have 110 Egyptians, we perform LD pruning for this set of 110 
# individuals
rule gp_exclude_ld_pruned_snps:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bed", 
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.bim",
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.fam",
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions.prune.in"
    output: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.bed", 
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.bim",
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.fam"
    params: in_base = lambda wildcards, input: input[0][:-4],
            out_base = lambda wildcards, output: output[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "plink2 --bfile {params.in_base} " + \
                  "--extract {input[3]} " + \
                  "--make-bed " + \
                  "--out {params.out_base}"

# Change missing phenotype to population ID
rule gp_missing_phenotype_to_population:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.fam",
           "genotype_pcs/annotation_EGYPT_AFR_EUR_GRCh38.txt"
    output: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned_phenotype.fam"
    run:
        sample_to_pop = {}
        pop_to_num = {"EGY": "2", "EUR": "3", "AFR": "4"}
        with open(input[1],"r") as f_annotation:
            for line in f_annotation:
                s = line.strip().split("\t")
                sample_to_pop[s[0]] = s[2]
        with open(input[0],"r") as f_fam, open(output[0],"w") as f_out:
            for line in f_fam:
                s = line.split(" ")
                num_pop = pop_to_num[sample_to_pop[s[0]]]
                f_out.write(" ".join(s[:-1])+" "+num_pop+"\n")

# Conversion from bed/bim/fam to ped/map
rule gp_convert_to_ped_map:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.bed", 
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.bim",
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned_phenotype.fam"
    output: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.ped", 
            "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.map"
    params: in_base = lambda wildcards, input: input[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "plink2 --bfile {params.in_base} " + \
                  "--fam {input[2]} " + \
                  "--recode " + \
                  "--out {params.in_base} "

# Write the parameter file needed by the Eigensoft convertf program
rule gp_eigentstrat_parameter_file:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.ped", 
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.map"
    output: "genotype_pcs/eigenstrat/{set}_GRCh38.ped2eigenstrat.params",
    params: gout="genotype_pcs/eigenstrat/{set}_GRCh38.eigenstratgeno",
            sout="genotype_pcs/eigenstrat/{set}_GRCh38.snp",
            iout="genotype_pcs/eigenstrat/{set}_GRCh38.ind"
    run: 
        with open(output[0],"w") as f_out:
            f_out.write("genotypename:    "+input[0]+"\n")
            f_out.write("snpname:         "+input[1]+"\n") 
            f_out.write("indivname:       "+input[0]+"\n")
            f_out.write("outputformat:    EIGENSTRAT\n")
            f_out.write("genotypeoutname: "+params.gout+"\n")
            f_out.write("snpoutname:      "+params.sout+"\n")
            f_out.write("indivoutname:    "+params.iout+"\n")
            f_out.write("familynames:     NO\n")

# This is the actual conversion from ped format to the eigenstrat input format
rule gp_ped_to_eigentstrat:
    input: "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.ped", 
           "genotype_pcs/plink/{set}_GRCh38_wo_ldregions_pruned.map",
           "genotype_pcs/eigenstrat/{set}_GRCh38.ped2eigenstrat.params"
    output: "genotype_pcs/eigenstrat/{set}_GRCh38.eigenstratgeno",
            "genotype_pcs/eigenstrat/{set}_GRCh38.snp",
            "genotype_pcs/eigenstrat/{set}_GRCh38.ind"
    conda: "envs/genotype_pcs.yaml"
    shell: "convertf -p {input[2]}"

# Running Eigensofts smartpca module which computes the population PCs
# The smartpca parameters:
# -i example.geno  : genotype file in any format (see ../CONVERTF/README)
# -a example.snp   : snp file in any format (see ../CONVERTF/README)
# -b example.ind   : indiv file in any format (see ../CONVERTF/README)
# -k k             : (Default is 10) number of principal components to output
# -o example.pca   : output file of principal components.  Individuals removed
#                    as outliers will have all values set to 0.0 in this file.
# -p example.plot  : prefix of output plot files of top 2 principal components.
#                    (labeling individuals according to labels in indiv file)
# -e example.eval  : output file of all eigenvalues
# -l example.log   : output logfile
# -m maxiter       : (Default is 5) maximum number of outlier removal iterations.
#                    To turn off outlier removal, set -m 0.
# -t topk          : (Default is 10) number of principal components along which 
#                    to remove outliers during each outlier removal iteration.
# -s sigma         : (Default is 6.0) number of standard deviations which an
#                    individual must exceed, along one of topk top principal
# 		             components, in order to be removed as an outlier.
rule gp_smartpca_parameter_file:
    input: "genotype_pcs/eigenstrat/{set}_GRCh38.eigenstratgeno",
           "genotype_pcs/eigenstrat/{set}_GRCh38.snp",
           "genotype_pcs/eigenstrat/{set}_GRCh38.ind"
    output: "genotype_pcs/eigenstrat/{set}_GRCh38.smartpca.params"
    params: evec="genotype_pcs/eigenstrat/{set}_GRCh38.pca.evec",
            eval="genotype_pcs/eigenstrat/{set}_GRCh38.eval",
            iout="genotype_pcs/eigenstrat/{set}_GRCh38.ind"
    run: 
        with open(output[0],"w") as f_out:
            f_out.write("genotypename:    "+input[0]+"\n")
            f_out.write("snpname:         "+input[1]+"\n") 
            f_out.write("indivname:       "+input[2]+"\n")
            f_out.write("evecoutname:     "+params.evec+"\n")
            f_out.write("evaloutname:     "+params.eval+"\n")
            f_out.write("numoutevec: 10\n")
            f_out.write("numoutlieriter: 0\n")

rule gp_eigensoft_smartpca:
    input: "genotype_pcs/eigenstrat/{set}_GRCh38.eigenstratgeno",
           "genotype_pcs/eigenstrat/{set}_GRCh38.snp",
           "genotype_pcs/eigenstrat/{set}_GRCh38.ind",
           "genotype_pcs/eigenstrat/{set}_GRCh38.smartpca.params"
    output: "genotype_pcs/eigenstrat/{set}_GRCh38.eval",
            "genotype_pcs/eigenstrat/{set}_GRCh38.log",
            "genotype_pcs/eigenstrat/{set}_GRCh38.pca.evec"
    conda: "envs/genotype_pcs.yaml"
    shell: "smartpca -p {input[3]} > {output[1]}"

# Computing the Tracy-Widom statistics to evaluate the statistical 
# significance of each principal component identified by pca
rule gp_tracy_widom_pval:
    input: "genotype_pcs/eigenstrat/{set}_GRCh38.eval",
           "data/misc/twtable"
    output: "genotype_pcs/eigenstrat/{set}_GRCh38.tw"
    conda: "envs/genotype_pcs.yaml"
    shell: "twstats -i {input[0]} " + \
                   "-t {input[1]} " + \
                   "-o {output[0]} "

# Plotting the PCs
rule gp_plot_gt_pcs:
    input: "genotype_pcs/eigenstrat/EGYPT_AFR_EUR_GRCh38.pca.evec",
           "genotype_pcs/annotation_EGYPT_AFR_EUR_GRCh38.txt"
    output: "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_1vs2.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_1vs3.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_1vs4.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_2vs3.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_2vs4.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_3vs4.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_scree_plot.pdf",
            "genotype_pcs/figures/EGYPT_AFR_EUR_GRCh38_pca_3d.pdf"
    params: out_path = "genotype_pcs/figures/"
    conda: "envs/genotype_pcs.yaml"
    script: "scripts/plot_gt_pcs.R"

rule gp_plot_gt_pcs_egyptians:
    input: "genotype_pcs/eigenstrat/EGYPT_GRCh38.pca.evec",
           "genotype_pcs/annotation_EGYPT_AFR_EUR_GRCh38.txt"
    output: "genotype_pcs/figures/EGYPT_GRCh38_pca_1vs2.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_1vs3.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_1vs4.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_2vs3.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_2vs4.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_3vs4.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_scree_plot.pdf",
            "genotype_pcs/figures/EGYPT_GRCh38_pca_3d.pdf"
    params: out_path = "genotype_pcs/figures/"
    conda: "envs/genotype_pcs.yaml"
    script: "scripts/plot_gt_pcs_egyptians.R"

rule gp_genotype_pcs_all:
    input: expand("genotype_pcs/figures/{set}_GRCh38_pca_1vs2.pdf", \
                  set=["EGYPT_AFR_EUR","EGYPT"]),
           expand("genotype_pcs/eigenstrat/{set}_GRCh38.tw", \
                  set=["EGYPT_AFR_EUR","EGYPT"]),


################################################################################
############ Dealing with annotated variants and filtering them ################
################################################################################

rule av_symlink_annovar_annotation:
    input: "/data/lied_egypt_genome/axel/Annovar/final.annotation.txt"
    output: "annovar_annotation/egyptians_annovar_annotated.txt"
    shell: "ln -s {input} {output}"

# This counts occurrences in the column "Func_refGene"
rule av_location_annovar_anotated_vars:
    input: "annovar_annotation/egyptians_annovar_annotated.txt"
    output: "annovar_annotation/count_location.txt"
    shell: "cat {input} | sort | uniq | cut -f 6 | sort | uniq -c > {output}"

# Rhis counts occurrences in the column "ExonicFunc_refGene"
rule av_exonic_effect_annovar_anotated_vars:
    input: "annovar_annotation/egyptians_annovar_annotated.txt"
    output: "annovar_annotation/count_exonic_effect.txt"
    shell: "cat {input} | sort | uniq | cut -f 9 | sort | uniq -c > {output}"   

# Here, first filter variants that are exonic and rare according to no (="NA")
# or according to low gnomAD_exome_ALL
rule av_filter_exonic_rare_variants:
    input: "annovar_annotation/egyptians_annovar_annotated.txt"
    output: "annovar_annotation/egyptians_annovar_annotated_exonic_rare.txt"
    run:
        with open (input[0],"r") as f_in, open (output[0],"w") as f_out:
            for line in f_in:
                s = line.split("\t")
                if line[:3] == "Chr":
                    assert(s[5] == "Func_refGene")
                    assert(s[17] == "gnomAD_exome_ALL")
                    f_out.write(line)
                    continue
                function = s[5]
                gnomAD_exome = s[17]
                if function == "exonic":
                    if gnomAD_exome in ["NA","."]:
                        f_out.write(line)
                    elif float(gnomAD_exome) <= 0.001:
                        f_out.write(line)

# Filter variants with CADD score greater 20
rule av_filter_deleterious:
    input: "annovar_annotation/egyptians_annovar_annotated_exonic_rare.txt"
    output: "annovar_annotation/egyptians_annovar_annotated_exonic_rare_deleterious.txt"
    run:
        with open (input[0],"r") as f_in, open (output[0],"w") as f_out:
            for line in f_in:
                s = line.split("\t")
                if line[:3] == "Chr":
                    assert s[77] == "CADD_phred"
                    f_out.write(line)
                    continue
                cadd_phred = s[77]
                if cadd_phred in ["NA","."]:
                    continue
                if float(cadd_phred) >= 20:
                        f_out.write(line)

# Get chromosome and position of the SNPs; this is better than selecting by rsid
# because if there are Egyptian SNPs that have no rsid, then they will not be
# selected from the vcf File later. I checked (using 
# "cat annovar_annotation/egyptians_annovar_annotated_exonic_rare.txt  | 
# cut -f 1,2 | uniq -c | grep -v '  1 ' | wc -l") that there are only 13 
# positions occurring more than once, so this is neglect-able
rule av_get_snp_ids:
    input: "annovar_annotation/egyptians_annovar_annotated_exonic_rare.txt"
    output: "annovar_annotation/snppos_egyptians_annovar_annotated_exonic_rare.txt"
    shell: "cat {input} | tail -n +2 | cut -f 1,2 > {output}"

# Symlinking the VCF file with Egyptian SNP calling
rule av_symlink_var_file:
    input: "/data/lied_egypt_genome/output_wgs/vars.clean.vcf.gz"
    output: "annovar_annotation/egyptians.vcf.gz"
    shell: "ln -s {input} {output}"

# We want to select here the SNPs that are exonic and rare BUT are common (i.e.
# more than 5%) in Egyptians 
rule av_get_egyptian_common_variants:
    input: "annovar_annotation/egyptians.vcf.gz",
           "annovar_annotation/snppos_egyptians_annovar_annotated_exonic_rare.txt"
    output: "annovar_annotation/egyptians_exonic_rare_egyptiancommon.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} "
                    "--positions {input[1]} " + \
                    "--mac 3 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output}"

rule av_get_positions_exonic_rare_egyptiancommon:
    input: "annovar_annotation/egyptians_exonic_rare_egyptiancommon.vcf.gz"
    output: "annovar_annotation/snppos_egyptians_annovar_annotated_exonic_rare_egyptiancommon.txt"
    shell: "zcat {input} | grep -v '#' | cut -f 1,2 > {output}"

rule av_get_annotations_exonic_rare_egyptiancommon:
    input: "annovar_annotation/egyptians_annovar_annotated.txt",
           "annovar_annotation/snppos_egyptians_annovar_annotated_exonic_rare_egyptiancommon.txt"
    output: "annovar_annotation/egyptians_annovar_annotated_exonic_rare_egyptiancommon.txt"
    run:
        pos = {}
        with open(input[1],"r") as f_pos:
            for line in f_pos:
                pos[line.strip()] = True
        with open(input[0],"r") as f_anno, open(output[0],"w") as f_out:
            for line in f_anno:
                s = line.split("\t")
                if s[0]+"\t"+s[1] in pos:
                    f_out.write(line)

rule av_exonic_effect_exonic_rare_egyptiancommon:
    input: "annovar_annotation/egyptians_annovar_annotated_exonic_rare_egyptiancommon.txt"
    output: "annovar_annotation/count_exonic_effect_exonic_rare_egyptiancommon.txt"
    shell: "cat {input} | sort | uniq | cut -f 9 | sort | uniq -c > {output}"  


################################################################################
############ Dealing with VEP annotated variants and filtering them ############
################################################################################

# This is linking to the file annotated by Matthias using VEP 
rule symlink_snv_file:
    input: "/data/lied_egypt_genome/output_wgs/vep/vep.vcf.gz"
    output: "vep_annotation/vep.vcf.gz"
    shell: "ln -s {input} {output}"

# Select common variants in the Egyptian cohort for further consideration. 
# These should be 
# * biallelic (otherwise gets too complicated) (--min-alleles/--max-alleles 2)
# * no indels (also too complicated) (--remove-indels)
# * no missing genotypes (--max-missing)
# * have minor allel frequency of 5% (--maf)
# These are the variants for further characterization concerning population
# frequencies
rule extract_egyptian_common_variants:
    input: "vep_annotation/vep.vcf.gz"
    output: "vep_annotation/vep_egyptian_common.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input} " + \
                    "--min-alleles 2 " + \
                    "--max-alleles 2 " + \
                    "--remove-indels " + \
                    "--remove-indels " + \
                    "--max-missing 1 " + \
                    "--maf 0.05 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output}"

rule compute_egyptian_af:
    input: "vep_annotation/vep_egyptian_common.vcf.gz"
    output: "vep_annotation/egyptian_af.frq"
    params: out_base=lambda wildcards, output: output[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input} --freq --out {params.out_base}"

# Here, we extract the SNP information and for every SNP all annotated allele
# frequencies; these are
# CHROM,POS,ID,REF,ALT: VCF file fields
# Add allele frequency from continental populations (AFR,AMR,EAS,EUR,SAS) of 
# 1000 Genomes Phase 3: AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF
# Include allele frequency from Genome Aggregation Database (gnomAD) exome 
# populations. Note only data from the gnomAD exomes are included:
# gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF
# Report the highest allele frequency observed in any population from 1000 
# genomes, ESP or gnomAD: MAX_AF, MAX_AF_POPS
# Column names of the allele frequency table, in output order
AF_FIELDS = [
    # VCF file fields
    "CHROM","POS","ID","REF","ALT",
    # Overall, continental (AFR..SAS) and AA/EA allele frequencies
    "AF","AFR_AF","AMR_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF",
    # gnomAD allele frequencies
    "gnomAD_AF","gnomAD_AFR_AF","gnomAD_AMR_AF","gnomAD_ASJ_AF",
    "gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF",
    "gnomAD_SAS_AF",
    # Highest allele frequency and the populations it was observed in
    "MAX_AF","MAX_AF_POPS",
]
# Tab-separated header line built from the column names
AF_HEADER = "\t".join(AF_FIELDS)
rule annotated_af_by_population:
    input: "vep_annotation/vep_egyptian_common.vcf.gz"
    output: "vep_annotation/af_annotated.txt"
    shell: "echo \"{AF_HEADER}\" > {output[0]}; " + \
           "bcftools query -f'[%CHROM\t%POS\t%ID\t%REF\t%ALT\t%INFO/CSQ[0]\n]' {input} " + \
           "| cut -d ',' -f 1 " + \
           "| sed \"s/|/\t/g\" " + \
           "| cut -f 1-5,49-67 >> {output}; "

rule extract_egyptian_af:
    input: "vep_annotation/egyptian_af.frq"
    output: "vep_annotation/egyptian_af_only.frq"
    shell: "echo \"EGP_AF\" > {output}; " + \
           "tail -n+2 {input} | cut -d ':' -f 3 | grep -v \"\\n\" >> {output}"

rule add_egyptian_af_to_af_annotation:
    input: "vep_annotation/af_annotated.txt",
           "vep_annotation/egyptian_af_only.frq"
    output: "vep_annotation/egyptian_af_annotated.txt"
    shell: "paste {input} > {output}"

rule select_population_specific_variants:
    input: "vep_annotation/egyptian_af_annotated.txt"
    output: "vep_annotation/egyptian_pop_specific.txt"
    run:
        pops_1000g = ["AFR","AMR","EAS","EUR","SAS"]
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                # Write updated header
                if line[:5] == "CHROM":
                    f_out.write(line.strip("\n")+"\tSPECIFIC_POP"+"\n")
                    continue
                s = line.strip("\n").split("\t")
                # Get 1000g population maf needed
                af = s[5]
                afs_1000g = s[6:11]
                # If fields are empty, these variants don't occur in 1000g
                # and we set afs to zero
                if afs_1000g[0] == "":
                    # Make sure there is no info for any 1000g pop
                    assert(afs_1000g == ["" for i in range(5)])
                    # And set afs to zero
                    afs_1000g = [0 for i in range(5)]
                afs_1000g = [float(x) for x in afs_1000g]
                # If the overall allele frequency is not given, approximate it
                # by computing an average allele frequency over the 5 pops
                # It seems this happens for SNPs that are multi-allelic 
                # according to Ensembl; if so, the population frequencies
                # are correctly annotated for the respective allele and 
                # approximating with them seems OK
                if af == "":
                    af = 0.2*sum([float(x) for x in afs_1000g])
                else:
                    af = float(af)
                # Just check that all variants in this list are in fact common
                # in Egyptians (MAF>5%)
                af_egyptian = float(s[-1])
                # If overall af is greater than 0.5 then the reference allele
                # is the minor allele and we compute all allele frequencies
                # as 1 minus the specified allele frequency
                if af>0.5:
                    afs_1000g = [1.0-pop_af for pop_af in afs_1000g]
                    af_egyptian = 1.0-af_egyptian
                # Just check that all variants in this list are in fact common
                # in Egyptians (MAF>5%)
                assert(af_egyptian>=0.05)
                # Check if the current SNP is population-specific for any of the 
                # 5 continental populations
                for i in range(5):
                    # If not common in pop, continue
                    if afs_1000g[i]<0.05:
                        continue
                    num_pop_rare = 0
                    for j in range(5):
                        if j == i:
                            continue
                        if afs_1000g[j]<=0.01:
                            num_pop_rare += 1
                    if num_pop_rare == 4:
                        f_out.write(line.strip("\n")+"\t"+pops_1000g[i]+"\n")
                # Check if the current SNP is Egyptian population-specific
                for i in range(5):
                    # If not common in pop, continue
                    if not afs_1000g[i]<=0.01:
                        break
                    if i == 4:
                        f_out.write(line.strip("\n")+"\tEGP\n")

rule snp_pos_for_pop_specific_vars:
    input: "vep_annotation/egyptian_pop_specific.txt"
    output: "vep_annotation/egyptian_pop_specific_pos.txt"
    shell: "cat {input} | cut -f 1,2 > {output}"

rule select_pop_specific_vars:
    input: "vep_annotation/vep_egyptian_common.vcf.gz",
           "vep_annotation/egyptian_pop_specific_pos.txt"
    output: "vep_annotation/vep_egyptian_popspecific.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--positions {input[1]} " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output}"

# Column names of the full annotation table: the five VCF columns followed by
# the names used for the "|" separated annotation fields, in output order
ANNO_FIELDS = [
    "CHROM","POS","ID","REF","ALT",
    "Allele","Consequence","IMPACT","SYMBOL","Gene","Feature_type",
    "Feature","BIOTYPE","EXON","INTRON","HGVSc","HGVSp","cDNA_position",
    "CDS_position","Protein_position","Amino_acids","Codons",
    "Existing_variation","DISTANCE","STRAND","FLAGS","VARIANT_CLASS",
    "SYMBOL_SOURCE","HGNC_ID","CANONICAL","TSL","APPRIS","CCDS","ENSP",
    "SWISSPROT","TREMBL","UNIPARC","REFSEQ_MATCH","SOURCE","GIVEN_REF",
    "USED_REF","BAM_EDIT","GENE_PHENO","NEAREST","SIFT","PolyPhen",
    "DOMAINS","HGVS_OFFSET",
    "AF","AFR_AF","AMR_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF",
    "gnomAD_AF","gnomAD_AFR_AF","gnomAD_AMR_AF","gnomAD_ASJ_AF",
    "gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF",
    "gnomAD_SAS_AF",
    "MAX_AF","MAX_AF_POPS",
    "CLIN_SIG","SOMATIC","PHENO","PUBMED","MOTIF_NAME","MOTIF_POS",
    "HIGH_INF_POS","MOTIF_SCORE_CHANGE","CADD_PHRED","CADD_RAW",
]
# Tab-separated header line built from the column names
ANNO_HEADER = "\t".join(ANNO_FIELDS)
# Here, we select all annotation of the population specific SNPs (Egyptian-only
# but also population-specific SNPs shared with other continental populations)
# Every VEP annotation is provided in a seperate line
rule annotated_pop_specific_vars:
    input: "vep_annotation/vep_egyptian_popspecific.vcf.gz"
    output: "vep_annotation/pop_specific_annotated.txt"
    run:
        with gzip.open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            f_out.write(ANNO_HEADER+"\n")
            for line in f_in:
                # Skip header
                if line.decode()[0] == "#":
                    continue
                s = line.decode().split("\t")
                vep_anno_string = s[7].split("=")[-1]
                vep_annos = vep_anno_string.split(",")
                for anno in vep_annos:
                    f_out.write(("\t").join(s[:5]+anno.split("|"))+"\n")


################################################################################
### This is an alternative way of selecting population-specific variants #######
################################################################################

rule symlink_var_step2_file:
    input: "/data/lied_egypt_genome/output_wgs/vep/vars.step2.vcf.gz"
    output: "vep_annotation/egyptians_step2.vcf.gz"
    shell: "ln -s {input} {output}"

# Select common variants in the Egyptian cohort for further consideration. 
# These should be 
# * limited number of missing genotypes (--max-missing) (10 individuals max)
# * have minor allel frequency of 5% (--maf)
# These are the variants for further characterization concerning population
# frequencies
rule extract_egyptian_common_vars_w_indels_and_missing_gt:
    input: "vep_annotation/egyptians_step2.vcf.gz"
    output: "vep_annotation/vep_egyptian_common_withindelsandmissinggt.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input} " + \
                    "--max-missing-count 20 " + \
                    "--maf 0.05 " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output}"

rule symlink_freq_file:
    input: "/data/lied_egypt_genome/output_wgs/highest_maf/vep_egyptian_common.freqs.final.txt.gz"
    output: "vep_annotation/vep_egyptian_common.freqs.final.txt.gz"
    shell: "ln -s {input} {output}"

#This counts the occurence of ref alt allele combinations of SNVs (not indels)
# There are more purine-to-purine and pyrimidin-to-pyrimidin exchanges
# Also there are more nucleotide exchanges involving G and C (presumably because 
# of increased GC content in the sequencing data)
rule number_allele_combinations:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_ref_alt_alleles.txt"
    shell: "cat {input} | cut -f 5,6 | sort | uniq -c > {output}"

rule number_flags:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_flags.txt"
    shell: "cat {input} | cut -f 7 | sort | uniq -c > {output}"

rule number_n_mappings:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_n_mappings.txt"
    shell: "cat {input} | cut -f 8 | sort | uniq -c > {output}"

rule number_n_alleles:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_n_alleles.txt"
    shell: "cat {input} | cut -f 9 | sort | uniq -c > {output}"

rule number_highest_ref_freq_pop:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_highest_ref_freq_pop.txt"
    shell: "cat {input} | cut -f 11 | sort | uniq -c | sort -n -r > {output}"

rule number_highest_alt_freq_pop:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_highest_alt_freq_pop.txt"
    shell: "cat {input} | cut -f 13 | sort | uniq -c | sort -n -r > {output}"

rule number_ref_alt_freq_pop_combinations:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/number_highest_freq_pop_combinations.txt"
    shell: "cat {input} | cut -f 11,13 | sort | uniq -c > {output}"

rule all_freq_anno_numbers:
    input: "vep_annotation/number_ref_alt_alleles.txt", \
           "vep_annotation/number_flags.txt", \
           "vep_annotation/number_n_mappings.txt", \
           "vep_annotation/number_n_alleles.txt", \
           "vep_annotation/number_highest_ref_freq_pop.txt", \
           "vep_annotation/number_highest_alt_freq_pop.txt", \
           "vep_annotation/number_highest_freq_pop_combinations.txt"

rule remove_indels:
    input: "vep_annotation/vep_egyptian_common.freqs.final.txt.gz"
    output: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    run:
        with gzip.open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for encoded_line in f_in:
                line = encoded_line.decode()
                s = line.split("\t")
                if not "-" in s[4] and not "-" in s[5]:
                    f_out.write(line)

# After contacting the Ensembl help desk, we use only those MAFs used and 
# documented by the website: 
# http://www.ensembl.org/info/genome/variation/species/populations.html
# Population codes: first tab-separated column of every line of the code
# table, except for the header line (which starts with "Name")
POP_CODES = []
with open("data/population_codes_vep.txt","r") as f_in:
    for line in f_in:
        if not line.startswith("Name"):
            POP_CODES.append(line.split("\t",1)[0])
rule add_selected_max_alt_freq:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.txt"
    output: "vep_annotation/vep_egyptian_common.freqs.noindels.max.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                if line[:4] == "orig":
                    header = line.strip("\n")+"\t"+"\t".join(POP_CODES)
                    header += "\t"+"selected_highest_alt_freq"
                    header += "\t"+"selected_highest_alt_freq_pop"+"\n"
                    f_out.write(header)
                    continue
                # There are additional tabs in two rows, we report and skip
                if not len(line.split("\t")) == 15:
                    print("Line wrongly formated: "+ line)
                    continue
                # Initialize a dict with the population codes to use as keys
                # Initialize with value 0.0, because if a code doesn't occur
                # it means the variant was not detected in the respective
                # population.
                pop_freqs = {k : "0" for k in POP_CODES}
                alt_freqs = line.strip("\n").split("\t")[-1]
                freqs = alt_freqs.split(";")[:-1]
                for freq in freqs:
                    alt_pop,alt_freq = freq.split("=")
                    if alt_pop in pop_freqs:
                        pop_freqs[alt_pop] = alt_freq
                new_line = line.strip("\n")
                max_freq = 0.0
                max_pop = ""
                for pop in POP_CODES:
                    # Remember highest alt AF
                    if float(pop_freqs[pop])> max_freq:
                        max_freq = float(pop_freqs[pop])
                        max_pop = pop
                    new_line += "\t"+pop_freqs[pop]
                # Add max alt AF
                new_line += "\t"+str(max_freq)+"\t"+max_pop+"\n"
                f_out.write(new_line)


# Here we obtain the population-specific variants. We annotated only common 
# variants within the Egyptian population. This means, that we only have 
# variants in the set that are polymorph in the Egyptians and thus both alleles
# occur with a minimum of 5% in our Egyptian sample. Still, because a variant
# may not be bi-allelic in another data set, we cannot assume that the frequency
# of the reference allele is 1-MAF of the alternative allele and we thus select 
# population-specific SNPs only by consulting the alternative frequencies.
rule filter_pop_specific_with_threshold:
    input: "vep_annotation/vep_egyptian_common.freqs.noindels.max.txt"
    output: "vep_annotation/popspecific_{threshold}.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                s = line.split("\t")
                # Write header
                if line[:4] == "orig":
                    f_out.write(line)
                # Skip those variants with flag indicating errors (i.e. not NA)
                # Skip variants that map to more than one position
                # Skip also X and Y chromosome calls
                if not s[6] == "NA" or not s[7] == "1" or s[2] in ["X","Y"]:
                    continue
                try:
                    max_alt_freq = float(s[65])
                except:
                    print(line)
                    continue
                if max_alt_freq<float(wildcards.threshold):
                    f_out.write(line)

rule filter_pop_specific_with_threshold_all:
    input: expand("vep_annotation/popspecific_{threshold}.txt", \
           threshold=["0.01","0.001","0.0001","0.00001","0.000001"]), \
           expand("vep_annotation/popspecific_{threshold}.vcf.gz", \
           threshold=["0.01","0.001","0.0001","0.00001","0.000001"]),

rule get_pos_pop_specific:
    input: "vep_annotation/popspecific_{threshold}.txt"
    output: "vep_annotation/popspecific_{threshold}.pos"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                if line[:4] == "orig":
                    continue
                s = line.split("\t")
                f_out.write("chr"+s[2]+"\t"+s[3]+"\n")

rule index_common_snps:
    input: "vep_annotation/vep_egyptian_common_withindelsandmissinggt.vcf.gz"
    output: "vep_annotation/vep_egyptian_common_withindelsandmissinggt.vcf.gz.tbi"
    shell: "tabix -p vcf {input}" 

rule get_pop_specific_vcf:
    input: "vep_annotation/vep_egyptian_common_withindelsandmissinggt.vcf.gz", \
           "vep_annotation/vep_egyptian_common_withindelsandmissinggt.vcf.gz.tbi", \
           "vep_annotation/popspecific_{threshold}.pos"
    output: "vep_annotation/popspecific_{threshold}.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "bcftools view --regions-file {input[2]} {input[0]} | " + \
           "bgzip > {output[0]}"

rule compute_af_for_pop_spec:
    input: "vep_annotation/popspecific_{threshold}.vcf.gz"
    output: "vep_annotation/popspecific_{threshold}.frq"
    params: log_base=lambda wildcards, output: output[0][:-4]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--freq " + \
                    "--out {params.log_base} "

rule final_table_pop_specific_vars:
    input: "vep_annotation/popspecific_0.01.txt", \
           "vep_annotation/popspecific_0.01.frq"
    output: "vep_annotation/popspecific_0.01_final.txt"
    run:
        # Read in AF info and record wether a position is multi-allelic in the
        # Egyptian data
        num_alleles = {}
        alt_afs = {}
        with open(input[1],"r") as f_in:
            for line in f_in:
                if line[:5] == "CHROM":
                    continue
                s = line.strip("\n").split("\t")
                # Remove the chr from the chromosome specification because 
                # Ensembl annotation doesn't use it
                chrom = s[0][3:]
                pos = s[1]
                if chrom+"\t"+pos in num_alleles:
                    num_alleles[chrom+"\t"+pos] += 1
                else:
                    num_alleles[chrom+"\t"+pos] = 1
                alt_allel,alt_freq = s[-1].strip("\n").split(":")
                alt_afs[chrom+"\t"+pos+"\t"+alt_allel] = alt_freq
        # Go over the large final table, extracted columns for final table and
        # add Egyptian AF
        all_lines = []
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                s = line.strip("\n").split("\t")
                # Write header
                if line[:4] == "orig":
                    header = s[2]+"\t"+s[3]+"\t"+s[1]+"\t"+s[4]+"\t"+s[5]
                    header += "\t"+s[8]+"\t"+"egp_n_alleles_incl_indels"
                    header += "\t"+"egp_alt_freq"+"\t"
                    header += s[-2]+"\t"+s[-1]
                    header += "\t"+s[11]+"\t"+s[12]+"\t"+s[14]+"\n"
                    f_out.write(header)
                    continue
                chrom = s[2]
                pos = s[3]
                alt_allel = s[5]
                # We counted the number of alt alleles before, but 
                # Ensembl returns the number of alleles all over, including
                # the reference allele. Thus by adding one we assume that we
                # observed the reference allele (which for this data set
                # can savely be assumed)
                n_alleles = str(int(num_alleles[chrom+"\t"+pos])+1)
                af = alt_afs[chrom+"\t"+pos+"\t"+alt_allel]
                out_line = [s[2],s[3],s[1],s[4],s[5],s[8],str(n_alleles),af,s[-2]]
                out_line += [s[-1],s[11],s[12],s[14]]
                all_lines.append(out_line)
            # Sort by chromosome (primary) and position (secondary)
            sorted_lines = sorted(all_lines,key=lambda x: (int(x[0]),int(x[1])))
            for entry in sorted_lines:
                f_out.write("\t".join(entry)+"\n")


rule get_pos_pop_specific_final:
    input: "vep_annotation/popspecific_0.01_final.txt"
    output: "vep_annotation/popspecific_0.01_final.pos"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                if line[:3] == "chr":
                    continue
                s = line.split("\t")
                f_out.write("chr"+s[0]+"\t"+s[1]+"\n")
                

rule get_pop_specific_annotated_vcf:
    input: "vep_annotation/vep.vcf.gz", \
           "vep_annotation/vep.vcf.gz.tbi", \
           "vep_annotation/popspecific_0.01_final.pos"
    output: "vep_annotation/popspecific_0.01_annotated.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "bcftools view --regions-file {input[2]} {input[0]} | " + \
           "bgzip > {output[0]}"

# Here, we select all annotations of the population-specific SNPs.
# Every VEP annotation is provided in a separate line.
# Because the previous bcftools command extracts all variants overlapping the 
# specified positions, we still have to filter the annotation list afterwards.
rule annotated_pop_specific_vars_threshold_positionwise:
    input: "vep_annotation/popspecific_0.01_annotated.vcf.gz"
    output: "vep_annotation/popspecific_0.01_posannotated.txt"
    run:
        with gzip.open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            f_out.write(ANNO_HEADER+"\n")
            for line in f_in:
                # Skip header
                if line.decode()[0] == "#":
                    continue
                s = line.decode().split("\t")
                vep_anno_string = s[7].split("=")[-1]
                vep_annos = vep_anno_string.split(",")
                for anno in vep_annos:
                    f_out.write(("\t").join(s[:5]+anno.split("|"))+"\n")

rule annotated_pop_specific_vars_threshold:
    input: "vep_annotation/popspecific_0.01_final.txt", \
           "vep_annotation/popspecific_0.01_posannotated.txt"
    output: "vep_annotation/popspecific_0.01_annotated.txt"
    run:
        # Read in pop-specific positions and alleles
        snps = {}
        with open(input[0],"r") as f_in:
            for line in f_in:
                # Skip header
                if line[:3] == "chr":
                    continue
                s = line.split("\t")
                chrom = "chr"+s[0]
                pos = s[1]
                alt = s[4]
                snps["\t".join([chrom,pos,alt])] = True
        print(snps)
        with open(input[1],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                # Write header
                if line[:5] == "CHROM":
                    f_out.write(line)
                    continue
                s = line.split("\t")
                chrom = s[0]
                pos = s[1]
                alt = s[4]
                if "\t".join([chrom,pos,alt]) in snps:
                    f_out.write(line)


################################################################################
########## Extracting population-specific variants without rsids  ##############
################################################################################

# We use variants without rsid assigned by VEP which
# * are no indels (also too complicated) (--remove-indels), i.e. REF and ALT
#   are both single bases
# * have max 10 individuals with missing genotypes, i.e. allele number
#   (AN) >= 200
# * have an alternative allele frequency (AC/AN) of at least 5%
# * are autosomal (not X or Y)
rule select_variants_wo_dbsnp_id:
    input: "vep_annotation/vep.vcf.gz"
    output: "vep_annotation/vep_egyptian_common_wo_rsid.vcf"
    run:
        with gzip.open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for encoded_line in f_in:
                line = encoded_line.decode()
                if line[0] == "#":
                    f_out.write(line)
                    continue
                vep_rsid = line.split("|")[17][:2]
                s = line.split("\t")
                chrom = s[0]
                ref = s[3]
                alt = s[4]
                info = s[7].split(";")
                ac_string,ac = info[0].split("=")
                assert(ac_string == "AC")
                an_string,an = info[2].split("=")
                assert(an_string == "AN")
                alt = s[4]
                # only variants wo rsid and in at least 100 individuals (i.e. 
                # in 200 chromosomes)
                if not vep_rsid[:2] == "rs" and int(an)>=200 and not chrom in ["chrX","chrY"]:
                    # only if not an indel
                    if ref in ["A","C","G","T"] and alt in ["A","C","G","T"]:
                        af = int(ac)/int(an)
                        if af>=0.05:
                            f_out.write(line)

# I used this file to annotate the rsids again with Ensembl VEP, using the 
# latest VEP version; as a result, only 49 SNPs without rsid are left.
# Here, we use the manually constructed file of these SNPs to extract the
# corresponding information from the VCF file; this is useful for adding
# allele frequencies to the suppl. table and it is also needed for submission to 
# dbSNP
rule select_49_snps_wo_rsid:
    input: "vep_annotation/vep.vcf.gz",
           "data/positions_of_49_egyptian_snps_wo_rsid.txt"
    output: "vep_annotation/vep_egyptian_common_wo_rsid_for_dbsnp.vcf.gz"
    params: log_base=lambda wildcards, output: output[0][:-7]
    conda: "envs/genotype_pcs.yaml"
    shell: "vcftools --gzvcf {input[0]} " + \
                    "--positions {input[1]} " + \
                    "--recode-INFO-all " + \
                    "--recode " + \
                    "--out {params.log_base} " + \
                    "--stdout | bgzip > {output[0]}" 


################################################################################
####### Performing preprocessing and statistics for SVs called by Axel #########
################################################################################

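# Making the SV file with delly SV calls performed by Axel available in the
# working directory (despite the rule name, a gzip-compressed copy is written
# instead of a symlink)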
rule sv_symlink:
    input: "/data/lied_egypt_genome/axel/SVs/germline.vcf"
    output: "sv_stats/egyptians.vcf.gz"
    shell: "cat {input} | gzip > {output}"

rule sv_stats:
    input: expand("sv_stats/egyptians.{stat}", stat=["imiss","lmiss", \
                                            "het","relatedness","relatedness2", \
                                            "ldepth","ldepth.mean","insnps"])

rule sv_stats_boxplots:
    input: "sv_stats/egyptians.insnps", "sv_stats/egyptians.imiss",
           "sv_stats/egyptians.het", "sv_stats/variants_per_chrom.txt"
    output: "sv_stats/figures/variant_stats_boxplots.pdf",
            "sv_stats/figures/variant_stats_missing_vs_het.pdf",
            "sv_stats/figures/variant_stats_corplot.pdf",
            "sv_stats/figures/variant_stats_var_per_chrom.pdf"
    script: "scripts/sv_stats_boxplots.R"

rule split_by_sv_type_dels:
    input: "sv_stats/egyptians.vcf.gz"
    output: "sv_stats/egyptians_deletions.vcf"
    shell: "zcat {input} | grep '#' > {output}; " + \
           "zcat {input} | grep '<DEL>' >> {output} " 

rule split_by_sv_type_insertions:
    input: "sv_stats/egyptians.vcf.gz"
    output: "sv_stats/egyptians_insertions.vcf"
    shell: "zcat {input} | grep '#' > {output}; " + \
           "zcat {input} | grep '<INS>' >> {output} " 

rule split_by_sv_type_duplications:
    input: "sv_stats/egyptians.vcf.gz"
    output: "sv_stats/egyptians_duplications.vcf"
    shell: "zcat {input} | grep '#' > {output}; " + \
           "zcat {input} | grep '<DUP' >> {output} " 

rule split_by_sv_type_inversions:
    input: "sv_stats/egyptians.vcf.gz"
    output: "sv_stats/egyptians_inversions.vcf"
    shell: "zcat {input} | grep '#' > {output}; " + \
           "zcat {input} | grep '<INV' >> {output} " 

rule split_by_sv_type_translocations:
    input: "sv_stats/egyptians.vcf.gz"
    output: "sv_stats/egyptians_translocations.vcf"
    shell: "zcat {input} | grep '#' > {output}; " + \
           "zcat {input} | grep -v '<' >> {output} " 

rule split_by_sv_type:
    input: expand("sv_stats/egyptians_{sv_type}.vcf", \
                   sv_type=["deletions","insertions","duplications", \
                            "inversions","translocations"])

rule get_ins_len:
    input: "sv_stats/egyptians_insertions.vcf"
    output: "sv_stats/len_insertions_all.txt"
    shell: "cat {input} | grep -v '#' " + \
                       "| cut -d ';' -f 12 " + \
                       "| cut -d '=' -f 2 > {output}"

rule get_ins_len_filtered:
    input: "sv_stats/egyptians_insertions.vcf"
    output: "sv_stats/len_insertions_pass.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                if line[0] == "#":
                    continue
                # If this is a file considering only "PASS" calls, don't consider
                # "LowQual" calls (this is the only filter flag except "PASS"
                if line.split("\t")[6] == "LowQual":
                    continue
                f_out.write(line.split(";")[11][7:]+"\n")

rule get_del_inv_dup_len:
    input: "sv_stats/egyptians_{del_inv_dup}.vcf"
    output: "sv_stats/len_{del_inv_dup}_{filter}.txt"
    wildcard_constraints: del_inv_dup="deletions|inversions|duplications",
                          filter="pass|all"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            for line in f_in:
                if line[0] == "#":
                    continue
                s = line.split("\t")
                # If this is a file considering only "PASS" calls, don't consider
                # "LowQual" calls (this is the only filter flag except "PASS"
                if wildcards.filter == "pass":
                    if s[6] == "LowQual":
                        continue
                pos = int(s[1])
                end = int(s[7].split(";")[4][4:])
                f_out.write(str(end-pos)+"\n")

rule num_all_translocations:
    input: "sv_stats/egyptians_translocations.vcf"
    output: "sv_stats/num_translocations_all.txt"
    shell: "cat {input} | grep -v '#' | wc -l > {output}"

rule num_pass_translocations:
    input: "sv_stats/egyptians_translocations.vcf"
    output: "sv_stats/num_translocations_pass.txt"
    run:
        with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
            num_trans = 0
            for line in f_in:
                if line[0] == "#":
                    continue
                # If this is a file considering only "PASS" calls, don't consider
                # "LowQual" calls (this is the only filter flag except "PASS"
                if line.split("\t")[6] == "LowQual":
                    continue
                else:
                    num_trans += 1                
            f_out.write(str(num_trans)+"\n")

# Get the length of the different types of SVs, note that translocations have 
# no length because they are at the end of the chromosome (or, more precisely,
# we'd need to consider chromosome length to compute a length)
rule get_sv_lens:
    input: expand("sv_stats/len_{del_inv_dup}_{filter}.txt", \
                   del_inv_dup=["insertions","deletions","inversions","duplications"], \
                   filter=["all","pass"])

rule plot_sv_len:
    input: expand("sv_stats/len_{del_inv_dup}_{filter}.txt", \
                   del_inv_dup=["insertions","deletions","inversions","duplications"], \
                   filter=["all","pass"]),
            "sv_stats/num_translocations_all.txt",
            "sv_stats/num_translocations_pass.txt"
    output: "sv_stats/figures/len_svs_all.pdf",
            "sv_stats/figures/len_svs_pass.pdf",
            "sv_stats/figures/sv_hist_all.pdf",
            "sv_stats/figures/sv_hist_pass.pdf"
    script: "scripts/sv_lens.R"