-
Notifications
You must be signed in to change notification settings - Fork 0
/
ProcessGWASForGREGOR.py
109 lines (81 loc) · 3.35 KB
/
ProcessGWASForGREGOR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import numpy as np
import pandas as pd
import sys
from pathlib import Path
import re
import argparse
parser = argparse.ArgumentParser(description='Program for compiling GWAS catalog associations for GREGOR.')
parser.add_argument("associationfile")
parser.add_argument("outputdirectory")
#Optional arguments
parser.add_argument('-m', dest='excludemultiple', action='store_const', default=False, const=True,
help='Exclude multiple SNP associations (rsid1; rsid2; etc.).')
parser.add_argument('-x', dest='excludeinteractions', action='store_const', default=False, const=True,
help='Exclude snp interactions (rsid1 x rsid2).')
args = parser.parse_args()
infile = args.associationfile
outfile = args.outputdirectory
includemult = not args.excludemultiple
includeinter = not args.excludeinteractions
#Check inputs
if not Path(infile).is_file:
print("Gwas trait file does not exit.")
sys.exit(0)
if Path(outfile).is_dir():
print("Output directory exists.")
sys.exit(0)
Path(outfile).mkdir(exist_ok=False)
Path(outfile+"/traits/").mkdir()
#Read gwasdata and initialize dictionary for assigning SNPs to traits
gwasdata = pd.read_csv(infile, sep="\t")
traits = list(np.unique(gwasdata["DISEASE/TRAIT"]))
traitdictinit = []
for curtrait in traits:
traitdictinit.append((curtrait, []))
traitdict = dict(traitdictinit)
#Parse the GWAS catalog file based on optional parameters
singlesnpc = 0
multisnpc = 0
intersnpc = 0
notreadc = 0
for index, currow in gwasdata.iterrows():
trait = currow['DISEASE/TRAIT']
snps = currow['SNPS'].strip()
if re.match('rs[0-9]+$', snps):
traitdict[trait].append(snps)
singlesnpc += 1
elif ";" in snps:
if includemult:
multisnps = snps.split(";")
for curmsnp in multisnps:
curmsnp_stripped = curmsnp.strip()
if re.match('rs[0-9]+$', curmsnp_stripped):
traitdict[trait].append(curmsnp_stripped)
multisnpc += 1
elif "x" in snps:
if includeinter:
intersnps = snps.split("x")
for curisnp in intersnps:
curisnp_stripped = curisnp.strip()
if re.match('rs[0-9]+$', curisnp_stripped):
traitdict[trait].append(curisnp_stripped)
intersnpc += 1
else:
notreadc += 1
#Write stats
stattable = [["Single SNP association:", singlesnpc],
["Multi SNP assocoation ("+("included" if includemult else "excluded")+"):", multisnpc],
["SNP interaction assocation ("+("included" if includeinter else "excluded")+"):", intersnpc],
["Not read (no rsId, excluded):", notreadc]]
pd.DataFrame(np.array(stattable)).to_csv(outfile+"/stats.txt", sep="\t", index=None, header=None)
#Map new ids for traits that can be used for directories, but only traits with at least 1 rsid
traitmapping = []
index = 1
for curkey in traitdict.keys():
cursnplist = traitdict[curkey]
if len(cursnplist) > 0:
traitmapid = "Trait_"+str(index)
traitmapping.append([traitmapid, curkey])
pd.DataFrame(np.unique(cursnplist)).to_csv(outfile+"/traits/"+traitmapid+".txt", sep="\t", index=None, header=None)
index += 1
pd.DataFrame(np.array(traitmapping), columns=["TraitId", "Trait Label"]).to_csv(outfile+"/traitmapping.txt", sep="\t", index=None)