-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathodds.py
158 lines (129 loc) · 5.11 KB
/
odds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python
# Name: Eric Leung
# Assignment: Final Project
# Date: December 8th, 2014
# Python ver: 2.7.8
# Script: odds.py
# Description:
# Use Python to perform odds ratio calculation to identify pathways
# containing a larger number of differentially expressed genes than would
# be expected by chance
# Example: "python odds.py"
##################
### FILE CHECK ###
##################
# make sure the right files are in the current directory
import os.path, sys # import necessary packages
files = ["H5N1_VN1203_DE_Probes.txt",
"H5N1_VN1203_UNIVERSE_Probes.txt",
"KEGG_Pathway_Genes.txt"]
print "Checking for necessary files for analysis..."
for n in files: # loop through list of files that should be directory
temp = os.path.isfile(n) # check if file is in directory
if temp != True: # if you get a false output from line above
sys.exit("One of the following files for this script are not" + \
" in your current directory:" + "\n" + \
str(files)) # list of files
print "Files necessary for analysis are in the current directory.\n"
###################
### IMPORT DATA ###
###################
# focus on KEGG Pathway Genes
# SAMPLE FORMAT
# ID PATHWAY TITLE MEMBERS
# 04142 Lysosome AP3S2 CTSE GALNS ...
# 04916 Melanogenesis WNT10A ADCY7 CREB3L3 ...
print "Loading KEGG_Pathway_Genes.txt file..."
with open("KEGG_Pathway_Genes.txt", "r") as fh:
fh.readline().rstrip() # remove header
kegg = [] # empty list to put KEGG Pathway Genes file in
for line in fh.readlines(): # loop through all lines of file
words = line.rstrip().split("\t") # split line by tab
kegg.append(words) # add to master KEGG pathway list
print "Successfully loaded KEGG_Pathway_Genes.txt\n"
# focus on H5N1 VN1203 UNIVERSE Probes
# SAMPLE FORMAT
# A_23_P100011 AP3S2
# A_23_P100022 SV2B
# A_23_P100111 CHP
print "Loading Universe probes..."
with open("H5N1_VN1203_UNIVERSE_Probes.txt", "r") as fh:
fh.readline().rstrip() # remove header
universe = {} # empty dictionary for universe probes
for line in fh.readlines():
words = line.rstrip().split("\t") # split line by tab
universe[words[0]] = words[1] # save in dictionary
universeGenes = universe.values() # get all genes in universe
universeSet = set(universeGenes)
print "Successfully loaded Universe probes\n"
# focus on H5N1 VN1203 DE Probes
# SAMPLE FORMAT
# A_23_P100539 ABCC6
# A_23_P100642 PNMT
# A_23_P100704 MAPK7
print "Loading differentially expressed file..."
with open("H5N1_VN1203_DE_Probes.txt", "r") as fh:
fh.readline().rstrip() # remove header
de = {} # empty dictionary for universe probes
for line in fh.readlines():
words = line.rstrip().split("\t") # split line by tab
de[words[0]] = words[1] # save in dictionary
deGenes = de.values() # get all DE genes
deSet = set(deGenes)
print "Successfully loaded differentially expressed data\n"
#################
### ODDS RATIO ###
#################
print "Calculating odds ratio for pathways...\n"
# variables/data types
sigPathways = [] # list of pathways with signif num DE genes
# matrix for odds ratio
# | DE Genes | Non-DE
# target pathway | a | b
# non-pathway | c | d
de_pathways = {} # dictionary to store DE and non-DE genes for pathway
# loop through pathways to get genes for target pathway
for pathway in kegg: # loop through all pathways
name = pathway[0:2] # get pathway ID and pathway name
print "Calculating odds ratio for " + name[1] # print pathway name
genes = pathway[2:] # get probes in certain pathway
print name[1] + " has " + str(len(genes)) + " number of genes."
genes = set(genes) # make a set of genes in pathway
# set values based on matrix shown above
keggDE = universeSet.intersection(deSet) # make sure DE genes in KEGG
nonDE = universeSet.difference(keggDE) # get non-DE genes
# odds ratio pieces as defined above
aSet = genes.intersection(keggDE)
a = float(len(aSet))
bSet = genes.intersection(nonDE)
b = float(len(bSet))
cSet = len(keggDE.difference(genes))
c = float(cSet)
dSet = len(nonDE.difference(genes))
d = float(dSet)
# add genes to pathway dictionary for DE and non-DE genes
allList = [aSet, bSet] # [DE genes, non-DE genes]
de_pathways[name[1]] = allList # put into dictionary
# find pathways with odds ratio greater than 1.5
OR = (a*d)/(b*c)
if OR > 1.5:
name.append(OR)
sigPathways.append(name)
print ""
####################
### SIG PATHWAYS ###
####################
print "Here are pathways with an OR >1.5:"
for path in sigPathways:
print path[1]
print "Odds ratio = " + str(path[2]) + "\n"
######################
### CHOOSE PATHWAY ###
######################
# write to a file
fh = open("pathway_info.csv", "w")
myPathway = sigPathways[1][1] # select pathway to study
fh.write(myPathway + "\n") # write pathway name first
fh.write(",".join(de_pathways[myPathway][0]) + "\n") # DE genes
fh.write(",".join(de_pathways[myPathway][1]) + "\n") # non-DE genes
fh.close() # close file