-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrossSpecies.py
495 lines (408 loc) · 16.5 KB
/
crossSpecies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#!/usr/bin/python
# Name: Eric Leung
# Assignment: Research Project
# Date: December 8th, 2014
# Python Ver: 2.7.8
# script: crossSpecies.py
# Description: Examine cross-species conservation of chosen pathway genes
# Example: "python crossSpecies.py PATHWAY_NAME"
#################
### FUNCTIONS ###
#################
def parse_kegg_html(pathway, species):
"""
INPUT: pathway name, species of interest
OUTPUT: HTML output from KEGG API
"""
# search for pathway ID
opener = urllib.FancyURLopener({})
searchSite = "http://rest.kegg.jp/find/pathway/" # original site
straight = opener.open(searchSite+pathway) # put together query
rawResults = straight.read().rstrip().split("\t") # get results
pathID = rawResults[0].split(":")[1] # get pathway ID
# convert ID to human's
ID = re.findall(r"[0-9]+", pathID) # extract ID number
ID = species + ID[0] # convert path ID to human
# get pathway genes
pathwaySite = "http://rest.kegg.jp/get/" # searching site
pathwayInfo = opener.open(pathwaySite + ID)
rawResults = pathwayInfo.read() # get results
listOfResults = rawResults.split("\n") # put each line into list
return listOfResults
def get_genes(listOfResults):
"""
INPUT: HTML output from KEGG API
OUTPUT: gene ID, gene name, gene description in a list
NOTE: need to run get_pathway function prior
"""
genes = [] # empty liist to put all genes in
# parse through HTML page list
for line in listOfResults:
temp = [] # list to put results
words = line.strip().split() # remove white space
if len(words) == 0: # if there is an empty line
continue # skip the empty line
elif words[0] == "GENE": # for the case with GENE first
entry = line.strip().split(";") # split into two parts
# get ID numbers
first = entry[0] # take first entry
idName = first.split() # split string by whitespace
temp.extend(idName[1:]) # add ID and gene name to temp
# get gene full name
second = entry[1] # take second entry
name = second.split("[") # split by "[" char
temp.append(name[0].strip())
# put Gene ID, Name, Full name into final list
genes.append(temp)
elif re.match(r"\d+", words[0]): # ID number in front of gene
entry = line.strip().split(";") # split into two parts
# get ID number and name
first = entry[0] # take first entry
idName = first.split() # split string by whitespace
temp.extend(idName) # add ID and gene name to temp
# get gene full name
second = entry[1] # take second entry
name = second.split("[") # split by "[" char
temp.append(name[0].strip())
# put Gene ID, Name, Full name into final list
genes.append(temp)
# return list of genes
return genes
def get_accession(geneSearch):
"""
INPUT: gene ID number
OUTPUT: gene accession number
"""
# create search
handle = Entrez.efetch(db="gene", id=geneSearch, retmode="xml")
fasta_record = handle.read() # XML file of gene
handle.close() # close connection to database
# parse through XML file
root = et.fromstring(fasta_record)
locus = root[0].find("Entrezgene_locus")
product = locus[0].find("Gene-commentary_products")
temp = [] # line of accession numbers
# get all mRNA accession numbers for the gene
for access in product:
name = access.find("Gene-commentary_accession").text
temp.append(name)
# return RefSeq mRNA if it exists
for num in temp:
if "NM" in num:
print "Got accession " + num + "\n"
return num
# return predicted mRNA if it exists
for num in temp:
if "XM" in num:
print "Got accession " + num + "\n"
return num
# return something
print "Got accession " + num + "\n"
return temp[0]
def keep_genes_common_with_humans(genes):
"""
INPUT: genes list from each species
OUTPUT: genes list with genes only in common with human
"""
print "Filtering out genes in non-Human",
print "species that do not exist in Humans...\n"
# get set of common genes among all species
otherSpecies = ["Mouse", "Chimp"]
humanSet = set([]) # empty human set
for gene in genes["Human"]: # go through human genes
humanSet.add(gene[1].upper()) # add gene to set
# get genes that in common with human pathway
for org in otherSpecies:
temp = set([]) # create empty set
# NOTE: write commands to keep genes removed
# get genes in species that are common with humans
for gene in genes[org]: # loop through genes
temp.add(gene[1].upper()) # add gene to a set
genesInHuman = humanSet.intersection(temp) # find genes in human
# keep genes that are common with humans
for gene in genes[org]:
if gene[1].upper() not in genesInHuman:
print "Removing " + gene[1] + " from " + org
genes[org].remove(gene) # remove gene from list
return genes
def save_sequences(allAccession):
"""
INPUT: all accession IDs for each species' genes
OUTPUT: sequence files for each gene with sequence for each species a
gene exists
The sequences will be put into the sequenceAnalysis directory that was
created.
"""
for gene in allAccession.keys(): # loop through genes
print "We're going to find sequences for " + gene
temp = [] # list to put accession numbers in
for org in allAccession[gene].keys(): # loop through species
temp.append(allAccession[gene][org]) # add accession to temp list
print "Got sequence in " + org
# only keep genes that have sequences for all three species
if len(temp) != 3:
print "There seems to be <3 sequences for analysis."
print "Thus we will skip using the " + gene + " gene\n"
continue
handle = Entrez.efetch(db="nuccore", id=','.join(temp),
rettype="fasta",retmode="text")
print "Sequences for " + gene + " successfully obtained!"
fasta_records = handle.read()
handle.close()
path = "./sequenceAnalysis/" + gene # name of directory
if not os.path.isdir(path): # check if directory exists
os.makedirs(path) # make folder for gene
directory = path + "/"
fileName = gene + ".fasta" # sequences
directoryFile = directory + fileName # combine to get complete
fh = open(directoryFile, "w") # open file to put sequences
fh.write(fasta_records) # write sequences to file
print "Sequences written into ./sequenceAnalysis/" + gene + "\n"
fh.close() # close file
def calculate_hamming(de_list):
# dictionary encompassing all genes
allGenes = {"de" : de_list[0], "nonDe" : de_list[1]}
hammingDist = {} # empty dictionary for hamming distances
count = 0 # count number of genes couldn't find Hamming distance
for geneType in allGenes.keys(): # loop through DE and non-DE
temp = [] # empty list to put Hamming distances in for gene type
for gene in allGenes[geneType]: # loop through genes in gene type
print "Calculating Hamming distance for " + gene
path = "./sequenceAnalysis/" + gene
print "Looking for alignment file in " + path
if os.path.isdir(path): # check if alignment exists
print gene + " alignment found!"
fullPath = path + "/" + gene + ".aln" # alignment file
alignment = AlignIO.read(fullPath, "clustal") # get alignment
seqLen = len(alignment[0]) # length of sequence alignment
totalDist= 0 # total Hamming distance
numSeq = len(alignment) # number of seqs to normalize over
for i in range(seqLen): # loop over len of seq alignment
nucSet = set(alignment[:,i]) # set of all elements in column
totalDist += len(nucSet) - 1 # Hamming dist for 1 position
hammingScore = (totalDist / float(numSeq)) / float(seqLen)
temp.append(hammingScore)
print "Successfully found normalized Hamming distance for",
print gene, "\n"
else:
print gene + " not found in Human pathway"
print "We will have to skip this gene in",
print "the statistical analysis\n"
count += 1 # add one to the count
hammingDist[geneType] = temp # add entire list as element to gene type
print "We couldn't find " + str(count),
print "genes from the original path file\n"
return hammingDist
###################
### GET PATHWAY ###
###################
"""
Take in as target pathway
OUTPUT: pathway, de_list
This output has the name of the pathway that will be used in the analysis
plus de_list with DE and non-DE genes
"""
try:
import sys
except ImportError:
pass
with open(sys.argv[1], "r") as fh:
pathway = fh.readline().rstrip()
de_list = [] # list of lists of DE and non-DE genes
temp = fh.readline().rstrip().split(",")
de_list.append(temp)
temp = fh.readline().rstrip().split(",")
de_list.append(temp)
print "The " + pathway + " pathway will be used for this analysis.\n"
#############################
### EXTRACT PATHWAY GENES ###
#############################
"""
Extract gene list for each species
The species that will be analyzed are:
- Homo sapiens (Humans)
- Mus musculus (Mouse)
- Pan troglodytes (Chimpanzee)
OUTPUT: genes
This output is a dictionary of lists where the key is the organism/species
(e.g. "Human") and the values are lists of lists of genes formatted like so:
[ID_NUMBER, GENE_NAME, GENE_DESCRIPTION]
The last part of this section will only keep genes from others species (Chimps
and Mice) if they are in the set of genes in Humans.
"""
try:
import re
except ImportError:
pass
try:
import urllib
except ImportError:
pass
# species legend
species = {"Human":"hsa",
"Mouse":"mmu",
"Chimp":"ptr"}
# dictionary for each species
genes = {}
# loop through species
for org in species.keys():
orgList = parse_kegg_html(pathway, species[org])
print "KEGG website was extracted for a " + org
genes[org] = get_genes(orgList)
print "KEGG website was parsed for " + org + " genes.\n"
# keep genes in other species that are common with humans
genes = keep_genes_common_with_humans(genes)
geneNum = {} # total number of genes
print "\nThe remaining number of genes from each species is:"
for org in genes.keys():
print org + " has " + str(len(genes[org])) + " number of genes"
geneNum[org] = len(genes[org]) # add to number of total genes
print "\nFinished filtering out only Human genes.\n"
#############################
### OBTAIN mRNA ACCESSION ###
#############################
"""
Get mRNA accession numbers for each gene
OUTPUT: allAccession
The output is a dictionary of dictionaries where the key is a gene and the
values are a dictionary with their key as the organism (e.g. "Human") and
the value is the accession number for the gene from that organism
KEY: organism
VALUE: accession
"""
print "Beginning mRNA accession requests...\n"
try:
from Bio import Entrez
except ImportError:
pass
try:
import xml.etree.ElementTree as et
except ImportError:
pass
# provide email address
email = "leunge@ohsu.edu"
Entrez.email = email # add email to object
# dictionary of dictionaries for accession IDs for each species
allAccession = {}
# loop through all species to create dictionary
# KEYS:gene, VALUE:dictionary with accession for diff species
for org in genes.keys():
numAccess = geneNum[org] # total num of genes to get
j = 1 # number of genes for a particular species
ticks = 1 # number of tick marks to make
# loop through genes
for gene in genes[org]:
# progress bar
percentTicks = ticks / float(numAccess) # percent we're at
if j/float(numAccess) > ticks/30.: # if place > percent history
ticks += 1 # add one to number of ticks shown
print "[ {0:30} ]".format("-" * ticks)
print str(j)+" out of "+str(numAccess)+" genes in "+org
target = gene[0] # focus on ID
geneName = gene[1].upper() # focus on uppercase name
print "Searching for " + geneName + " accession in a " + org + "..."
if geneName not in allAccession.keys(): # if first instance
allAccession[geneName] = {} # make dictionary
allAccession[geneName][org] = get_accession(target)
else: # if this isn't first instance
allAccession[geneName][org] = get_accession(target)
j += 1 # increment for species genes
print "All accession IDs for all sequences in mind have been fetched."
##########################
### CREATE DIRECTORIES ###
##########################
"""
Create directories for analysis
OUTPUT: None
The output here is to create the folder for sequence analysis
"""
try:
import os
except ImportError:
pass
# create folder to put sequences in if it doesn't exist
if not os.path.exists("sequenceAnalysis"):
os.makedirs("sequenceAnalysis")
print "sequenceAnalysis directory created to save sequences and analyses\n"
#############################
### OBTAIN mRNA SEQUENCES ###
#############################
"""
Loop through accession numbers to get sequences
"""
save_sequences(allAccession)
print "Successfully saved all sequences that we want for analysis.\n"
#########################
### CLUSTAL ALIGNMENT ###
#########################
try:
from Bio.Align.Applications import ClustalwCommandline
except ImportError:
pass
### UNCOMMENT NECESSARY COMMANDS BELOW TO USE CLUSTAL ###
# change path for clustalw as necessary
# clustalw_exe = r"C:\Program Files (x86)\ClustalW2\clustalw2.exe" # windows
# clustalw_exe = r"/Applications/clustalw-2.1-macosx/clustalw2" # macintosh
# print "ClustalW is assumed to be located at the following: "
# print clustalw_exe,"\n"
directories = os.listdir("./sequenceAnalysis/") # genes in analysis
totalAlign = len(directories) # number of alignments to perform
i = 1 # alignment at currently
ticks = 1 # number of tick marks to make
for gene in directories:
# progress bar
if i/float(totalAlign) > ticks/30.: # if place > percent history
ticks += 1 # add one to number of ticks shown
print "[ {0:30} ]".format("-" * ticks)
print "Starting "+str(i)+" out of "+str(totalAlign)+" alignments"
filePlace = "./sequenceAnalysis/"+gene+"/"+gene+".fasta"
# clustalw_cline = ClustalwCommandline(clustalw_exe, infile=filePlace)
### ASSUMES CLUSTALW2 IS IN YOUR PATH ###
# change to clustalw if that is what you have
# otherwise, uncomment the commands above to manually use ClustalW
clustalw_cline = ClustalwCommandline("clustalw2", infile=filePlace)
clustalw_cline() # clustal alignment
print "Successfully aligned the " + gene + " genes!\n"
i += 1 # increment index
#####################
### DE and non-DE ###
#####################
"""
OUTPUT: list with two lists, one with DE, another with non-DE Hamming dist.
"""
try:
from Bio import AlignIO
except ImportError:
pass
hammingDist = calculate_hamming(de_list) # calculate normed Hamming distances
######################
### STATS ANALYSIS ###
######################
try:
from scipy import stats
except ImportError:
pass
print "Performing a Mann-Whitney U test..."
u, p = stats.mannwhitneyu(hammingDist["de"], hammingDist["nonDe"])
print "Found a U = " + str(u)
print "Found a P-value = " + str(p) + "\n"
###############
### BOXPLOT ###
###############
try:
import matplotlib.pyplot as plt
except ImportError:
pass
fig, ax = plt.subplots() # make parameters to mess around with
labels = ["DE", "non-DE"]
box = plt.boxplot([hammingDist["de"], hammingDist["nonDe"]])
xtickNames = plt.setp(ax, xticklabels=labels) # create labels for boxplots
plt.setp(xtickNames, fontsize=10)
ax.set_xlabel("Expression Type")
ax.set_ylabel("Normalized Hamming Distance/Score")
plt.title("Normalized Hamming Scores for DE and non-DE Genes")
plt.figtext(0.2,0.8,"p-value = 0.2994",color="black",
weight="roman",size="medium")
plt.savefig("boxplot.png") # save boxplot to show in analysis
print "Saved boxplot of normalized DE and non-DE genes of pathway",
print "as boxplot.png in the current directory."