forked from logust79/BioTools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenes.py
338 lines (312 loc) · 12.4 KB
/
Genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import logging
import sqlite3
from CommonFuncs import *
from sqlite_utils import *
import copy
import json
import re
def _initiate_db(db_conn):
    """Create the ``genes`` table on *db_conn* when it does not exist yet.

    The statement is ``CREATE TABLE IF NOT EXISTS``, so running it against a
    connection that already holds the table leaves the table untouched.  The
    connection is committed before returning.
    """
    create_genes_table = (
        'CREATE TABLE IF NOT EXISTS genes\n'
        '(id text NOT NULL UNIQUE, entrez_id text, pLI real, pRec real, mis_z real, genomic_pos_hg19 text, genomic_pos text, symbol text, alias text, PRIMARY KEY (id, entrez_id))'
    )
    db_conn.cursor().execute(create_genes_table)
    db_conn.commit()
def _update_db(self, mgs):
    """Store mygene query results in the local ``genes`` table.

    A wrapper of ``sqlite_utils.update_db`` for genes.

    Args:
        self: object providing ``db_conn`` (handed to ``update_db``) and a
            ``_bad_genes`` list; queries that cannot be resolved are
            appended to that list.
        mgs: iterable of mygene result dicts.  The keys read here are
            ``notfound``, ``query``, ``_id``, ``ensembl``,
            ``genomic_pos_hg19``, ``genomic_pos``, ``exac``, ``symbol`` and
            ``alias``.

    Records flagged ``notfound``, records without ``_id`` and records for
    which no Ensembl gene id can be determined are skipped.  For every other
    record one row, keyed by the Ensembl gene id, is passed to ``update_db``.
    Missing ExAC scores are stored as ``-1``; positions and aliases are
    stored as JSON text.
    """
    # Column order of the row lists built below (the row key is the id).
    fields = ['entrez_id', 'pLI', 'pRec', 'mis_z', 'genomic_pos_hg19',
              'genomic_pos', 'symbol', 'alias']
    # some genes miss ensembl ids, fill them manually for the time being
    manual_ensembl_ids = {
        '7012': 'ENSG00000270141',
        '6315': 'ENSG00000230223',
        '2657': 'ENSG00000130283',
        '84876': 'ENSG00000276045',
        '9103': 'ENSG00000244682',
    }

    # Split the answers into usable records and queries that were not found.
    good_result = []
    bad_genes = []
    for record in mgs:
        if record.get('notfound', None):
            bad_genes.append(record['query'])
        else:
            good_result.append(record)
    if bad_genes:
        self._bad_genes.extend(bad_genes)
        print('-----some queries are not found----')
        print(json.dumps(bad_genes))

    data = {}
    for record in good_result:
        if '_id' not in record:
            continue  # not found
        entrez_id = record['_id']
        if entrez_id in manual_ensembl_ids:
            record['ensembl'] = {'gene': manual_ensembl_ids[entrez_id]}
        if 'ensembl' not in record:
            # Not every record carries 'query'; fall back to the entrez id so
            # reporting the problem cannot itself raise KeyError.
            label = record.get('query', entrez_id)
            self._bad_genes.append(label)
            logging.warning('Warning: %s is not registered in ensembl', label)
            continue

        ensembl = record['ensembl']
        if isinstance(ensembl, list):
            # sometimes ensembl returns a list, each element corresponds to
            # an id; keep the first one that check_ensemblId accepts and stop
            # checking once it is found.
            print('use ensembl API to check ensemblid')
            gene = next(
                (entry['gene'] for entry in ensembl
                 if check_ensemblId(entry['gene'])),
                None,
            )
            if gene is None:
                continue
        else:
            gene = ensembl['gene']

        # The default is applied to every record (including the manually
        # patched ones) and both builds go through the same chromosome filter.
        genomic_pos_hg19 = _first_valid_position(
            record.setdefault('genomic_pos_hg19', {}))
        genomic_pos = _first_valid_position(record.get('genomic_pos', {}))

        if 'exac' in record and 'all' in record['exac']:
            exac_all = record['exac']['all']
        else:
            exac_all = {}

        data[gene] = [
            entrez_id,
            exac_all.get('p_li', -1),      # pLI
            exac_all.get('p_rec', -1),     # pRec
            exac_all.get('mis_z', -1),     # mis_z
            json.dumps(genomic_pos_hg19),  # genomic_pos_hg19
            json.dumps(genomic_pos),       # genomic_pos
            record['symbol'],
            json.dumps(record.get('alias', [])),  # alias
        ]

    # update
    update_db(self.db_conn, 'genes', fields, data)


def _first_valid_position(position):
    """Reduce a genomic position value to a single entry.

    A list is reduced to its first element whose ``chr`` is in
    ``VALID_CHROMOSOMES`` (``None`` when no element qualifies); any other
    value is returned unchanged.
    """
    if not isinstance(position, list):
        return position
    for candidate in position:
        if candidate['chr'] in VALID_CHROMOSOMES:
            return candidate
    return None
def _fetch_one(self, field):
    """Return column *field* of the ``genes`` row whose id is ``self.id``.

    The local database is consulted first.  When the row is missing, or its
    *field* is NULL, mygene is queried through ``my_gene``, the answer is
    written to the database with ``_update_db`` and the row is read again.

    Args:
        self: object providing ``db_conn`` and ``id``.
        field: name of the column to return.

    Returns:
        The stored value, or ``None`` when the gene is still absent from the
        database after the refresh.
    """
    db_c = self.db_conn.cursor()

    def select_row():
        # dict_factory is only handed real rows, so a missing gene is
        # reported as None whatever the helper does with a None row.
        db_c.execute('SELECT * FROM genes WHERE id=?', (self.id,))
        row = db_c.fetchone()
        return None if row is None else dict_factory(db_c, row)

    db_gene = select_row()
    if db_gene is None or db_gene[field] is None:
        # query mygene
        print('query mygene')
        mg = my_gene(self.id)
        # update db (nothing to store when the service returned nothing)
        if mg is not None:
            _update_db(self, [mg])
        # refetch
        db_gene = select_row()
    if db_gene is None:
        return None
    return db_gene[field]
def _fetch_many(self, field):
    """Return ``{gene id: value of *field*}`` for the genes in ``self.ids``.

    Values already present in the local ``genes`` table are used directly.
    Ids with no stored value, other than those listed in
    ``self._bad_genes``, are looked up with ``my_genes``, written to the
    database with ``_update_db`` and then read back.

    Args:
        self: object providing ``db_conn``, ``ids`` and ``_bad_genes``.
        field: name of the column to collect.

    Returns:
        dict keyed by gene id; empty when ``self.ids`` is ``None`` or empty.
    """
    ids = self.ids or []
    if not ids:
        # Nothing to look up (ids defaults to None in the constructor).
        return {}

    db_c = self.db_conn.cursor()
    cached = {}
    for row in batch_query(db_c, 'genes', ids):
        record = dict_factory(db_c, row)
        cached[record['id']] = record[field]

    known_bad = set(self._bad_genes)  # one set build instead of list scans
    final = {}
    new_genes = []
    for gene_id in ids:
        if cached.get(gene_id) is not None:
            final[gene_id] = cached[gene_id]
        elif gene_id not in known_bad:
            new_genes.append(gene_id)

    if new_genes:
        print('querying mygenes from fetch_many')
        new_result = my_genes(new_genes)
        # update database
        _update_db(self, new_result)
        # query again
        for row in batch_query(db_c, 'genes', new_genes):
            record = dict_factory(db_c, row)
            final[record['id']] = record[field]
    return final
class Gene(object):
    """Annotations of a single gene, identified by its Ensembl id.

    Every property checks the local database first; if the value is not
    available there it is fetched through ``_fetch_one`` (which queries
    mygene and stores the answer).  Non-``None`` values are kept on the
    instance so later reads do not query again.
    """

    def __init__(self, db_conn, id=None):
        """Validate *id*, make sure the ``genes`` table exists and keep *db_conn*.

        Raises:
            ValueError: if *id* is ``None`` or does not start with ``ENSG``.
        """
        # id ok?  (None is the default and must be rejected like any other
        # unrecognised id instead of failing on the slice.)
        if id is None or id[:4] != 'ENSG':
            raise ValueError("can't recognise gene id. It has to be an ensembl id!")
        self.id = id
        _initiate_db(db_conn)
        self.db_conn = db_conn
        # _update_db records unresolved queries here.
        self._bad_genes = []

    def _cached(self, attr, field, from_json=False):
        """Return the value cached under *attr*, fetching *field* on a miss.

        With *from_json* the fetched text is decoded with ``json.loads``;
        a ``None`` fetch result is returned as ``None`` without decoding.
        """
        value = getattr(self, attr, None)
        if value is None:
            value = _fetch_one(self, field)
            if from_json and value is not None:
                value = json.loads(value)
            setattr(self, attr, value)
        return value

    @property
    def entrez_id(self):
        """Value of the ``entrez_id`` column for this gene."""
        return self._cached('_entrez_id', 'entrez_id')

    @property
    def pLI(self):
        """Value of the ``pLI`` column for this gene."""
        return self._cached('_pLI', 'pLI')

    @property
    def pRec(self):
        """Value of the ``pRec`` column for this gene."""
        return self._cached('_pRec', 'pRec')

    @property
    def mis_z(self):
        """Value of the ``mis_z`` column for this gene."""
        return self._cached('_mis_z', 'mis_z')

    @property
    def genomic_pos_hg19(self):
        """JSON-decoded ``genomic_pos_hg19`` column for this gene."""
        return self._cached('_gp19', 'genomic_pos_hg19', from_json=True)

    @property
    def genomic_pos(self):
        """JSON-decoded ``genomic_pos`` column for this gene."""
        return self._cached('_gp', 'genomic_pos', from_json=True)

    @property
    def symbol(self):
        """Value of the ``symbol`` column for this gene."""
        return self._cached('_symbol', 'symbol')

    @property
    def alias(self):
        """JSON-decoded ``alias`` column for this gene."""
        return self._cached('_alias', 'alias', from_json=True)
class Genes(object):
    """Annotations of a collection of genes, identified by Ensembl ids.

    Every property returns a dict keyed by gene id.  The local database is
    checked first; ids without a stored value are fetched through
    ``_fetch_many`` (which queries mygene and stores the answers).  The
    resulting dict is kept on the instance after the first read.
    """

    def __init__(self, db_conn, ids=None):
        """Validate *ids*, make sure the ``genes`` table exists and keep *db_conn*.

        Raises:
            ValueError: if any id does not start with ``ENSG``.
        """
        # id ok?
        if ids:
            for gene_id in ids:
                if gene_id[:4] != 'ENSG':
                    raise ValueError('id has to be an Ensembl id, such as ENSG00000050453. (%s)' % gene_id)
        _initiate_db(db_conn)
        self.db_conn = db_conn
        self.ids = ids
        self._bad_genes = []  # this is for bad ids that have no ordinary entries in mygenes. Avoid repetitive queries.

    def entrezIds_to_ensemblIds(self, entrez_ids=None):
        """Convert entrez ids to Ensembl ids.

        Ids listed in ``self._bad_genes`` are ignored.  Ids not found in the
        local database are looked up with ``my_genes``, stored with
        ``_update_db`` and read back.

        Returns:
            dict mapping entrez id to a list of Ensembl ids; empty when there
            is nothing to look up (including ``entrez_ids=None``).
        """
        entrez_ids = list(set(entrez_ids or []) - set(self._bad_genes))
        if not entrez_ids:
            return {}
        db_c = self.db_conn.cursor()
        data = {}
        for row in batch_query(db_c, 'genes', entrez_ids, 'entrez_id'):
            record = dict_factory(db_c, row)
            data.setdefault(record['entrez_id'], []).append(record['id'])
        final = {}
        new_genes = []
        for entrez in entrez_ids:
            if data.get(entrez) is not None:
                final[entrez] = data[entrez]
            else:
                new_genes.append(entrez)
        if new_genes:
            print('querying mygenes from entrezIds_to_ensemblIds')
            new_result = my_genes(new_genes)
            # update database
            _update_db(self, new_result)
            # query again
            for row in batch_query(db_c, 'genes', new_genes, 'entrez_id'):
                record = dict_factory(db_c, row)
                final.setdefault(record['entrez_id'], []).append(record['id'])
        return final

    def symbols_to_ensemblIds(self, symbols=None):
        """Convert gene symbols to Ensembl ids.

        Symbols listed in ``self._bad_genes`` are ignored.  Resolution order:
        the ``symbol`` column, then a ``LIKE`` match of the quoted symbol
        against the ``alias`` column, then ``my_genes_by_symbol`` for whatever
        is left (results are stored with ``_update_db`` and read back by
        ``symbol``).

        Returns:
            dict mapping symbol to Ensembl id; empty when there is nothing to
            look up (including ``symbols=None``).
        """
        # remove bad symbols
        symbols = list(set(symbols or []) - set(self._bad_genes))
        if not symbols:
            return {}
        db_c = self.db_conn.cursor()
        # seek symbols
        data = {}
        for row in batch_query(db_c, 'genes', symbols, 'symbol'):
            record = dict_factory(db_c, row)
            data[record['symbol']] = record['id']
        final = {}
        new_genes = []
        for symbol in symbols:
            if data.get(symbol) is not None:
                final[symbol] = data[symbol]
            else:
                new_genes.append(symbol)
        # seek aliases
        sql = '''SELECT * FROM genes WHERE alias like ? '''
        unresolved = []
        for symbol in new_genes:
            hit = db_c.execute(sql, ('%"' + symbol + '"%',)).fetchone()
            if hit is not None:
                final[symbol] = hit[0]  # first column of the row is the id
            else:
                unresolved.append(symbol)
        new_genes = unresolved
        if new_genes:
            print('querying mygenes for symbols to ensemblIds')
            new_result = my_genes_by_symbol(new_genes, species='human')
            # update database
            _update_db(self, new_result)
            # query again
            for row in batch_query(db_c, 'genes', new_genes, 'symbol'):
                record = dict_factory(db_c, row)
                final[record['symbol']] = record['id']
        return final

    def _cached(self, attr, field, from_json=False):
        """Return the dict cached under *attr*, fetching *field* on a miss.

        With *from_json* every non-``None`` value is decoded with
        ``json.loads``; ``None`` values are passed through undecoded.
        """
        value = getattr(self, attr, None)
        if value is None:
            value = _fetch_many(self, field)
            if from_json:
                value = {
                    key: (json.loads(text) if text is not None else None)
                    for key, text in value.items()
                }
            setattr(self, attr, value)
        return value

    @property
    def entrez_id(self):
        """``{gene id: entrez_id}`` for ``self.ids``."""
        return self._cached('_entrez_id', 'entrez_id')

    @property
    def pLI(self):
        """``{gene id: pLI}`` for ``self.ids``."""
        return self._cached('_pLI', 'pLI')

    @property
    def pRec(self):
        """``{gene id: pRec}`` for ``self.ids``."""
        return self._cached('_pRec', 'pRec')

    @property
    def mis_z(self):
        """``{gene id: mis_z}`` for ``self.ids``."""
        return self._cached('_mis_z', 'mis_z')

    @property
    def genomic_pos_hg19(self):
        """``{gene id: JSON-decoded genomic_pos_hg19}`` for ``self.ids``."""
        return self._cached('_gp19', 'genomic_pos_hg19', from_json=True)

    @property
    def genomic_pos(self):
        """``{gene id: JSON-decoded genomic_pos}`` for ``self.ids``."""
        return self._cached('_gp', 'genomic_pos', from_json=True)

    @property
    def symbol(self):
        """``{gene id: symbol}`` for ``self.ids``."""
        return self._cached('_symbol', 'symbol')

    @property
    def alias(self):
        """``{gene id: JSON-decoded alias}`` for ``self.ids``."""
        return self._cached('_alias', 'alias', from_json=True)