-
Notifications
You must be signed in to change notification settings - Fork 0
/
x_kinase_pdb_search_query.py
410 lines (353 loc) · 15.4 KB
/
x_kinase_pdb_search_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
#!/usr/bin/env python3
import io
import os
import re
import sys
import time
import requests
import pandas as pd
from tqdm import tqdm
from pathos import multiprocessing
from Bio import PDB
from Bio.PDB import Select
from x_search_align import BlastpPairwiseIdentity
from x_pdb_modif_resid import ReplacePDBModifiedAA
#######################################################################################
#
# v1.0 20.01.02
#
# Bug - 20.01.12 - Biopython 1.74/1.76 have issue with PDB writeout. v1.72 has no issue
# > AttributeError: 'Atom' object has no attribute 'disordered_get_list'
# to go around this issue, generate an intermediate file instead
#
#
############################################################################################
## search RCSB PDB site with predefined kinase search queries,
## download those that are not in internal library already
def DownloadFASTA( pdb_id ):
peptides = []
## use 'requests' to download fasta and output the fasta
download_fasta = 'https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList={0}&compressionType=uncompressed'.format(pdb_id)
fasta_url = requests.get(download_fasta)
## delay retrieve by 0.1s to avoid server from download lockout (prevent downloading)
time.sleep(0.1)
## downloaded FASTA is byte, need to split it into individual chains by '>',
## then concatenate pdb_id and chain_id. Split string by 'SEQUENCE', then
## remove all '\n'
Tmp = [ x for x in fasta_url.content.decode().split('>') if x is not '' ]
for line in Tmp:
try:
pdb_tmp, seq_tmp = re.sub(':', '_', line).split('SEQUENCE')
except ValueError:
print(' > \033[31mFailed to retrieve:\033[0m '+pdb_id)
continue
pdb_idx = pdb_tmp.split('|')[0]
pdb_id, chain_id = pdb_idx.split('_')
seq = re.sub('\n', '', seq_tmp)
length = len(seq)
peptides.append([pdb_idx, pdb_id, chain_id, length ,seq])
return peptides
############################################################################################
## Get all PDB IDs that hit the query, compare query PDBs to known list and
## output the new ones not in the known list
def SearchRCSBWithQuery( key, search_query ):
pdb_url = 'http://www.rcsb.org/pdb/rest/search'
header = {'Content-Type': 'application/x-www-form-urlencoded'}
## seach RCSB with query using requests
response = requests.post(pdb_url, data=search_query, headers=header)
if response.status_code == 200:
print(" > Found PDB entries matching query: \033[31m{0} - {1}\033[0m".format(key, len(response.text)))
else:
sys.exit("\033[34m FATAL \033[0m- Failed to retrieve results")
## response.content will give a list of 4-letter PDB_ID
pdb_id_list = sorted([i for i in response.content.decode().split('\n') if i != ''])
print(' > Parsed PDB entries that matches: \033[31m{0}\033[0m'.format(len(pdb_id_list)))
return pdb_id_list
#############################################################################
## Check if the newly retreived PDB name already exists in internal library
def ReadKnownKinaseList( list_name ):
known_pdbs = {}
if list_name == 'None':
return known_pdbs
with open(list_name, 'r') as fi:
for l in fi:
if l.rstrip() and not re.search('#', l):
pdb_id, rest = l.rstrip().split('_')
chain_id = rest.split('.')[0]
if pdb_id not in known_pdbs:
known_pdbs[pdb_id] = [chain_id]
else:
known_pdbs[pdb_id].append(chain_id)
print(' > Found Known Kinase entries: \033[31m{0}\n - {1}\033[0m\n'.format(len(known_pdbs), list_name))
return known_pdbs
## First open the known_pdb.list, check if query PDB is in known_pdb.list already
## return those that are not found in known_pdb.list
def CheckExistingPDBs( pdb_list, known_pdbs ):
return [ pdb_id for pdb_id in pdb_list if pdb_id not in known_pdbs ]
##########################################################################################
## Download PDB from RCSB then extract the Chain of interest, return path/name of PDB
class DownloadNewPDB(object):
def __init__( self, work_dir='' ):
self.work_dir = work_dir
def __call__( self, info ):
return self.download_pdb( info )
def download_pdb( self, info ):
pdb_id, chain_id = info
## Check if atom has alternative position, if so, keep 'A' position and remove the flag
## but somehow this class doesn't seem to function well
class NotDisordered(Select):
def accept_atom(self, atom):
if not atom.is_disordered() or atom.get_altloc() == 'A':
atom.set_altloc(' ')
return True
else:
return False
## BioPython downloads PDB but it gives a lowercase name in pdb{}.ent format
biopdb_name = '{0}/pdb{1}.ent'.format(self.work_dir, pdb_id.lower())
biopdb_modf = '{0}/pdb{1}.mod.ent'.format(self.work_dir, pdb_id.lower())
if not os.path.isfile(biopdb_modf):
try:
PDB.PDBList(verbose=False).retrieve_pdb_file(pdb_id, pdir=self.work_dir,
obsolete=False, file_format='pdb')
except FileNotFoundError:
print(' \033[31m> ERROR: BioPython cannot download PDB: \033[0m'+pdb_id)
return None
## Replace modified AA to avoid mis-recognition in biopython readin
## Replace disordered atoms and keep only the "A" variant
ReplacePDBModifiedAA(biopdb_name, biopdb_modf)
os.system('grep "REMARK " {0} > {0}.remark'.format(biopdb_modf))
with open(biopdb_modf, 'r') as fi:
remarks = [l for l in fi if re.search('REMARK HET ',l)]
## Read the PDB file and extract the chain from structure[0]
try:
model = PDB.PDBParser(PERMISSIVE=1,QUIET=1).get_structure(pdb_id, biopdb_modf)[0]
except KeyError:
print(' \033[31m> ERROR: BioPython cannot read in PDB: \033[0m'+biopdb_modf)
return None
except ValueError:
print(' \033[31m> ERROR: PDB file is empty: \033[0m'+biopdb_modf)
return None
### Bug alert: as of 20.02.18, Biopython dev hasn't come up with good
### strategy to fix the 'atom.disordered_get_list()' issue with alternative
### position of residue side chains. To go around this, will physically
### remove "B" variant and keep only "A" variant in
io = PDB.PDBIO()
io.set_structure(model[chain_id])
io.save('{0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id),
select=NotDisordered())
# Attach REMARK to end of PDB as safekeeping
os.system('cat {0}/{1}_{2}.pdb {3}.remark > {1}.temp'.format(self.work_dir, pdb_id, chain_id, biopdb_modf))
os.system('mv {1}.temp {0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id))
# os.system('mv {1} {0}/{2}.ent'.format(self.work_dir, biopdb_name, pdb_id))
# os.system('bzip2 -f {0}/{1}.ent'.format(self.work_dir, pdb_id))
# os.system('rm {0} {0}.remark'.format(biopdb_modf))
return '{0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id)
############################################################################################
def RunBlastDB( data ):
kinase_db, pdb_id, chain_id, pdb_idx, seq = data
fasta_file = '{0}_{1}.fasta'.format(pdb_id, chain_id)
with open(fasta_file, 'w') as fo:
fo.write('>'+pdb_idx+'\n'+seq+'\n')
Identity = BlastpPairwiseIdentity( '.', fasta_file, kinase_db )
if Identity is not None:
imat_df = pd.DataFrame(Identity, columns=['kinase','length','ident','simi'])
else:
imat_df = None
return [data, imat_df]
############################################################################################
## Check FASTA is kinase by comparing sequence identity to all canonical human kinases (kinome)
## output is a Dictionary of pdb_id to chain_id
def CheckKinaseSeqIdentity( uniq_f_df, kinase_db, len_cutoff, idt_cutoff, outpref ):
# good: length match > 175, seq ident > 40%
# check: length match > 175, seq ident 30-40%
# bad: length match > 175, seq ident < 30%
# no: length match <= 175
f_ok = open(outpref+'.good_seq_ident.txt', 'w')
f_ok.write('pdb_idx\tlength\tident\tsimi\n')
f_chk = open(outpref+'.check_seq_ident.txt', 'w')
f_chk.write('pdb_idx\tlength\tident\tsimi\n')
f_bad = open(outpref+'.bad_seq_ident.txt', 'w')
f_bad.write('pdb_idx\tlength\tident\tsimi\n')
f_no = open(outpref+'.no_seq_ident.txt', 'w')
f_no.write('pdb_idx\tlength\tident\tsimi\n')
new_pdb_ids = [] # collection of confirmed kinases
non_kin_idx = [] # collection of non-kinase pdb_idx
print('\n \033[34m## Comparing sequence identity of unique PDB FASTA ##\033[0m')
## Run Blastp in parallel
Data = [[kinase_db, r.pdb_id, r.chain_id, r.pdb_idx, r.seq] for idx, r in uniq_f_df.iterrows()]
mpi = multiprocessing.Pool()
df_l = [x for x in tqdm(mpi.imap(RunBlastDB, Data), total=len(Data))]
mpi.close()
mpi.join()
for items in df_l:
data, imat_df = items
kinase_db, pdb_id, chain_id, pdb_idx, seq = data
fasta_file = '{0}_{1}.fasta'.format(pdb_id, chain_id)
if imat_df is None:
f_no.write(fasta_file+'\t no output\n')
continue
## Required matching to at least 200 residues as cutoff, if fewer than 200 res,
## unlikely to be a kinase catalytic domain. if seq ident < 40% of any
## known human kinases, need to check and confirm if it is a bacterial
## or viral kinases; mammalian kinases are very similar, even to chicken/fish
sele_df = imat_df[ imat_df.length > (len_cutoff - 40) ]
info = '{0}_{1}\t{2}\t{3}\t{4}\n'.format(pdb_id, chain_id,
imat_df.length[0], imat_df.ident[0], imat_df.simi[0])
## catch a strange error with ident.iloc[0] key error
try:
test = sele_df.ident
except KeyError:
print(' \033[31mERROR:\033[0m "sele_df.ident[0]" - '+pdb_idx)
continue
if len(sele_df) == 0 or len(sele_df.ident) == 0:
non_kin_idx.append('{0}_{1}'.format(pdb_id, chain_id))
f_no.write(info)
continue
elif sele_df.ident.iloc[0] > idt_cutoff:
f_ok.write(info)
new_pdb_ids.append( [pdb_id, chain_id] )
elif sele_df.ident.iloc[0] > (idt_cutoff - 10.):
f_chk.write(info)
else:
non_kin_idx.append('{0}_{1}'.format(pdb_id, chain_id))
f_bad.write(info)
f_bad.close()
f_chk.close()
f_ok.close()
f_no.close()
nonkin_seq_df = uniq_f_df[ uniq_f_df.pdb_idx.isin(non_kin_idx) ]
return new_pdb_ids, nonkin_seq_df
########################################################################
########### These Queries are generalized Keyword search #############
## Search RCSB Protein Data Bank using the 'AdvancedKeywordQuery' keywords:, then
## select for 'X-RAY structures' -- NMR are unlikely to be the catalytic domain
## These queries are derived from the 'Query Details' but replacing some
## of the key factors
### EnzymeClassificationTree Search for 2.7.11.1: Non-specific serine/threonine protein kinase
### Experimental Method is X-RAY
n_stk_query = '''
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: 2.7.11.1</description>
<keywords>2.7.11.1</keywords>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<conjunctionType>and</conjunctionType>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
<description>Experimental Method is X-RAY</description>
<mvStructure.expMethod.value>X-RAY</mvStructure.expMethod.value>
<mvStructure.expMethod.exclusive>y</mvStructure.expMethod.exclusive>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
'''
### EnzymeClassificationTree Search for 2.7.11.30: Receptor protein serine/threonine kinase
### Experimental Method is X-RAY
r_stk_query = '''
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: 2.7.11.30</description>
<keywords>2.7.11.30</keywords>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<conjunctionType>and</conjunctionType>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
<description>Experimental Method is X-RAY</description>
<mvStructure.expMethod.value>X-RAY</mvStructure.expMethod.value>
<mvStructure.expMethod.exclusive>y</mvStructure.expMethod.exclusive>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
'''
### EnzymeClassificationTree Search for 2.7.10.2: Non-specific protein-tyrosine kinase
### Experimental Method is X-RAY
n_tk_query = '''
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: 2.7.10.2</description>
<keywords>2.7.10.2</keywords>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<conjunctionType>and</conjunctionType>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
<description>Experimental Method is X-RAY</description>
<mvStructure.expMethod.value>X-RAY</mvStructure.expMethod.value>
<mvStructure.expMethod.exclusive>y</mvStructure.expMethod.exclusive>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
'''
### EnzymeClassificationTree Search for 2.7.10.1: Receptor protein-tyrosine kinase
### Experimental Method is X-RAY
r_tk_query = '''
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: 2.7.10.1</description>
<keywords>2.7.10.1</keywords>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<conjunctionType>and</conjunctionType>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
<description>Experimental Method is X-RAY</description>
<mvStructure.expMethod.value>X-RAY</mvStructure.expMethod.value>
<mvStructure.expMethod.exclusive>y</mvStructure.expMethod.exclusive>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
'''
### EnzymeClassificationTree Search for 2.7.12.1: Dual-specificity kinase
### Experimental Method is X-RAY
dual_k_query = '''
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: 2.7.12.1</description>
<keywords>2.7.12.1</keywords>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<conjunctionType>and</conjunctionType>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
<description>Experimental Method is X-RAY</description>
<mvStructure.expMethod.value>X-RAY</mvStructure.expMethod.value>
<mvStructure.expMethod.exclusive>y</mvStructure.expMethod.exclusive>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
'''
def KinaseDownloadQueries( parms ):
return {
'NonSpec_stk': n_stk_query, # non-specific S/T kinase
'Recp_stk': r_stk_query, # receptor S/T kinase
'NonSpec_tk': n_tk_query, # non-specific Tyr kinase
'Recp_tk': r_tk_query, # receptor Tyr kinase
'DualSpec_k': dual_k_query # dual-specific kinase
}