-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_find_new_kinase_pdb.py
executable file
·298 lines (244 loc) · 13.4 KB
/
1_find_new_kinase_pdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/usr/bin/env python3
import sys,os
##########################################################################
#
# Peter MU Ung @ MSSM/Yale
#
# v1 19.12.28
# v2 20.01.12 generate MSA alignment for all new PDB for Konformation
# v3 20.02.20 clean up read/write of result files
#
# Search RCSB Protein Data Bank for all kinase-associated structures with
# Queries, then compare the new query results to existing internal kinase
# 1ATP-superposed PDB library (known_list) to find those that aren't in the
# library yet. These new entries are checked (seq length, seq identity to
# human kinome) before downloading the kinase structures, which are then
# superposed onto 1ATP using a standard reference residue set. The new
# results are added to the existing internal kinase library list (known_list)
#
# * as long as the (known_list) is kept current and the downloaded PDBs
# are placed into the correct folder, this script can be made into a
# repeating "cron" job to update the kinase structure library.
#
# * tried using MPI to speed up the download of FASTA and PDB but seems to
# run into server dockdown if multiple jobs are retrieving from the same
# IP. Could be a defense mechanism against DOS attack?? Using serial
# retrieve with a delay of 0.1s in between seem to work fine.
#
##########################################################################
msg = '''
> {0}
\t-r <file> [ Read in Parameter file, search RCSB PDB for kinases ]\n
Optional:
\t-set [ Generate Parameter file used by "-r" ]
\t-p <file> [ List of PDB to download directly (format: pdb_id\tchain_id) ]
'''.format(sys.argv[0])
if len(sys.argv) == 1: sys.exit(msg)
#######################################################################
import re
import pandas as pd
from Bio import PDB
from Bio import SeqIO
from tqdm import tqdm
from pathos import multiprocessing
from argparse import ArgumentParser
from x_pdb_superpose import SuperposePDB
from x_query_var import ParseParameterFile
from x_search_align import CacheSeqDatabase
from x_query_var import GenerateTemplSetupScript
from x_search_align import GenerateProfileAlignment
from x_kinase_pdb_search_query import DownloadFASTA
from x_kinase_pdb_search_query import DownloadNewPDB
from x_kinase_pdb_search_query import CheckExistingPDBs
from x_kinase_pdb_search_query import ReadKnownKinaseList
from x_kinase_pdb_search_query import SearchRCSBWithQuery
from x_kinase_pdb_search_query import KinaseDownloadQueries
from x_kinase_pdb_search_query import CheckKinaseSeqIdentity
#######################################################################
def main():
args = cmdlineparse()
if args.set:
if args.read_param:
GenerateTemplSetupScript(args.read_param)
sys.exit('\n \033[34m## Printing out Setting File. Rerun with -r <setting file> to run ##\033[0m')
else:
GenerateTemplSetupScript('kinase_query.template.setup')
sys.exit('\n \033[34m## Printing out Setting File. Rerun with -r <setting file> to run ##\033[0m')
if args.read_param:
parms = ParseParameterFile(args.read_param)
print('\n \033[34m## Reading in Parameter file ##\033[0m')
if args.use_pdb:
new_pdb_ids = pd.read_csv(args.use_pdb, comment='#', header=None, sep='\s+').iloc[:,:].to_numpy()
print('\n \033[34m## Work with a list of pre-supplied PDB: \033[31m{}\033[0m'.format(len(new_pdb_ids)))
else:
new_pdb_ids = None
#################
## Get basic settings from parameter file
work_dir = parms['WORKDIR']
outpref = parms['OUTPREF']
len_cutoff = int(parms['LENCUTOFF'])
idt_cutoff = float(parms['IDTCUTOFF'])
known_pdbs = ReadKnownKinaseList(parms['RDKNOWNKIN'])
read_nonkinase = parms['RDNONKIN']
read_checked = parms['RDCHECKED']
################
## Read from a list of PDBs for known non-kinase, will be used to
## subtract from retrieved entries
if not re.search('none', read_nonkinase, re.IGNORECASE):
if os.path.isfile(read_nonkinase):
non_kinase = pd.read_csv(read_nonkinase,sep='\s+',comment='#',header=None,names=['pdb_idx'])
else:
print(' \033[31m#1# Param ERROR: Supplied Non-kinase List file is not found: \033[0m\n'+read_nonkinase)
non_kinase = pd.DataFrame([], columns=['pdb_idx'])
else:
non_kinase = pd.DataFrame([], columns=['pdb_idx'])
print(' > Found Known Non-Kinase entries: \033[31m{0}\n - {1}\033[0m\n'.format(len(non_kinase), read_nonkinase))
## Look for previously checked PDBs from the query list
## to be used to Subtract reviewed PDB from the query list
if not re.search('none', read_checked, re.IGNORECASE):
if os.path.isfile(read_checked):
checked_df = pd.read_csv(read_checked,sep='\s+',header=None,comment='#',names=['pdb_id'])
else:
print(' \033[31m#1# Param ERROR: Supplied Checked-Kinase List file is not found: \033[0m\n'+read_checked)
checked_df = pd.DataFrame([], columns=['pdb_id'])
else:
checked_df = pd.DataFrame([], columns=['pdb_id'])
print(' > Found checked-kinase entries: \033[31m{0}\n - {1}\033[0m\n'.format(len(checked_df), read_checked))
################################################################
new_ents = [] # store new pdb entries found from queries
## Skip all the checking steps and directly download/extract new PDB if 'use_pdb'
if new_pdb_ids is None:
## create the predefined queries to search for all protein kinases
kinase_queries = KinaseDownloadQueries( parms )
## Get the list of PDBs that hits the queries, filter out the known ones
print('\n \033[34m## Checking RCSB PDB REST API for entries hitting the "Kinase" queries ##\033[0m')
query_rt = [SearchRCSBWithQuery(key, val) for key, val in kinase_queries.items()]
query_et = list(set([pdb_id for sublist in query_rt for pdb_id in sublist])) # unpack
que_ents = sorted(CheckExistingPDBs( query_et, known_pdbs ))
print('\n > Found new PDB entries: \033[31m{}\033[0m'.format(len(que_ents)))
## Subtract previously checked PDB (kinase or nonkinase) from the entry list
if not re.search('none', read_checked, re.IGNORECASE):
print(' > Found Previously-Checked-PDB List: \033[31m{}\033[0m'.format(read_checked))
que_df = pd.DataFrame(que_ents, columns=['pdb_id'])
new_ents = que_df[~que_df.pdb_id.isin(checked_df.pdb_id.tolist())].pdb_id.tolist()
print(' > Entries after subtracting from known non-kinases: \033[31m{}\033[0m'.format(len(new_ents)))
else:
new_ents = que_ents
## End the job if no new PDB is found
if not len(new_ents):
sys.exit('\n \033[31m### Checked and No New PDB Found ###\033[0m\n')
#####################################
## download FASTA of the new entries
print('\n \033[34m## Downloading FASTA of the New Entries ##\033[0m')
dl_fasta = [ DownloadFASTA(pdb_id) for pdb_id in tqdm(new_ents) ]
raw_fasta = list([pdb_id for sublist in dl_fasta for pdb_id in sublist]) # unpack
print(' > FASTA search of entries: {0} -> \033[31m{1}\033[0m chains'.format(len(new_ents), len(raw_fasta)))
raw_f_df = pd.DataFrame(raw_fasta, columns=['pdb_idx','pdb_id','chain_id','length','seq'])
## Subtract retrieved entries that are known to be non-kinase
if not re.search('none', read_nonkinase, re.IGNORECASE):
raw_f_df = raw_f_df[~raw_f_df.pdb_idx.isin(non_kinase.pdb_idx.tolist())].reset_index()
print(' > Entries after subtracting from known non-kinases: \033[31m{}\033[0m'.format(len(raw_f_df)))
## remove sequences too short to be kinase catalytic domain
## remove duplicate sequences of different chains from same pdb_id, keep first one
clean_f_df = raw_f_df[ raw_f_df.length > len_cutoff ]
uniq_f_df = clean_f_df.drop_duplicates(subset=['pdb_id','seq'], keep='first').reset_index(drop=True)
print('\n \033[34m## Removing FASTA \033[31m < {}\033[0m \033[34mresidues ##\033[0m'.format(len_cutoff))
print(' > \033[31m{0}\033[0m/{1} FASTA passed the filter'.format(len(clean_f_df),len(raw_f_df)))
print(' > \033[31m{0}\033[0m/{1} FASTA are unique'.format(len(uniq_f_df),len(clean_f_df)))
#####################################
## Check if FASTA is kinase by comparing sequence identity to all canonical human kinases (kinome)
## output is Length>175, SeqIdent>40%, [[pdb_id, chain_id], [], [] ]
new_pdb_ids, nonkin_seq_df = CheckKinaseSeqIdentity( uniq_f_df, parms['KINOMEDB'], len_cutoff, idt_cutoff, outpref )
print(' > Found kinase seq: \033[31m{0}/{1}\033[0m'.format(len(new_pdb_ids), len(uniq_f_df)))
## write out all updated kinase structures (pdb_id, chain_id)
with open(parms['WTNEWKNOWN'], 'w') as fo:
for i in new_pdb_ids:
fo.write('{0} {1}\n'.format(i[0], i[1]))
## collect all non_kinases and write it out for future reference
nonkin_df = pd.concat( [non_kinase, raw_f_df[ raw_f_df.length <= len_cutoff ], nonkin_seq_df ], sort=True).drop_duplicates(subset='pdb_idx')
nonkin_df.pdb_id = nonkin_df.pdb_idx.apply(lambda x: x.split('_')[0])
nonkin_df.to_csv(parms['WTNONKIN'], header=None, index=None, columns=['pdb_idx'],sep='\t')
print(' > Found non-kinase PDB: \033[31m{}\033[0m'.format(len(nonkin_df)))
print(' > Saved non-kinase PDB: '+parms['WTNONKIN'])
########### new_pdb_ids #############
## By this stage, if 'new_pdb_ids' is still None, meaning no new PDB is found. Exit
if new_pdb_ids is None:
sys.exit('\n \033[31m### Checked and No New PDB Found ###\033[0m\n')
#####################################
mpi = multiprocessing.Pool()
## Download new PDB, extract the chain(s)
print('\n \033[34m## Downloading new PDB ##\033[0m')
oPD = DownloadNewPDB(work_dir=work_dir)
Temp = [x for x in tqdm(mpi.imap(oPD, new_pdb_ids), total=len(new_pdb_ids))]
# Temp = [ oPD(info) for info in tqdm(new_pdb_ids) ]
raw_pdbs = [pdb for pdb in Temp if pdb is not None]
print(' > Downloaded new PDB entries: \033[31m{0}/{1}\033[0m'.format(len(raw_pdbs),len(new_pdb_ids)))
## updated known kinase list by adding new ones to it; chain_id is also recorded
for item in new_pdb_ids:
if item[0] not in known_pdbs:
known_pdbs[item[0]] = [item[1]]
else:
known_pdbs[item[0]].append(item[1])
## write out all updated + exisitng kinase structures (with output extension)
print('\n \033[34m## Finalize all results ##\033[0m')
with open(parms['WTALLKNOWN'], 'w') as fo:
for i in sorted(known_pdbs.keys()):
for chain in known_pdbs[i]:
fo.write('{0}_{1}.{2}\n'.format(i, chain, parms['OUTEXT']))
print('\n > Total number of known kinase PDB_x structures: \033[31m{}\033[0m'.format(len(known_pdbs)))
## write out all checked PDB, may use this next time to skip these checked ones
with open(parms['WTCHECKED'], 'w') as fc:
for pdb_id in sorted(list(set(checked_df.pdb_id.tolist() + new_ents + list(known_pdbs.keys() ))) ):
fc.write(pdb_id+'\n')
print('\n \033[34m> All checked PDBs are listed in:\033[0m \033[31m{}\033[0m'.format(parms['WTCHECKED']))
#####################################
## Generate MSA alignment for new PDBs against known MSA alignment
refname = parms['REFPDB'].split('/')[-1].split('.pdb')[0]
print('\n \033[34m## Get MSA alignment for PDB structures ##\033[0m')
# Use canonical kinome dbase to align; suppress Blastp StdOut during MSA alignment
Fasta_Dict = CacheSeqDatabase(parms['KINOMEGPDB'])
oAl = GenerateProfileAlignment( hom_dir=parms['WORKDIR'], tmp_dir=parms['WORKDIR'],
rst_dir=parms['WORKDIR'], ref_pdb=refname,
f_nogap=parms['KINOMEDB'], f_dict=Fasta_Dict )
Aligned_Fasta = [x for x in tqdm(mpi.imap(oAl, raw_pdbs),total=len(raw_pdbs))]
# Aligned_Fasta = [ oAl(pdb) for pdb in tqdm(raw_pdbs) ]
mpi.close()
mpi.join()
# Remove any input kinase that have been rejected due to low seq identity
with open('{0}.fasta'.format(outpref), 'w') as fo:
SeqIO.write( [Fasta_Dict[refname]]+[x for x in Aligned_Fasta if x is not None], fo, 'fasta' )
#####################################
# sys.exit()
## Superpose new PDBs to 1ATP C-lobe
## limit 500 structures per pymol session to avoid overflow the memory
step = 500
if len(raw_pdbs) > step:
a = 0
for idx, i in enumerate(range(step, len(raw_pdbs)+step, step)):
if i > len(raw_pdbs):
i = len(raw_pdbs)
print('\n \033[34m## Superposing all PDB structure with extension \033[31m{0}.{1}\033[0m \033[34m##\033[0m'.format(idx, parms['OUTEXT']))
SuperposePDB( parms['PYMOL'], parms['REFPDB'], raw_pdbs[a:i], None,
parms['OUTEXT'], parms['REFRES'],
outpref+'.'+str(idx+1) )
a += i
else:
print('\n \033[34m## Superposing all PDB structure with extension \033[31m{}\033[0m \033[34m##\033[0m'.format(parms['OUTEXT']))
SuperposePDB( parms['PYMOL'], parms['REFPDB'], raw_pdbs, None,
parms['OUTEXT'], parms['REFRES'], outpref )
##########################################################################################
def cmdlineparse():
p = ArgumentParser(description="command line arguments")
p.add_argument('-set', action='store_true', help='Print out Parameter file used by "-r"')
p.add_argument('-r', dest='read_param', required=False,
help='Read in Parameter file, search RCSB PDB for kinases')
p.add_argument('-p', dest='use_pdb', required=False,
help='List of kinases to download directly (format: pdb_id\tchain_id)')
return p.parse_args()
########################################################################
if __name__ == '__main__':
main()
#
# v1.0 20.01.10
#
#