4_classify_proteins_vs2_results.py
'''
Classify the bacterial proteins recovered in the nr screening as
prophage-derived or not, based on the VirSorter2 (vs2) predictions made on
the contigs that encode them.
'''
import os, glob, argparse, multiprocessing
from functools import partial
import pandas as pd
import numpy as np
def parse_args():
    parser = argparse.ArgumentParser(description='Classify bacterial proteins as prophage-derived based on vs2 results')
    requiredArgs = parser.add_argument_group("Required Arguments")
    requiredArgs.add_argument('-n', '--run_name',
                              dest='run_name',
                              required=True,
                              help='name of the analysis (round1, round2...)'
                              )
    return parser.parse_args()
def get_CLs_to_analyze():
    '''
    Get the cl_ids from the 0_mrcas directories of the different analysis rounds.
    '''
    mrcas_files = glob.glob("/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/0_mrcas/*.txt")
    cl_ids = [os.path.basename(file).replace(".txt", "") for file in mrcas_files]
    return cl_ids
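# Example (hypothetical round directory): a file .../round1/0_mrcas/cl_0465.txt
# yields the cl_id "cl_0465".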
def get_contigs_status():
    '''
    Flag every contig listed in the .contigs files as "not_down" (never
    downloaded), "short" (< 1500 bp) or "ok", using the nr_contigs.stats
    length tables of the different rounds.
    '''
    # list .contigs files, get identifiers from them and init a dict
    contigs_files = glob.glob("/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/1_ipg/*.contigs")
    contigs_ids = list()
    for file in contigs_files:
        contigs_ids += [line.strip() for line in open(file).readlines()]
    # init them as "not_down"; contigs found in a .stats file are overridden below
    contigs_status = {contig: "not_down" for contig in contigs_ids}
    # get nr length stats from the different rounds
    contigs_stats_files = glob.glob("/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/3_vs2/0_fasta/nr_contigs.stats")
    for stats_file in contigs_stats_files:
        lines = [line.strip().split("\t") for line in open(stats_file).readlines()[1:]]
        # split the first column by space so the header leaves only the contig_id
        for line in lines:
            contig_id = line[0].split(" ")[0]
            length = int(line[1])
            if length < 1500:
                contigs_status[contig_id] = "short"
            else:
                contigs_status[contig_id] = "ok"
    return contigs_status
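# Shape of the returned dict (hypothetical contig accessions):
#   {"NZ_AAAA01000001.1": "ok", "NZ_BBBB01000002.1": "short", "NZ_CCCC01000003.1": "not_down"}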
def read_summary_file(summary_table):
    '''
    Read the summary file to determine which proteins need an entry in the
    final table: included hits whose superkingdom is not Viruses.
    '''
    summary_df = pd.read_csv(summary_table, sep="\t", header=0, index_col=0, low_memory=False)
    # keep only Bacteria (i.e. included hits that are not viral)
    bacteria_summary_df = summary_df[(summary_df["included"] == True) & (summary_df["superkingdom"] != "Viruses")]
    prots_ids = bacteria_summary_df.index.to_list()
    return prots_ids
def get_ipgs_for_prots(cl_id, prots_ids):
    '''
    Check the .ipg tables of all analysis rounds and concatenate them into a
    single list of lines. Then parse the merged ipg table and assign the
    unique ipg identifier associated with each protein.
    '''
    # first get the ipg of the protein that is in the tree. NB this does not
    # need to be the one in the .ipg_filtered table, so list all the .ipg
    # tables of the different rounds for the cl
    ipg_tables = glob.glob(f"/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/1_ipg/{cl_id}.ipg")
    ipg_lines = list()
    for ipg_table in ipg_tables:
        lines = [line.strip().split("\t") for line in open(ipg_table, "r").readlines()[1:]]
        ipg_lines += lines
    # iterate the ipg table(s) looking for the unique identifier of each protein
    prots_ipgs = dict()
    for line in ipg_lines:
        if line[6] in prots_ids:
            prots_ipgs[line[6]] = line[0]
    # store the inverse mapping too, k=ipg_id v=prot_id
    ipgs_prots = {ipg_id: prot for prot, ipg_id in prots_ipgs.items()}
    return prots_ipgs, ipgs_prots
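# Returned mappings (hypothetical accessions):
#   prots_ipgs = {"WP_000000001.1": "123456789"}   # protein accession -> ipg id
#   ipgs_prots = {"123456789": "WP_000000001.1"}   # inverse mapping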
def get_contigs_ipgs(cl_id, ipgs_prots):
    '''
    For each ipg, store the contigs associated with it, together with the
    [start, end] coordinates of the protein on each contig.
    '''
    # list all the .ipg_filtered tables of the different rounds for the cl
    ipg_tables = glob.glob(f"/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/1_ipg/{cl_id}.ipg_filtered")
    ipg_lines = list()
    for ipg_table in ipg_tables:
        lines = [line.strip().split("\t") for line in open(ipg_table, "r").readlines()[1:]]
        ipg_lines += lines
    # iterate the .ipg lines while storing the contigs downloaded for each
    # bacterial protein; every contig maps to a list of [start, end] pairs
    ipgs_contigs = {ipg_id: dict() for ipg_id in ipgs_prots}
    for line in ipg_lines:
        if line[0] in ipgs_contigs:
            if line[2] in ipgs_contigs[line[0]]:
                ipgs_contigs[line[0]][line[2]] += [[int(line[3]), int(line[4])]]
            else:
                ipgs_contigs[line[0]][line[2]] = [[int(line[3]), int(line[4])]]
        else:
            print(f"{line[0]}, {cl_id} is not in the contigs dictionary")
    return ipgs_contigs
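# Shape of the returned mapping (hypothetical ids):
#   {"123456789": {"NZ_AAAA01000001.1": [[5200, 6100]],
#                  "NZ_BBBB01000033.1": [[800, 1700], [44000, 44900]]}}
# i.e. ipg_id -> {contig_id: list of [prot_start, prot_end] pairs}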
def parse_vs2_results():
    '''
    List all the vs2 results from the different rounds and store them in a
    dict of the form contig_id -> {vs2_hit_id: [score, start, end]}.
    '''
    vs2_dirs = "/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/*/3_vs2/1_vs2_results"
    # read scores files
    vs2_scores_files = glob.glob(f"{vs2_dirs}/final-viral-score*.tsv")
    vs2_scores_lines = list()
    for file in vs2_scores_files:
        lines = [line.strip().split("\t") for line in open(file, "r").readlines()[1:]]  # discard header
        vs2_scores_lines += lines
    # store the contigs in the results in a dict along with their scores
    vs2_results_dict = {line[0].split("|")[0]: dict() for line in vs2_scores_lines}
    for line in vs2_scores_lines:
        contig = line[0].split("|")[0]
        vs2_results_dict[contig][line[0]] = [float(line[3])]
        # control for lt2gene contigs: set them to viral score 1, coordinates 1 to 10000000
        if "||lt2gene" in line[0]:
            vs2_results_dict[contig][line[0]] = [1, 1, 10000000]
    # read boundaries files
    vs2_boundary_files = glob.glob(f"{vs2_dirs}/final-viral-boundary*.tsv")
    vs2_boundary_lines = list()
    for file in vs2_boundary_files:
        lines = [line.strip().split("\t") for line in open(file, "r").readlines()[1:]]  # discard header
        vs2_boundary_lines += lines
    # store start and stop prophage coordinates for each contig
    for line in vs2_boundary_lines:
        contig = line[0]
        # not every contig in boundary.tsv appears in score.tsv; the reason is
        # unclear so far, so check whether that is the case
        if contig in vs2_results_dict:
            vs2_results_dict[contig][line[-1]] += [int(line[3]), int(line[4])]
        else:
            print(f"Contig {contig} not in scores file.")
            vs2_results_dict[contig] = {line[-1]: [0, int(line[3]), int(line[4])]}
    return vs2_results_dict
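# Sketch of the returned structure (made-up identifiers; the "||full" suffix
# is assumed here for illustration, only "||lt2gene" is handled explicitly above):
#   {"NZ_AAAA01000001.1": {"NZ_AAAA01000001.1||full": [0.95, 1, 52000],
#                          "NZ_AAAA01000001.1||lt2gene": [1, 1, 10000000]}}
# Score entries get their [start, end] appended from the boundary files.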
def check_protein_in_prophage(prot_start, prot_end, prophage_start, prophage_end):
    '''
    Return True if either end of the protein falls within the prophage coordinates.
    '''
    return (prophage_start < prot_start < prophage_end or
            prophage_start < prot_end < prophage_end)
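# Illustration with hypothetical coordinates: a protein at 5200-6100 inside a
# prophage predicted at 3000-20000 is flagged, a protein upstream of it is not.
#   check_protein_in_prophage(5200, 6100, 3000, 20000)  # True
#   check_protein_in_prophage(100, 900, 3000, 20000)    # False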
def classify_ipg(cl_id, bact_proteins, ipgs_prots, ipgs_contigs, vs2_results_dict, contigs_status):
    '''
    Build the final table, one row per bacterial protein, recording how many
    of the contigs encoding its ipg were short, not downloaded, or carry the
    protein inside a vs2 prophage prediction.
    '''
    # init a dataframe with the bacterial proteins as index
    columns = ["ipg_id", "prophage", "discard", "n_contigs", "n_prophage", "n_short", "n_notdown", "contigs", "prophage_contigs", "short_contigs", "notdown_contigs"]
    final_df = pd.DataFrame(0, index=bact_proteins, columns=columns)
    final_df["prophage"] = False
    final_df["discard"] = False
    final_df["contigs"] = ""
    final_df["short_contigs"] = ""
    final_df["prophage_contigs"] = ""
    # iterate the ipgs
    for ipg_id, prot_id in ipgs_prots.items():
        final_df.loc[prot_id, "ipg_id"] = ipg_id
        # list to store the prophage contigs ids, so they can be written together later
        prophage_contigs = list()
        # check if there are downloaded contigs associated with the ipg_id
        if ipgs_contigs[ipg_id]:
            # set the number of contigs inspected
            final_df.loc[prot_id, "n_contigs"] = len(ipgs_contigs[ipg_id])
            final_df.loc[prot_id, "contigs"] = ",".join(ipgs_contigs[ipg_id])
            # get short contigs number and ids
            short_contigs = [contig_id for contig_id in ipgs_contigs[ipg_id] if contigs_status[contig_id] == "short"]
            final_df.loc[prot_id, "n_short"] = len(short_contigs)
            final_df.loc[prot_id, "short_contigs"] = ",".join(short_contigs)
            # get not_down contigs number and ids
            notdown_contigs = [contig_id for contig_id in ipgs_contigs[ipg_id] if contigs_status[contig_id] == "not_down"]
            final_df.loc[prot_id, "n_notdown"] = len(notdown_contigs)
            final_df.loc[prot_id, "notdown_contigs"] = ",".join(notdown_contigs)
            # iterate the contigs associated with each ipg
            for contig_id, prot_coords in ipgs_contigs[ipg_id].items():
                # first check it is not a "short" contig
                if contigs_status[contig_id] != "short":
                    check = False
                    if contig_id in vs2_results_dict:
                        for prophage in vs2_results_dict[contig_id]:
                            coords = vs2_results_dict[contig_id][prophage]
                            # score-only entries (no boundary line) carry no coordinates to compare
                            if len(coords) < 3:
                                continue
                            prophage_start, prophage_end = coords[1], coords[2]
                            # a contig can encode the protein at several positions
                            for prot_start, prot_end in prot_coords:
                                if check_protein_in_prophage(prot_start, prot_end, prophage_start, prophage_end):
                                    check = True
                    if check:
                        final_df.loc[prot_id, "n_prophage"] += 1
                        final_df.loc[prot_id, "prophage"] = True
                        prophage_contigs.append(contig_id)
            # write prophage contigs, if any
            final_df.loc[prot_id, "prophage_contigs"] = ",".join(prophage_contigs)
        # protein was not inspected in this round
        else:
            final_df.loc[prot_id, :] = np.nan
    return final_df
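# Column semantics of the output table (one row per bacterial protein):
#   ipg_id     identical protein group of the protein
#   prophage   True if the protein lies inside a vs2 prediction on any contig
#   n_contigs / n_prophage / n_short / n_notdown   per-status contig counts
#   contigs / prophage_contigs / short_contigs / notdown_contigs
#              comma-separated contig ids backing each count
# Rows are set to NaN when no downloaded contigs were associated with the ipg.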
def process_cl(contigs_status, vs2_results_dict, run_name, cl_id):
    '''
    For a single cl:
    1. Read the summary table to get the bacterial proteins in the tree
    2. Read all the .ipg tables for the cl from the different rounds
    3. Classify each protein and write the resulting table to file
    '''
    print(cl_id)
    # parse summary table, get the list of proteins in the tree
    summary_dir = "/home/danielc/projects/Bas_phages/5_nr_screening/2_hits_summary"
    summary_table = f"{summary_dir}/{cl_id}.summary"
    bact_proteins = read_summary_file(summary_table)
    # for each of the proteins get its ipg_id, and for each ipg_id its protein
    prots_ipgs, ipgs_prots = get_ipgs_for_prots(cl_id, bact_proteins)
    # associate each ipg with the contigs where it has been found
    ipgs_contigs = get_contigs_ipgs(cl_id, ipgs_prots)
    # iterate the ipg ids while assessing whether they come from a prophage
    final_df = classify_ipg(cl_id, bact_proteins, ipgs_prots, ipgs_contigs, vs2_results_dict, contigs_status)
    # write df to file
    outfile = f"/home/danielc/projects/Bas_phages/5_nr_screening/4_merged_ncbi_crassvirales/3_bacterial/{run_name}/3_vs2/2_classified/{cl_id}.prophages"
    final_df.to_csv(outfile, sep="\t", header=True, index=True)
def main():
    args = parse_args()
    # get the cl_ids to analyze from the 0_mrcas directories
    cl_ids = get_CLs_to_analyze()
    # parse the .contigs and nr_contigs.stats files to know which contigs are
    # short or could not be downloaded
    print("Parsing contigs files...")
    contigs_status = get_contigs_status()
    # store all the vs2 results gathered so far from the different rounds in a dict
    print("Parsing vs2 results...")
    vs2_results_dict = parse_vs2_results()
    # parallel alternative kept for reference:
    # part = partial(process_cl, contigs_status, vs2_results_dict, args.run_name)
    # pool = multiprocessing.Pool(processes=18)
    # pool.map(part, cl_ids)
    # pool.close()
    # pool.join()
    for cl_id in cl_ids:
        process_cl(contigs_status, vs2_results_dict, args.run_name, cl_id)

if __name__ == "__main__":
    main()