-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_paths.py
269 lines (237 loc) · 10.4 KB
/
create_paths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
@authors: Juan L. Trincado
@email: juanluis.trincado@upf.edu
create_paths.py: create all the possible transcripts from the list of possible exons.
It returns the fasta sequence associated to each combination
"""
import logging, sys, os, re
from copy import deepcopy
from argparse import ArgumentParser, RawTextHelpFormatter
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# create console handler and set level to info
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
# def get_unique_tuples(elements):
# '''
# Given a list of tuples (of 2 cordinates), return a unique list of elements
# '''
# aux_dict = {}
# for tuple in elements:
# if(tuple[0] not in aux_dict):
# aux_dict[tuple[0]] = [tuple[1]]
# else:
# #Save the element if is not included in the associated list
# if(tuple[1] not in aux_dict[tuple[0]]):
# aux_dict[tuple[0]].append(tuple[1])
#
# #Return the unique list of elements
# list_out = []
# for key, value in aux_dict.items():
# for x in value:
# list_out.append([key,x])
#
# return list_out
def get_pos(list, exon):
'''
Returns the position of the exon in the given list
'''
i = 0
for x in list:
if(x[0]==exon[0] and x[1]==exon[1]):
return i
i += 1
def check_all_possible_paths(matrix, exon_stop_coords_pos, local_path, pos):
'''
Using recursion, it generates all possible paths according to the transcipts-exon map
This function call to an auxiliary one (check_all_possible_paths_aux)
'''
paths_list = []
for j in range(0,len(matrix[pos])):
if (matrix[pos][j] == 1):
check_all_possible_paths_aux(matrix, exon_stop_coords_pos, paths_list, deepcopy(local_path), j+1)
return paths_list
def check_all_possible_paths_aux(matrix, exon_stop_coords_pos, paths_list, local_path, pos):
'''
Auxiliary function of check_all_possible_paths
'''
# Put the value in local_path and keep on looking for more exons, from pos
local_path.append(pos-1)
# If it have reached the the exon_stop, save local_path in paths_dict if it doesn't exist and return
if(pos-1==exon_stop_coords_pos):
if (local_path not in paths_list):
paths_list.append(local_path)
return None
else:
for j in range(0,len(matrix[pos])):
if (matrix[pos][j] == 1):
if(j!=len(matrix[pos])-1):
check_all_possible_paths_aux(matrix, exon_stop_coords_pos, paths_list, deepcopy(local_path), j+1)
# If it have reached the end, save local_path in paths_dict if it doesn't exist and return
else:
if (local_path not in paths_list):
paths_list.append(local_path)
return None
return None
def main():
try:
bed_path = sys.argv[1]
fasta_path = sys.argv[2]
exon_stop = sys.argv[3]
output_path = sys.argv[4]
# bed_path = "/home/shinoda/Desktop/Florida/annotation/MBNL1_exons_hg19_refseq.bed"
# fasta_path = "/home/shinoda/Desktop/Florida/annotation/MBNL1_exons_hg19_refseq.bed.fa"
# exon_stop = "152177060-152177134"
# output_path = "/home/shinoda/Desktop/Florida/annotation/MBNL1_possible_transcripts_refseq.fa"
exon_stop_coords = exon_stop.split("-")
# 1. Sort the exon information. Also, save the information of the transcripts and their associated exons
with open(bed_path) as f:
logger.info("Processing bed file...")
exons_list = []
transcripts_dict = {}
for line in f:
tokens = line.rstrip().split("\t")
chr = tokens[0]
strand = tokens[5]
# Save the cordinates in the list
if([tokens[1],tokens[2]] not in exons_list):
exons_list.append([tokens[1],tokens[2]])
transcript = str(tokens[3].split(":")[1])
if(transcript not in transcripts_dict):
transcripts_dict[transcript] = [[tokens[1],tokens[2]]]
else:
transcripts_dict[transcript].append([tokens[1], tokens[2]])
# If there is only 1 transcript, stop the execution.
# Raise an ERROR exception (we are interested in cases with various transcripts)
if(len(transcripts_dict)==1):
raise Exception("Only 1 transcript associated to this gene. Stop execution.")
# Sort the list of exons
# Depending on the strand, we have to sort it in a different way
if(strand=="+"):
exons_list_sorted = sorted(exons_list, key=lambda x: (x[1], x[0]))
else:
exons_list_sorted = sorted(exons_list,key=lambda x: (x[1], x[0]), reverse = True)
# Get the position of the exon_stop in exons_list_sorted
exon_stop_coords_pos = exons_list_sorted.index(exon_stop_coords)
# Sort the exons associated to each transcript
for key, values in transcripts_dict.items():
if (strand == "+"):
transcripts_dict[key] = sorted(values, key=lambda x: (x[1], x[0]))
else:
transcripts_dict[key] = sorted(values, key=lambda x: (x[1], x[0]), reverse=True)
# Remove all the transcripts that doesn't include the exon_end
# Copy the dictionary for iterating on it
transcripts_dict_copy = deepcopy(transcripts_dict)
for key, values in transcripts_dict_copy.items():
if(exon_stop_coords not in values):
del transcripts_dict[key]
# 2. Associate the sequence to each exon
exons_sequence = {}
i = 0
header_id = ""
with open(fasta_path) as f:
logger.info("Processing fasta file...")
for line in f:
if(re.search(">", line)):
prev_header_id = header_id
header_id = str(line.rstrip().split(":")[3])
header_flag = True
# If we have reach a new header, save the stored sequence in
if(i!=0):
if(header_flag):
exons_sequence[prev_header_id] = sequence
sequence = ""
header_flag = False
#Save the sequence lines until the next header
else:
sequence = sequence + line.rstrip()
else:
sequence = ""
header_flag = False
i += 1
exons_sequence[header_id] = sequence
# 3. Build the connection matrix
width = len(exons_list_sorted)
height = len(transcripts_dict)
# Leave an extra row and column for the start and end positions
matrix = [[0 for x in range(width+1)] for y in range(width+1)]
# For each transcript, check the associated exons and mark them in the matrix
i = 0
for key, exon_list in transcripts_dict.items():
j = 0
pos_previous = 0
for exon in exon_list:
#Check the associated position in exons_list_sorted
pos = get_pos(exons_list_sorted,exon)
# Mark the first exon of this transcript as connected with start
if(j==0):
matrix[0][pos] = 1
pos_previous = pos
# Mark the last exon of this transcript as connected with end
else:
if(j==len(exon_list)-1):
matrix[pos+1][len(exons_list_sorted)] = 1
matrix[pos_previous+1][pos] = 1
pos_previous = pos
j += 1
i += 1
# 4. For each row, get all the possible pathways from left to right
paths_list = check_all_possible_paths(matrix, exon_stop_coords_pos, [], 0)
# 5. Obtain the sequence for each pathway
i = 1
outFile = open(output_path, 'w')
# Save all the combinations of the exons in a separate file
outFile2 = open(output_path+".paths", 'w')
for element in paths_list:
header_id = ">Transcript_"+str(i) + ":" + strand
outFile2.write(header_id+": "+str(element) + "\n")
full_seq = ""
for x in element:
# Check the coordinates associated to the number of the exon
coords = exons_list_sorted[x]
id_coords = coords[0]+"_"+coords[1]
# Add this cordinates to the header_id
header_id = header_id + ":" + id_coords
# Get the associated sequence
seq = exons_sequence[id_coords].rstrip()
full_seq = full_seq + seq
#Output the results
outFile.write(header_id+"\n"+full_seq+"\n")
i += 1
outFile.close()
outFile2.close()
logger.info("Number of possible transcripts: "+str(len(paths_list)))
logger.info("Saved " + output_path)
logger.info("Saved " + output_path+".paths")
# 6. Generate a bed track with the annotation of all the possible exons
# Obtain all the unique possible exons from paths_list
unique_exon_list = []
for element in paths_list:
for x in element:
if(x not in unique_exon_list):
unique_exon_list.append(x)
# Sort the list
unique_exon_list_sorted = sorted(unique_exon_list)
# Get the cordinates for each exon and output the results in an external bed tracks file
outFile = open(output_path+".bedtrack", 'w')
outFile.write("track name=exon_number color=0,0,0\n")
for x in unique_exon_list_sorted:
coords = exons_list_sorted[x]
outFile.write(chr+"\t"+coords[0]+"\t"+coords[1]+"\t"+str(x)+"\n")
outFile.close()
logger.info("Generated also bed track with the info of all the possible exons: " + output_path+".bedtrack")
logger.info("Done. Exiting program.")
exit(0)
except Exception as error:
logger.error('ERROR: ' + repr(error))
logger.error("Aborting execution")
sys.exit(1)
if __name__ == '__main__':
main()