#!/usr/bin/env python
import re
import sys
import time
from Bio import SeqIO
from StringIO import StringIO
from Bio import AlignIO
import os
import argparse
from __init__ import __version__
"""
Transcriptome Annotation by Modular Algorithms (TAMA)
TAMA Merge
Author: Richard I. Kuo
This script merges transcriptome/genome annotations.
Last Updated: 2020/12/17
Added more informative error message for issues with strand information from input bed files.
"""
tm_version = 'tm0.0'
start_time = time.time()
prev_time = start_time
#####################################################################
#####################################################################
ap = argparse.ArgumentParser(description='This script merges transcriptomes.')
ap.add_argument('-f', type=str, nargs=1, help='File list')
ap.add_argument('-p', type=str, nargs=1, help='Output prefix')
ap.add_argument('-e', type=str, nargs=1, help='Collapse exon ends flag: common_ends or longest_ends (Default is common_ends)')
ap.add_argument('-a', type=str, nargs=1, help='5 prime threshold (Default is 20)')
ap.add_argument('-m', type=str, nargs=1, help='Exon ends threshold/ splice junction threshold (Default is 10)')
ap.add_argument('-z', type=str, nargs=1, help='3 prime threshold (Default is 20)')
ap.add_argument('-d', type=str, nargs=1, help='Flag for merging duplicate transcript groups (default no_merge quits when duplicates are found, merge_dup will merge duplicates)')
ap.add_argument('-s', type=str, nargs=1, help='Use gene and transcript ID from a merge source. Specify source name from filelist file here.')
ap.add_argument('-cds', type=str, nargs=1, help='Use CDS from a merge source. Specify source name from filelist file here.')
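# Example invocation (hypothetical file and prefix names):
#   python tama_merge.py -f filelist.txt -p merged_annotation
# where filelist.txt lists the input bed files with their source names and merge settings.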
opts = ap.parse_args()
#check for missing args
missing_arg_flag = 0
if not opts.f:
print("Filelist file missing")
missing_arg_flag = 1
if not opts.p:
print("Output prefix name missing")
missing_arg_flag = 1
if not opts.e:
print("Default collapse exon ends flag will be used: common_ends")
collapse_flag = "common_ends"
else:
collapse_flag = opts.e[0]
if not opts.a:
print("Default 5 prime threshold: 20")
fiveprime_threshold = 20
else:
fiveprime_threshold = int(opts.a[0])
if not opts.m:
print("Default exon end/splice junction threshold: 10")
exon_diff_threshold = 10
else:
exon_diff_threshold = int(opts.m[0])
if not opts.z:
print("Default 3 prime threshold: 20")
threeprime_threshold = 20
else:
threeprime_threshold = int(opts.z[0])
if not opts.d:
print("Default duplicate merge flag: no_merge")
duplicate_flag = "no_merge"
else:
duplicate_flag = str(opts.d[0])
if not opts.s:
print("Default source ID merge flag: no_source_id")
source_id_flag = "no_source_id"
else:
source_id_flag = str(opts.s[0])
source_id_flag_list = source_id_flag.split(",")
if not opts.cds:
print("Default CDS merge flag: no_cds")
source_cds_flag = "no_cds"
else:
source_cds_flag = str(opts.cds[0])
source_cds_flag_list = source_cds_flag.split(",")
if missing_arg_flag == 1:
    print("Please try again with complete arguments")
    sys.exit()
filelist_file = opts.f[0]
outfile_prefix = opts.p[0]
#####################################################################
#####################################################################
print("opening file list")
#filelist_file = sys.argv[1]
filelist_file_contents = open(filelist_file).read().rstrip("\n").split("\n")
#outfile_prefix = sys.argv[2]
bed_outfile_name = outfile_prefix + ".bed"
outfile_bed = open(bed_outfile_name,"w")
trans_report_outfile_name = outfile_prefix + "_trans_report.txt"
outfile_trans_report = open(trans_report_outfile_name,"w")
trans_report_line = "\t".join(["transcript_id","num_clusters","sources","start_wobble_list","end_wobble_list","exon_start_support","exon_end_support","all_source_trans"])
outfile_trans_report.write(trans_report_line)
outfile_trans_report.write("\n")
gene_report_outfile_name = outfile_prefix + "_gene_report.txt"
outfile_gene_report = open(gene_report_outfile_name,"w")
gene_report_line = "\t".join(["gene_id","num_clusters","num_final_trans","sources","chrom", "start","end","source_genes","source_summary"])
outfile_gene_report.write(gene_report_line)
outfile_gene_report.write("\n")
merge_outfile_name = outfile_prefix + "_merge.txt"
outfile_merge = open(merge_outfile_name,"w")
#merge_line = "\t".join(["transcript_id","cluster_id","scaffold","strand","start","end","exon_starts","exon_ends"])
#outfile_merge.write(merge_line)
#outfile_merge.write("\n")
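# Output files opened above: <prefix>.bed (the merged annotation), <prefix>_trans_report.txt
# (per-transcript merge report), <prefix>_gene_report.txt (per-gene merge report), and
# <prefix>_merge.txt (per-source transcript records for each merged transcript).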
bed_dict = {} # bed_dict[scaffold][start][end][strand] = list of bed lines with added source id
source_dict = {} # source_dict[source id][filename, seq type, priority rank] = value
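# track_time prints the hours:minutes:seconds elapsed since the previous checkpoint and since the
# start of the run, then returns the current time so the caller can pass it back in as prev_time
# on the next call.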
def track_time(start_time,prev_time):
end_time = time.time()
time_taken = int(end_time - prev_time)
tt_hours = time_taken / 60
tt_hours = tt_hours /60
leftover_time = time_taken - (tt_hours * 60 * 60)
tt_minutes = leftover_time / 60
    leftover_time = leftover_time - (tt_minutes * 60) # subtract whole minutes from the remainder, not from the total time
tt_seconds = leftover_time
print("time taken since last check:\t" + str(tt_hours) + ":" + str(tt_minutes) + ":" + str(tt_seconds) )
time_total = int(end_time - start_time)
tt_hours = time_total / 60
tt_hours = tt_hours /60
leftover_time = time_total - (tt_hours * 60 * 60)
tt_minutes = leftover_time / 60
    leftover_time = leftover_time - (tt_minutes * 60) # subtract whole minutes from the remainder, not from the total time
tt_seconds = leftover_time
print("time taken since beginning:\t" + str(tt_hours) + ":" + str(tt_minutes) + ":" + str(tt_seconds) )
this_time = end_time
return this_time
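# The Transcript class holds a single input transcript model parsed from one BED12 line:
# scaffold, start/end coordinates, strand, exon start/end lists, CDS (thickStart/thickEnd),
# the source it came from, and the merge priorities assigned to that source.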
class Transcript:
def __init__(self, trans_id):
self.trans_id = trans_id
self.uniq_trans_id = ""
self.gene_id = ""
self.merge_gene_id = ""
self.scaffold = "none"
self.trans_start = -1
self.start_pos = self.trans_start
self.trans_end = 0
self.end_pos = self.trans_end
self.exon_start_list = []
self.exon_end_list = []
self.id_line = ""
self.cds_start = 0
self.cds_end = 0
self.num_exons = 0
self.strand = ""
self.source_id = ""
self.start_priority = 0
self.end_priority = 0
self.junction_priority = 0
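    # add_bed_info parses one BED12 line. The 4th (name) column is split on ";" and its first
    # field is taken as the gene ID; blockSizes and blockStarts are converted into absolute
    # exon start and end coordinate lists.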
def add_bed_info(self,bed_line):
line_split = bed_line.split("\t")
self.scaffold = line_split[0]
self.trans_start = int(line_split[1])
self.start_pos = self.trans_start
self.trans_end = int(line_split[2])
self.end_pos = self.trans_end
self.id_line = line_split[3]
id_line_split = self.id_line.split(";")
self.gene_id = id_line_split[0]
self.strand = line_split[5]
self.num_exons = int(line_split[9])
self.cds_start = int(line_split[6])
self.cds_end = int(line_split[7])
block_size_list = line_split[10].split(",")
block_start_list = line_split[11].split(",")
        # handle trailing commas at the end of the exon block size and block start strings
if block_size_list[-1] == "":
block_size_list.pop(-1)
if block_start_list[-1] == "":
block_start_list.pop(-1)
for i in xrange(len(block_size_list)):
rel_exon_start = int(block_start_list[i])
rel_exon_end = rel_exon_start + int(block_size_list[i])
exon_start = self.trans_start + rel_exon_start
exon_end = self.trans_start + rel_exon_end
self.exon_end_list.append(exon_end)
self.exon_start_list.append(exon_start)
def add_source_id(self,source_id):
self.source_id = source_id
self.uniq_trans_id = "_".join([source_id,self.trans_id])
def add_priority(self,start_priority,junction_priority,end_priority):
self.start_priority = int(start_priority)
self.end_priority = int(end_priority)
self.junction_priority = int(junction_priority)
def make_exon_start_end_lines(self):
exon_start_string_list = []
exon_end_string_list = []
for i in xrange(len(self.exon_start_list)):
exon_start_string_list.append(str(self.exon_start_list[i]))
exon_end_string_list.append(str(self.exon_end_list[i]))
exon_start_string_line = ",".join(exon_start_string_list)
exon_end_string_line = ",".join(exon_end_string_list)
return exon_start_string_line,exon_end_string_line
def format_bed_line(self,final_trans_id):
bed_list = []
bed_list.append(str(self.scaffold))
bed_list.append(str(self.trans_start))
bed_list.append(str(self.trans_end))
#gene_id = self.trans_id.split(".")[0]
id_line = ";".join([final_trans_id,self.uniq_trans_id])
bed_list.append(str(id_line))
bed_list.append("40")
bed_list.append(self.strand)
bed_list.append(str(self.trans_start))
bed_list.append(str(self.trans_end))
bed_list.append("255,0,0")
bed_list.append(str(self.num_exons))
relative_exon_start_list = []
exon_length_list = []
for i in xrange(self.num_exons):
exon_start = self.exon_start_list[i]
exon_end = self.exon_end_list[i]
exon_length = exon_end - exon_start
relative_exon_start = exon_start - self.trans_start
relative_exon_start_list.append(str(relative_exon_start))
exon_length_list.append(str(exon_length))
relative_exon_start_line = ",".join(relative_exon_start_list)
exon_length_line = ",".join(exon_length_list)
bed_list.append(exon_length_line)
bed_list.append(relative_exon_start_line)
bed_line = "\t".join(bed_list)
return bed_line
####################################################################################################
rgb_colour_dict = {} # rgb_colour_dict[number] = rgb values ie (255,0,0)
rgb_colour_dict[1] = "255,0,0" #red
rgb_colour_dict[2] = "255,100,0" #orange
rgb_colour_dict[3] = "255,200,0" #yellow
rgb_colour_dict[4] = "200,255,0" #lime
rgb_colour_dict[5] = "0,255,200" # light turquoise
rgb_colour_dict[6] = "0,200,255" # light blue
rgb_colour_dict[7] = "0,100,255" # royal blue
rgb_colour_dict[8] = "0,0,255" #dark blue
rgb_colour_dict[9] = "100,0,255" # dark purple
rgb_colour_dict[10] = "200,0,255" # magenta
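# The Merged class represents a group of source transcripts collapsed into one output transcript
# model. It stores the member Transcript objects, the collapsed exon start/end coordinates, the
# start/end wobble (coordinate spread among the members), and which source transcripts support
# each collapsed exon boundary.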
class Merged:
def __init__(self, trans_id):
self.trans_id = trans_id
self.extra_trans_id_list = "NA"
self.extra_gene_id_list = "NA"
#same as collapse start and end list but used for flexibility in calling
self.exon_start_list = []
self.exon_end_list = []
self.scaffold = "none"
self.merged_trans_dict = {} # merged_trans_dict[source_trans id] = trans obj
self.trans_list = []
self.trans_obj_list = []
self.strand = "none"
self.num_trans = 0
self.collapse_start_list = []
self.collapse_end_list = []
self.start_wobble_list = []
self.end_wobble_list = []
self.start_pos = 0
self.end_pos = 0
self.start_cds = 0
self.end_cds = 0
self.num_exons = 0
#use these to see which transcripts support which coordinates
self.e_start_trans_dict = {}
self.e_end_trans_dict = {}
self.e_start_trans_list = []
self.e_end_trans_list = []
def add_merged_trans(self,trans_obj):
merged_trans_id = trans_obj.uniq_trans_id
if merged_trans_id in self.merged_trans_dict:
print("Transcript is already in this group")
return
self.trans_list.append(merged_trans_id)
self.trans_obj_list.append(trans_obj)
self.merged_trans_dict[merged_trans_id] = trans_obj
merged_trans_id_list = self.merged_trans_dict.keys()
self.num_trans = len(merged_trans_id_list)
if self.num_exons < int(trans_obj.num_exons):
self.num_exons = int(trans_obj.num_exons)
self.scaffold = trans_obj.scaffold
if self.strand == "none":
self.strand = trans_obj.strand
elif self.strand != trans_obj.strand:
print("Error with merged trans not on the same strand")
sys.exit()
def add_merge_info(self,collapse_start_list,collapse_end_list,start_wobble_list,end_wobble_list,e_start_trans_dict,e_end_trans_dict ):
self.collapse_start_list = collapse_start_list
self.collapse_end_list = collapse_end_list
self.start_wobble_list = start_wobble_list
self.end_wobble_list = end_wobble_list
self.exon_start_list = collapse_start_list
self.exon_end_list = collapse_end_list
self.start_pos = collapse_start_list[0]
self.end_pos = collapse_end_list[-1]
self.e_start_trans_dict = e_start_trans_dict
self.e_end_trans_dict = e_end_trans_dict
#print(self.trans_list)
#print(collapse_start_list)
#print(collapse_end_list)
for i in xrange(len(collapse_start_list)):
e_start = collapse_start_list[i]
e_end = collapse_end_list[i]
e_start_trans = list(e_start_trans_dict[e_start].keys())
e_end_trans = list(e_end_trans_dict[e_end].keys())
e_start_trans.sort()
e_end_trans.sort()
e_start_trans_line = ",".join(e_start_trans)
e_end_trans_line = ",".join(e_end_trans)
self.e_start_trans_list.append(e_start_trans_line)
self.e_end_trans_list.append(e_end_trans_line)
def format_bed_line(self):
bed_list = []
bed_list.append(str(self.scaffold))
bed_list.append(str(self.start_pos))
bed_list.append(str(self.end_pos))
gene_id = self.trans_id.split(".")[0]
if self.extra_trans_id_list == "NA":
id_line = ";".join([gene_id,self.trans_id])
else:
this_id_line_list = []
this_id_line_list.append(gene_id)
this_id_line_list.append(self.trans_id)
for k in xrange(len(self.extra_gene_id_list)):
this_id_line_list.append(self.extra_gene_id_list[k])
this_id_line_list.append(self.extra_trans_id_list[k])
id_line = ";".join(this_id_line_list)
bed_list.append(str(id_line))
bed_list.append("40")
bed_list.append(self.strand)
if self.start_cds != 0:
bed_list.append(str(self.start_cds))
bed_list.append(str(self.end_cds))
else:
bed_list.append(str(self.start_pos))
bed_list.append(str(self.end_pos))
this_source_dict = {} # this_source_dict[source] = 1
for source_trans_id in self.merged_trans_dict:
source_trans_obj = self.merged_trans_dict[source_trans_id]
this_source = source_trans_obj.source_id
this_source_dict[this_source] = 1
this_source_list = list(this_source_dict.keys())
if len(this_source_list) > 1:
rgb_colour_code = rgb_colour_dict[10] # magenta for mixed sources
elif len(this_source_list) == 1:
colour_num = source_colour_dict[this_source_list[0]]
rgb_colour_code = rgb_colour_dict[colour_num]
#bed_list.append("255,0,0")
bed_list.append(rgb_colour_code)
bed_list.append(str(self.num_exons))
relative_exon_start_list = []
exon_length_list = []
for i in xrange(self.num_exons):
exon_start = self.collapse_start_list[i]
exon_end = self.collapse_end_list[i]
exon_length = exon_end - exon_start
relative_exon_start = exon_start - self.start_pos
relative_exon_start_list.append(str(relative_exon_start))
exon_length_list.append(str(exon_length))
relative_exon_start_line = ",".join(relative_exon_start_list)
exon_length_line = ",".join(exon_length_list)
bed_list.append(exon_length_line)
bed_list.append(relative_exon_start_line)
bed_line = "\t".join(bed_list)
return bed_line
def format_trans_report_line(self):
trans_report_list = []
trans_report_list.append(self.trans_id)
trans_report_list.append(str(self.num_trans))
#get all sources
merged_source_dict = {} #merged_source_dict[source id] = 1
for merged_trans_id in self.merged_trans_dict:
a_trans_obj = self.merged_trans_dict[merged_trans_id]
source_id = a_trans_obj.source_id
merged_source_dict[source_id] = 1
merged_source_list = list(merged_source_dict.keys())
merged_source_list.sort()
merged_source_line = ",".join(merged_source_list)
trans_report_list.append(merged_source_line)
start_wobble_string_list = []
end_wobble_string_list = []
for i in xrange(len(self.start_wobble_list)):
start_wobble_string = str(self.start_wobble_list[i])
start_wobble_string_list.append(start_wobble_string)
end_wobble_string = str(self.end_wobble_list[i])
end_wobble_string_list.append(end_wobble_string)
start_wobble_line = ",".join(start_wobble_string_list)
end_wobble_line = ",".join(end_wobble_string_list)
trans_report_list.append(start_wobble_line)
trans_report_list.append(end_wobble_line)
e_start_trans_support = ";".join(self.e_start_trans_list)
e_end_trans_support = ";".join(self.e_end_trans_list)
trans_report_list.append(e_start_trans_support)
trans_report_list.append(e_end_trans_support)
####################
# 2020/05/31
all_source_trans_line = ",".join(self.trans_list)
trans_report_list.append(all_source_trans_line)
# 2020/05/31
###################
trans_report_line = "\t".join(trans_report_list)
return trans_report_line
#above this line are def's used for looping through the sam file
####################################################################################################
####################################################################################################
####################################################################################################
#below this line are def's used for post sam processing
####################################################################################################
####################################################################################################
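# fuzzy_match compares two genomic coordinates and reports "perfect_match" if they are identical,
# "wobbly_match" if they are within diff_threshold bp of each other, or "no_match" otherwise,
# along with the signed difference coord1 - coord2.
# Example with made-up coordinates: fuzzy_match(1000, 1004, 10) returns ("wobbly_match", -4).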
def fuzzy_match(coord1, coord2, diff_threshold): # use this to allow for fuzzy matches of splice junctions
diff_num = 0
match_flag = "none"
# diff_threshold = 10
if coord1 == coord2:
match_flag = "perfect_match"
else:
diff_num = coord1 - coord2
if match_flag == "none":
if abs(diff_num) <= diff_threshold:
match_flag = "wobbly_match"
else:
match_flag = "no_match"
if match_flag == "none":
print("Error with match flag")
sys.exit()
return match_flag, diff_num
####################################################################################################
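# compare_transcripts_both_capped compares two transcripts that both come from 5' capped
# libraries, so they must have the same number of exons to be called the same transcript.
# Exons are walked from the 3' end; the 3' and 5' terminal coordinates use the -z and -a
# thresholds while internal junctions use the -m threshold. Returns "same_transcript" or
# "diff_transcripts" plus per-exon match and difference lists ordered from 3' to 5'.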
def compare_transcripts_both_capped(trans_obj,o_trans_obj,fivecap_flag,o_fivecap_flag ,strand): #use this to compare two transcripts
#check cap flag, both transcripts should be capped
if fivecap_flag != "capped" or o_fivecap_flag != "capped":
print("Error, both transcripts need to be from capped libraries!")
print(fivecap_flag + " " + o_fivecap_flag)
sys.exit()
diff_num_exon_flag = 0
max_exon_num = 0
min_exon_num = 0
e_start_list = trans_obj.exon_start_list
o_e_start_list = o_trans_obj.exon_start_list
e_end_list = trans_obj.exon_end_list
o_e_end_list = o_trans_obj.exon_end_list
if len(e_start_list) != len(o_e_start_list):
diff_num_exon_flag = 1
if diff_num_exon_flag == 1: # if 5prime capped then should have same number of exons
trans_comp_flag = "diff_transcripts"
start_match_list = []
start_diff_list = []
end_match_list = []
end_diff_list = []
short_trans = "none"
min_exon_num = 0
else:
short_trans = "same"
min_exon_num = len(e_start_list)
start_match_list = []
start_diff_list = []
end_match_list = []
end_diff_list = []
all_match_flag = 1 # 1 if all matching and 0 if at least one not matching
for i in xrange(min_exon_num): # iterate from 3' end of transcript, strand corrected
if strand == "+":
j = -1 * (i + 1) #iterate from last exon to account for possible 5' degradation for forward strand
elif strand == "-":
j = i # iterate from first exon for reverse strand
e_start = e_start_list[j]
o_e_start = o_e_start_list[j]
e_end = e_end_list[j]
o_e_end = o_e_end_list[j]
# set as default before deciding which thresholds to use depending on situation
        # Need to know which exon this is, for threshold setting
start_threshold = exon_diff_threshold
end_threshold = exon_diff_threshold
if strand == "+":
if i == 0: #use three prime threshold if this is last exon
end_threshold = threeprime_threshold
if diff_num_exon_flag == 0 and i == min_exon_num-1: #use 5 prime threshold if this is first exon
start_threshold = fiveprime_threshold
elif strand == "-":
if i == 0: #use three prime threshold if this is last exon
start_threshold = threeprime_threshold
if diff_num_exon_flag == 0 and i == min_exon_num-1: #use 5 prime threshold if this is first exon
end_threshold = fiveprime_threshold
start_match_flag,start_diff_num = fuzzy_match(e_start,o_e_start,start_threshold)
end_match_flag,end_diff_num = fuzzy_match(e_end,o_e_end,end_threshold)
#remember this is for both 5cap
if start_match_flag == "no_match" or end_match_flag == "no_match":
all_match_flag = 0
start_match_list.append(start_match_flag)
start_diff_list.append(start_diff_num)
end_match_list.append(end_match_flag)
end_diff_list.append(end_diff_num)
trans_comp_flag = "none"
if all_match_flag == 1:
trans_comp_flag = "same_transcript"
else:
trans_comp_flag = "diff_transcripts"
#Keep in mind that the lists are ordered from 3' end to 5' end
return trans_comp_flag,start_match_list,start_diff_list,end_match_list,end_diff_list,short_trans,min_exon_num
####################################################################################################
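# compare_transcripts_capped_nocap compares a transcript from a capped library against one from
# a no_cap library. The no_cap model may be 5' degraded, so it may have fewer exons than the
# capped model but never more. Possible results: "same_transcript",
# "same_three_prime_same_exons", "same_three_prime_diff_exons", or "diff_transcripts".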
def compare_transcripts_capped_nocap(trans_obj,o_trans_obj,fivecap_flag,o_fivecap_flag ,strand): #use this to compare two transcripts
#check cap situation
if fivecap_flag == "capped" and o_fivecap_flag == "no_cap":
c_trans_obj = trans_obj
n_trans_obj = o_trans_obj
#should be implied but just for checking
c_fivecap_flag = fivecap_flag
n_fivecap_flag = o_fivecap_flag
elif fivecap_flag == "no_cap" and o_fivecap_flag == "capped":
c_trans_obj = o_trans_obj
n_trans_obj = trans_obj
#should be implied but just for checking
c_fivecap_flag = o_fivecap_flag
n_fivecap_flag = fivecap_flag
else:
print("Error with cap flags, one needs to be capped and the other no cap")
print(fivecap_flag +" "+o_fivecap_flag)
sys.exit()
diff_num_exon_flag = 0
max_exon_num = 0
min_exon_num = 0
c_e_start_list = c_trans_obj.exon_start_list
n_e_start_list = n_trans_obj.exon_start_list
c_e_end_list = c_trans_obj.exon_end_list
n_e_end_list = n_trans_obj.exon_end_list
if len(c_e_start_list) != len(n_e_start_list):
diff_num_exon_flag = 1
trans_comp_flag = "unassigned"
c_num_exons = len(c_e_start_list)
n_num_exons = len(n_e_start_list)
if diff_num_exon_flag == 1: # nocap should not have more exons than capped
if n_num_exons > c_num_exons:
trans_comp_flag = "diff_transcripts"
start_match_list = []
start_diff_list = []
end_match_list = []
end_diff_list = []
short_trans = "none"
min_exon_num = 0
if trans_comp_flag != "diff_transcripts":
max_exon_num = c_num_exons
min_exon_num = n_num_exons
short_trans = n_trans_obj.trans_id
start_match_list = []
start_diff_list = []
end_match_list = []
end_diff_list = []
all_match_flag = 1 # 1 if all matching and 0 if at least one not matching
for i in xrange(min_exon_num): # iterate from 3' end of transcript, strand corrected
if strand == "+":
j = -1 * (i + 1) # iterate from last exon to account for possible 5' degradation for forward strand
elif strand == "-":
j = i # iterate from first exon for reverse strand
c_e_start = c_e_start_list[j]
n_e_start = n_e_start_list[j]
c_e_end = c_e_end_list[j]
n_e_end = n_e_end_list[j]
start_threshold = exon_diff_threshold
end_threshold = exon_diff_threshold
if strand == "+":
if i == 0: # use three prime threshold if this is last exon
end_threshold = threeprime_threshold
if i == max_exon_num - 1: # use 5 prime threshold if this is first exon, never reaches max_exon_num if diff exon num
start_threshold = fiveprime_threshold
elif strand == "-":
if i == 0: # use three prime threshold if this is last exon
start_threshold = threeprime_threshold
if i == max_exon_num - 1: # use 5 prime threshold if this is first exon, never reaches max_exon_num if diff exon num
end_threshold = fiveprime_threshold
start_match_flag, start_diff_num = fuzzy_match(c_e_start, n_e_start, start_threshold)
end_match_flag, end_diff_num = fuzzy_match(c_e_end, n_e_end, end_threshold)
if i < min_exon_num - 1 : # not final exon
if end_match_flag == "no_match" or start_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
break
        # Note that these occur at the last loop, so no need to break after the match decision
elif max_exon_num > min_exon_num and i == min_exon_num - 1: # diff num exons and at first min exon
if strand == "+":
if end_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
elif start_match_flag == "no_match":
if c_e_start > n_e_start: #if capped start is later than nocap start then no match
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
else: # same three prime exons, with first no cap exon not as long as matching capped exon
trans_comp_flag = "same_three_prime_diff_exons"
else: # matches all exons for nocap but diff num exons
trans_comp_flag = "same_three_prime_diff_exons"
elif strand == "-":
if start_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
elif end_match_flag == "no_match":
if c_e_end < n_e_end: # for reverse strand, if capped start is not as long as nocap start then no match
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
else: # same three prime exons, with first no cap exon not as long as matching capped exon
trans_comp_flag = "same_three_prime_diff_exons"
else: # matches all exons for nocap but diff num exons
trans_comp_flag = "same_three_prime_diff_exons"
elif max_exon_num == min_exon_num and i == min_exon_num - 1: # same number of exons and first exon
if strand == "+":
if end_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
elif start_match_flag == "no_match":
if c_e_start > n_e_start: #if capped start is later than nocap start then no match
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
else: # start does not match but start exon matches
trans_comp_flag = "same_three_prime_same_exons"
else: # complete match
trans_comp_flag = "same_transcript"
elif strand == "-":
if start_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
elif end_match_flag == "no_match":
if c_e_end < n_e_end: # for reverse strand, if capped start is not as long as nocap start then no match
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
else: # start does not match but start exon matches
trans_comp_flag = "same_three_prime_same_exons"
else: # complete match
trans_comp_flag = "same_transcript"
start_match_list.append(start_match_flag)
start_diff_list.append(start_diff_num)
end_match_list.append(end_match_flag)
end_diff_list.append(end_diff_num)
# note that comparison stops at first no match signal
# so the match lists may not be as long as number of exons if there is an internal no match
# Keep in mind that the lists are ordered from 3' end to 5' end
return trans_comp_flag, start_match_list, start_diff_list, end_match_list, end_diff_list, short_trans, min_exon_num
####################################################################################################
####################################################################################################
# use this for compare_transcripts_both_nocap
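# assign_short_long_trans takes the shorter transcript first and the longer one second and
# unpacks their exon start/end lists and exon counts so the caller can walk both models in
# parallel.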
def assign_short_long_trans(s_trans_obj,l_trans_obj):
# short trans obj is first in arguments
l_e_start_list = l_trans_obj.exon_start_list # long
s_e_start_list = s_trans_obj.exon_start_list # short
l_e_end_list = l_trans_obj.exon_end_list #long
s_e_end_list = s_trans_obj.exon_end_list #short
max_exon_num = len(l_e_start_list)
min_exon_num = len(s_e_start_list)
return(s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num)
####################################################################################################
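# compare_transcripts_both_nocap compares two transcripts that both come from no_cap libraries.
# Either model may be 5' degraded, so the one with fewer exons (or, when exon counts match, the
# one with the shorter 5' end) is treated as the potentially degraded short transcript before
# the 3'-anchored exon-by-exon comparison.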
def compare_transcripts_both_nocap(trans_obj,o_trans_obj,fivecap_flag,o_fivecap_flag ,strand): #use this to compare two transcripts
diff_num_exon_flag = 0
max_exon_num = 0
min_exon_num = 0
#check cap situation
if fivecap_flag != "no_cap" or o_fivecap_flag != "no_cap":
print("At least one transcript is capped, both need to be nocap")
sys.exit()
e_start_list = trans_obj.exon_start_list
o_e_start_list = o_trans_obj.exon_start_list
e_end_list = trans_obj.exon_end_list
o_e_end_list = o_trans_obj.exon_end_list
############################
#assign short and long trans
if len(e_start_list) != len(o_e_start_list):
diff_num_exon_flag = 1
if len(e_start_list) > len(o_e_start_list):
short_trans = o_trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(o_trans_obj,trans_obj)
elif len(e_start_list) < len(o_e_start_list):
short_trans = trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(trans_obj,o_trans_obj)
elif len(e_start_list) == len(o_e_start_list):
if strand == "+":
            # assign first trans as the long trans if longer or equal
if e_start_list[0] <= o_e_start_list[0]:
short_trans = o_trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(o_trans_obj,trans_obj)
elif e_start_list[0] > o_e_start_list[0]:
short_trans = trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(trans_obj,o_trans_obj)
else:
print("Error with short long trans assigning")
sys.exit()
elif strand == "-":
            # assign first trans as the long trans if longer or equal
if e_end_list[-1] >= o_e_end_list[-1]:
short_trans = o_trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(o_trans_obj,trans_obj)
elif e_end_list[-1] < o_e_end_list[-1]:
short_trans = trans_obj.trans_id
# short trans is first in argument
[s_e_start_list,s_e_end_list,l_e_start_list,l_e_end_list,max_exon_num,min_exon_num] = assign_short_long_trans(trans_obj,o_trans_obj)
else:
print("Error with short long trans assigning")
sys.exit()
else:
print("Error with short long trans assigning")
sys.exit()
#assign short and long trans above
############################
############################
start_match_list = []
start_diff_list = []
end_match_list = []
end_diff_list = []
all_match_flag = 1 # 1 if all matching and 0 if at least one not matching
for i in xrange(min_exon_num): # iterate from 3' end of transcript, strand corrected
if strand == "+":
j = -1 * (i + 1) # iterate from last exon to account for possible 5' degradation for forward strand
elif strand == "-":
j = i # iterate from first exon for reverse strand
l_e_start = l_e_start_list[j]
s_e_start = s_e_start_list[j]
l_e_end = l_e_end_list[j]
s_e_end = s_e_end_list[j]
start_threshold = exon_diff_threshold
end_threshold = exon_diff_threshold
if strand == "+":
if i == 0: # use three prime threshold if this is last exon
end_threshold = threeprime_threshold
if i == max_exon_num - 1: # use 5 prime threshold if this is first exon, never reaches max_exon_num if diff exon num
start_threshold = fiveprime_threshold
elif strand == "-":
if i == 0: # use three prime threshold if this is last exon
start_threshold = threeprime_threshold
if i == max_exon_num - 1: # use 5 prime threshold if this is first exon, never reaches max_exon_num if diff exon num
end_threshold = fiveprime_threshold
start_match_flag, start_diff_num = fuzzy_match(l_e_start, s_e_start, start_threshold)
end_match_flag, end_diff_num = fuzzy_match(l_e_end, s_e_end, end_threshold)
#################################################################
if i < min_exon_num - 1 : # not final exon
if end_match_flag == "no_match" or start_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
break
        # Note that these occur at the last loop, so no need to break after the match decision
elif max_exon_num > min_exon_num and i == min_exon_num - 1: # diff num exons and at first min exon, last loop
if strand == "+":
if end_match_flag == "no_match":
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
elif start_match_flag == "no_match":
if l_e_start > s_e_start: #if capped start is later than nocap start then no match
all_match_flag = 0
trans_comp_flag = "diff_transcripts"
else: # same three prime exons, with first no cap exon not as long as matching capped exon
trans_comp_flag = "same_three_prime_diff_exons"
else: # matches all exons for nocap but diff num exons